In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Absolute location of the IPL CSV that the next cell reads.
# NOTE(review): presumably a Google Drive mount inside Colab — confirm, and
# adjust this path when running the notebook anywhere else.
path = '/content/drive/MyDrive/Data Science/Project-49 IPL Cricket Score Prediction Using TPOT (Auto ML)/ipl.csv'

In [None]:
df = pd.read_csv(path)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# checking Null values

df.isna().sum()

In [None]:
# checking unique values

df.nunique()

## Data Analysis and cleaning the data

In [None]:
# Discard the columns that are not wanted for the rest of the notebook.
unused_columns = ['mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
df = df.drop(columns=unused_columns)

In [None]:
df.head()

In [None]:
# Pie chart of how many rows each batting team contributes.
bat_team_share = df['bat_team'].value_counts()
bat_team_share.plot(kind='pie', autopct="%.f%%", figsize=(8, 8), title='bat team')

In [None]:
# Map alternative team names in 'bat_team' onto one canonical label each.
bat_team_aliases = {
    "Deccan Chargers": 'Sunrisers Hyderabad',
    "Pune Warriors": 'Rising Pune Supergiants',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
}
df['bat_team'] = df['bat_team'].replace(bat_team_aliases)

In [None]:
# Pie chart of how many rows each bowling team contributes.
# The title names the column that is actually plotted (it previously read 'bat team').
df['bowl_team'].value_counts().plot(kind='pie', autopct="%.f%%", figsize=(8, 8), title='bowl team')

In [None]:
# Map alternative team names in 'bowl_team' onto one canonical label each.
bowl_team_aliases = {
    "Deccan Chargers": 'Sunrisers Hyderabad',
    "Pune Warriors": 'Rising Pune Supergiants',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
}
df['bowl_team'] = df['bowl_team'].replace(bowl_team_aliases)

In [None]:
# Keep every batting team except the last two entries of value_counts(),
# i.e. the two teams with the fewest rows (value_counts sorts descending).
bat_team_counts = df['bat_team'].value_counts()
selected_teams = bat_team_counts.index[:-2]
selected_teams

In [None]:
# Keep only rows where both the batting and the bowling side are selected teams.
batting_selected = df['bat_team'].isin(selected_teams)
bowling_selected = df['bowl_team'].isin(selected_teams)
df = df[batting_selected & bowling_selected].copy()

In [None]:
df.head()

In [None]:
# Convert the 'date' column into datetime objects.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
df.dtypes

In [None]:
df.head()

## Feature engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace every object-dtype column with integer labels and print the
# class -> code mapping so the encoding can be read back from the output.
object_columns = df.select_dtypes('object').columns
for i in object_columns:
  encoder = LabelEncoder()
  encoder.fit(df[i])
  df[i] = encoder.transform(df[i])

  banner = "--"*10
  print(banner, i, banner)
  mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
  print(mapping)
  print()

In [None]:
df.head()

## Splitting data

In [None]:
# Chronological hold-out: rows dated up to 2016 form the training set,
# rows dated 2017 or later form the test set.
row_year = df['date'].dt.year
train = df[row_year <= 2016].copy()
test = df[row_year >= 2017].copy()

In [None]:
# Training features are every column except the target and the date;
# the target is the 'total' column as a NumPy array.
non_feature_columns = ['total', 'date']
x_train = train.drop(columns=non_feature_columns)
y_train = train['total'].values

In [None]:
# Test features are every column except the target and the date;
# the target is the 'total' column as a NumPy array.
non_feature_columns = ['total', 'date']
x_test = test.drop(columns=non_feature_columns)
y_test = test['total'].values

In [None]:
x_train.head()

In [None]:
sns.kdeplot(x=y_train)

## Building model

In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Estimator classes (not instances) to be fitted with default settings as baselines.
all_models = [
    LinearRegression,
    Lasso,
    RandomForestRegressor,
    DecisionTreeRegressor,
    XGBRegressor,
    ExtraTreesRegressor,
]

In [None]:
def get_model_score(model, train_X=None, train_y=None, test_X=None, test_y=None):
    """Instantiate, fit and score an estimator, then return the fitted instance.

    Parameters
    ----------
    model : callable
        Zero-argument callable (typically an estimator class) that returns an
        object exposing ``fit(X, y)`` and ``score(X, y)``.
    train_X, train_y, test_X, test_y : optional
        Data used for fitting and scoring. Any argument left as ``None`` falls
        back to the notebook-level ``x_train`` / ``y_train`` / ``x_test`` /
        ``y_test`` variables, which keeps ``get_model_score(SomeClass)`` working.

    Returns
    -------
    The fitted estimator instance.

    Side effects
    ------------
    Prints the estimator's class name and the values returned by ``score`` on
    the training and test data.
    """
    # The globals are only looked up when an argument is omitted, so explicit
    # callers do not need them to exist.
    train_X = x_train if train_X is None else train_X
    train_y = y_train if train_y is None else train_y
    test_X = x_test if test_X is None else test_X
    test_y = y_test if test_y is None else test_y

    estimator = model()
    estimator.fit(train_X, train_y)

    print(type(estimator).__name__)
    print("Training Score", estimator.score(train_X, train_y))
    print("Testing SCore", estimator.score(test_X, test_y))
    return estimator

In [None]:
# Fit each baseline estimator and keep the fitted instance, keyed by class name.
models = {}
for estimator_cls in all_models:
    fitted = get_model_score(estimator_cls)
    class_name = type(fitted).__name__
    models[class_name] = fitted
    print('---'*20)

In [None]:
model = models['Lasso']
model.get_params()

In [None]:
# Candidate values for the Lasso grid search.
param = dict(
    alpha=[1e-15, 1e-10, 1e-8, 1e-5, 1e-2, 1, 5, 10, 20, 30, 50],
    tol=[1e-15, 1e-10, 1e-8, 1e-5, 1e-2],
    max_iter=[100, 200, 500, 1000, 5000, 10000],
)

In [None]:
# Exhaustive 5-fold search over `param`, scored by negated mean squared error.
gr = GridSearchCV(
    Lasso(),
    param,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
gr.fit(x_train, y_train)

In [None]:
gr.best_params_

In [None]:
# Lasso with fixed hyperparameters.
# NOTE(review): presumably copied from an earlier `gr.best_params_` output — confirm.
lasso_settings = {'alpha': 0.01, 'max_iter': 1000, 'tol': 1e-15}
model = Lasso(**lasso_settings)

In [None]:
model.fit(x_train, y_train)

In [None]:
model.score(x_test, y_test)

In [None]:
pred = model.predict(x_test)

In [None]:
# Distribution of the test-set residuals (actual minus predicted).
residuals = y_test - pred
sns.displot(x=residuals, kde=True)

## Random forest 

In [None]:
# Search space sampled by the random-forest hyperparameter search.
Params = dict(
    bootstrap=[True, False],
    max_depth=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    max_features=["sqrt", "log2", None],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
)

In [None]:
# Randomized 5-fold search: 5 parameter combinations drawn from `Params`,
# run on all available cores.
# Both the sampler and the forest are seeded so that re-running the notebook
# draws the same combinations and fits the same trees.
modelTune = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),
                              param_distributions=Params,
                              n_iter=5, n_jobs=-1, cv=5,
                              random_state=42)

In [None]:
%%time
modelTune.fit(x_train, y_train)

In [None]:
modelTune.best_params_

In [None]:
# Random forest with fixed hyperparameters.
# NOTE(review): presumably copied from an earlier `modelTune.best_params_` output — confirm.
# random_state is set so the fitted forest, and therefore the reported test
# score, is the same on every run.
model = RandomForestRegressor(n_estimators=400,
                              min_samples_split=10,
                              min_samples_leaf=2,
                              max_features='log2',
                              max_depth=60,
                              bootstrap=True,
                              random_state=42)

In [None]:
model.fit(x_train, y_train)

In [None]:
model.score(x_test, y_test)

In [None]:
pred = model.predict(x_test)

In [None]:
# Distribution of the test-set residuals (actual minus predicted).
residuals = y_test - pred
sns.displot(x=residuals, kde=True)

## Auto ML