## **Import all basic modules**

In [1]:
import numpy as np
import pandas as pd

## **Import the Dataset**

In [2]:
df = pd.read_csv("T20_mens_dataset.csv")
df.head(10)

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
0,Sri Lanka,Australia,Colombo,0,119,10,0.0,,128
1,Sri Lanka,Australia,Colombo,1,118,10,3.0,,128
2,Sri Lanka,Australia,Colombo,5,117,10,10.0,,128
3,Sri Lanka,Australia,Colombo,6,116,10,9.0,,128
4,Sri Lanka,Australia,Colombo,7,115,10,8.4,,128
5,Sri Lanka,Australia,Colombo,9,114,10,9.0,,128
6,Sri Lanka,Australia,Colombo,9,113,10,7.714286,,128
7,Sri Lanka,Australia,Colombo,9,113,10,7.714286,,128
8,Sri Lanka,Australia,Colombo,9,112,9,6.75,,128
9,Sri Lanka,Australia,Colombo,9,111,9,6.0,,128


In [3]:
df.shape

(50609, 9)

In [4]:
df.dtypes

batting_team      object
bowling_team      object
city              object
current_score      int64
balls_left         int64
wickets_left       int64
crr              float64
last_five        float64
runs_x             int64
dtype: object

### **Dropping all rows containing null values**

In [5]:
df.isnull().sum()

batting_team         0
bowling_team         0
city                 0
current_score        0
balls_left           0
wickets_left         0
crr                  0
last_five        12053
runs_x               0
dtype: int64

In [6]:
# df["last_five"].fillna(value)

df.dropna(inplace=True)

df.isnull().sum()

batting_team     0
bowling_team     0
city             0
current_score    0
balls_left       0
wickets_left     0
crr              0
last_five        0
runs_x           0
dtype: int64

## **EDA: Exploratory Data Analysis**

In [7]:
df.describe()

Unnamed: 0,current_score,balls_left,wickets_left,crr,last_five,runs_x
count,38556.0,38556.0,38556.0,38556.0,38556.0,38556.0
mean,94.078224,45.88448,6.733634,7.563875,38.484023,160.082192
std,41.630432,26.542472,2.034015,1.722,11.810156,32.203038
min,12.0,0.0,0.0,2.27027,9.0,72.0
25%,60.0,23.0,5.0,6.375,30.0,139.0
50%,89.0,46.0,7.0,7.5,38.0,159.0
75%,123.0,69.0,8.0,8.630497,46.0,182.0
max,263.0,98.0,10.0,16.6,89.0,263.0


In [66]:
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [65]:
x = df["city"].value_counts()/90
type(x)


plt.bar(df["city"], x)

pandas.core.series.Series

# **Model Building**

### **1. Train Test Split**

In [8]:
from sklearn.model_selection import train_test_split

y = df[['runs_x']]
X = df.drop(columns=['runs_x'])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
X_test.shape

(7712, 8)

### **2. Data Transformation**

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [25]:
trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False), ['batting_team', 'bowling_team', 'city'])
], remainder='passthrough')

transformed_X_train = trf.fit_transform(X_train)
transformed_X_test = trf.fit_transform(X_test)


In [26]:
pd.DataFrame(transformed_X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,108.0,23.0,6.0,6.680412,44.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,46.0,79.0,8.0,6.731707,29.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,95.0,60.0,10.0,9.500000,55.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,81.0,46.0,5.0,6.567568,28.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,77.0,68.0,6.0,8.884615,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30839,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,98.0,24.0,5.0,6.125000,31.0
30840,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,65.0,8.0,9.818182,34.0
30841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,67.0,76.0,10.0,9.136364,41.0
30842,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,59.0,74.0,9.0,7.695652,37.0


In [35]:
scl = StandardScaler()
scaled_X_train = scl.fit_transform(transformed_X_train)

scaled_X_test = scl.fit_transform(transformed_X_test)


In [36]:
pd.DataFrame(scaled_X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,8.303074,-0.349764,-0.232128,-0.322883,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,-0.331023,...,9.334116,-0.172771,-0.178847,-0.111239,-0.174357,0.335150,-0.861642,-0.360787,-0.511052,0.470926
1,-0.120437,2.859073,-0.232128,-0.322883,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,-1.154803,1.248347,0.623958,-0.481256,-0.802935
2,-0.120437,-0.349764,-0.232128,-0.322883,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,3.020938,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,0.022740,0.532458,1.608703,1.126801,1.405092
3,-0.120437,-0.349764,-0.232128,3.097101,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,-0.313701,0.004961,-0.853160,-0.576602,-0.887860
4,-0.120437,-0.349764,-0.232128,3.097101,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,-0.409827,0.833885,-0.360787,0.769334,0.640775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30839,-0.120437,-0.349764,4.307976,-0.322883,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,0.094835,-0.823963,-0.853160,-0.833683,-0.633087
30840,-0.120437,-0.349764,4.307976,-0.322883,-0.35192,-0.415985,-0.392863,-0.367272,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,-0.097417,0.720850,0.623958,1.311628,-0.378315
30841,-0.120437,-0.349764,-0.232128,-0.322883,-0.35192,-0.415985,-0.392863,2.722780,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,-0.650142,1.135312,1.608703,0.915571,0.216154
30842,-0.120437,-0.349764,-0.232128,-0.322883,-0.35192,-0.415985,2.545417,-0.367272,-0.37299,-0.331023,...,-0.107134,-0.172771,-0.178847,-0.111239,-0.174357,-0.842394,1.059955,1.116331,0.078684,-0.123543


### **3. Model Taining**

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# **Linear Regression**

### **$\hat{y}$ = $\theta_0 + \theta_1 x_1$**

#### $\theta_0$ = intercept
#### $\theta_1$ = slope

### Mean Absolute Error = $\frac{1}{n} \Sigma_{i=1}^n|({y}-\hat{y})|$
### Mean Squared Error = $\frac{1}{n} \Sigma_{i=1}^n({y}-\hat{y})^2$

In [51]:
lr = LinearRegression()
lr.fit(scaled_X_train, y_train)

y_pred = lr.predict(scaled_X_test)

print(f'mean absolute error is: {mean_absolute_error(y_test, y_pred)}')
print(f'mean squared error is: {mean_squared_error(y_test, y_pred)}')

mean absolute error is: 65820073990.080345
mean squared error is: 7.171654374324997e+21


In [49]:
y_pred

array([[181.53295082],
       [150.00902504],
       [168.10082192],
       ...,
       [163.99364418],
       [191.64568276],
       [215.50463051]])

In [50]:
y_test

Unnamed: 0,runs_x
36120,214
37592,147
50591,168
24339,128
34178,144
...,...
4231,215
33502,164
20558,171
39271,179


### **Create Pipeline**

In [53]:
algos = [XGBRegressor(n_estimators = 1000, learning_rate = 0.15, max_depth = 12, random_state = 1), RandomForestRegressor(n_estimators=100, random_state=1, oob_score=True), LinearRegression()]

for algo in algos:
        pipe = Pipeline(steps=[
        ("step1", trf),
        ('step2', scl),
        ('step3', algo)     
    ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        print(f'Scores of {str(algo).split("(")[0]} Model \n')

        print(f'\t mean absolute error is: {mean_absolute_error(y_test, y_pred)}')
        print(f'\t mean squared error is: {mean_squared_error(y_test, y_pred)} \n')

        print("-"*25)


Scores of XGBRegressor Model 

	 mean absolute error is: 1.5571246997944053
	 mean squared error is: 11.341998688228887 

-------------------------


  return fit_method(estimator, *args, **kwargs)


Scores of RandomForestRegressor Model 

	 mean absolute error is: 1.9641168741355466
	 mean squared error is: 18.20858998031927 

-------------------------
Scores of LinearRegression Model 

	 mean absolute error is: 13.0455764886242
	 mean squared error is: 304.6850059118985 

-------------------------


### **Final Model Pipeline: XGBRegressor**

In [109]:
pipe = Pipeline(steps=[
        ("step1", trf),
        ('step2', StandardScaler()),
        ('step3', XGBRegressor(n_estimators = 1000, learning_rate = 0.15, max_depth = 12, random_state = 1))     
    ])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(f'mean absolute error is: {mean_absolute_error(y_test, y_pred)}')
print(f'mean squared error is: {mean_squared_error(y_test, y_pred)}')

R2 score is: 0.9891198369000512
mean absolute error is: 1.5571246997944053
mean squared error is: 11.341998688228887


### **4. Save the Model**

In [111]:
import pickle 
pickle.dump(pipe, open('model_pipeline.pkl', 'wb'))

# **Task: Customer Yearly Expense Prediction in Ecommerce PLatform**



## **Checkpoints:**

1. Read the Dataset
2. Do some Exploratory Data Analysis (EDA)

    **df.info()**
    
    **df.describe()**

    scatter plot between column:  **'Time on App', 'Yearly Amount Spent'**
    
    scatter plot between column:  **'Time on Web', 'Yearly Amount Spent'**

    any other you can

3. **Train Test Split** the dataset
4. Train the Model using **Linear Regression** on any other regression model
5. Display the model performance score (**MAE** or **MSE**)


    
 