In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,cross_validate

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [47]:
# Load the preprocessed dataset and preview the first rows.
# The path is written as a raw string so the Windows backslashes are taken
# literally: in a normal string literal "\G", "\Y", "\D" and "\p" are invalid
# escape sequences and make Python emit a warning for this line.
df = pd.read_csv(r'D:\Guvi projects\YoutubeAdRevProject\Datasets\preprocessed_df.csv')
df.head()

  df = pd.read_csv('D:\Guvi projects\YoutubeAdRevProject\Datasets\preprocessed_df.csv')


Unnamed: 0,category,device,country,watch_time_days,day_name,engagement_rate,sub_engagement,ad_revenue_usd
0,Entertainment,TV,IN,18.4,Tuesday,0.155093,35374.449074,203.178237
1,Gaming,Tablet,CA,10.56,Sunday,0.098632,72594.87072,140.880508
2,Education,TV,CA,39.81,Thursday,0.214519,51599.15262,360.134008
3,Entertainment,Mobile,UK,21.76,Tuesday,0.142814,62050.299581,224.638261
4,Education,Mobile,CA,10.88,Monday,0.236121,9924.163212,165.514388


In [48]:
# Separate the regression target (ad revenue in USD) from the predictor columns.
y = df['ad_revenue_usd']
x = df.drop(columns='ad_revenue_usd')

In [49]:
# Partition the predictors by dtype: object columns are handled as categorical
# features, every other column as a numeric feature.
cat_x = x.select_dtypes(include=['object'])
num_x = x.select_dtypes(exclude=['object'])

### Scaling and encoding

In [50]:
# Display the numeric (non-object dtype) predictor frame for inspection.
num_x # every numeric column here is continuous-valued

Unnamed: 0,watch_time_days,engagement_rate,sub_engagement
0,18.40,0.155093,35374.449074
1,10.56,0.098632,72594.870720
2,39.81,0.214519,51599.152620
3,21.76,0.142814,62050.299581
4,10.88,0.236121,9924.163212
...,...,...,...
119995,29.22,0.184715,38941.313306
119996,39.97,0.174961,153765.789889
119997,19.13,0.068180,39322.996007
119998,39.56,0.168164,98399.183203


In [51]:
# Display the categorical (object dtype) predictor frame for inspection.
cat_x

Unnamed: 0,category,device,country,day_name
0,Entertainment,TV,IN,Tuesday
1,Gaming,Tablet,CA,Sunday
2,Education,TV,CA,Thursday
3,Entertainment,Mobile,UK,Tuesday
4,Education,Mobile,CA,Monday
...,...,...,...,...
119995,Education,Tablet,US,Saturday
119996,Music,Desktop,UK,Saturday
119997,Tech,Tablet,CA,Monday
119998,Music,Mobile,UK,Sunday


In [52]:
# Build one category ordering per categorical column (same order as
# cat_x.columns): the categories of each column are sorted by their mean
# ad revenue, highest mean first. The result is later passed to
# OrdinalEncoder(categories=...).
# NOTE(review): the means come from the full `df`, i.e. before the train/test
# split further down, so the ordering also uses target values of rows that end
# up in the test set -- consider deriving it from the training split only.
cat_rank = [
    list(
        df.groupby(col)['ad_revenue_usd']
        .mean()
        .sort_values(ascending=False)
        .index
    )
    for col in cat_x.columns
]
cat_rank

[['Tech', 'Gaming', 'Education', 'Music', 'Entertainment', 'Lifestyle'],
 ['Mobile', 'Tablet', 'TV', 'Desktop'],
 ['US', 'CA', 'DE', 'UK', 'IN', 'AU'],
 ['Friday',
  'Sunday',
  'Thursday',
  'Monday',
  'Tuesday',
  'Wednesday',
  'Saturday']]

### Pipeline

In [53]:
# Numeric branch: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scalar', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category, then
# map each category to its position in the revenue-ranked lists of cat_rank.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=cat_rank)),
    ]
)

In [54]:
# Preprocessing: route the numeric columns through the numeric branch and the
# categorical columns through the categorical branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_x.columns),
        ('cat', categorical_transformer, cat_x.columns),
    ]
)
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [55]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((96000, 7), (24000, 7), (96000,), (24000,))

#### Setting up a preprocessing-only pipeline so that GridSearchCV can automatically find the best hyperparameters for each model

In [56]:
# Preprocessing-only pipeline: produces already-transformed feature matrices
# (these are what the commented-out GridSearchCV cells further down consume).
preprocess_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
    ]
)

# Fit on the training split only, then apply the fitted transform to the test split.
x_train_transformed = preprocess_pipeline.fit_transform(x_train)
x_test_transformed = preprocess_pipeline.transform(x_test)

### Model Selection

#### Linear Regression

In [57]:
# Baseline model: preprocessing followed by ordinary least-squares regression.
LR_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', LinearRegression())])

LR_pipeline.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = LR_pipeline.predict(x_train)
y_test_pred = LR_pipeline.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9457878549583433
R2 score for test data 0.9485649142843713
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 5.028141868853842
MAE for test data 4.878711746969023
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 208.1737118372405
MSE for test data 197.0722572549061


In [72]:
# 5-fold cross-validation of the linear-regression pipeline on the training split.
# cross_validate fits the pipeline once per fold and evaluates both scorers on
# that fit, instead of running two separate cross_val_score passes that each
# refit the pipeline for every fold.
cv_results = cross_validate(LR_pipeline, x_train, y_train, cv=5,
                            scoring=['r2', 'neg_mean_absolute_error'])

# R² scores across 5 folds
r2_scores = cv_results['test_r2']

print("R² scores per fold:", r2_scores)
print("Average R² score  :", r2_scores.mean())

# The scorer returns the negated MAE, so flip the sign to get the plain MAE.
MAE = -cv_results['test_neg_mean_absolute_error']
print("MAE :", MAE)
print("Average MAE  :", MAE.mean())

R² scores per fold: [0.9471054  0.94726867 0.9431773  0.94493301 0.94636893]
Average R² score  : 0.9457706634528741
MAE : [4.94418218 5.00057436 5.17545649 5.0016806  5.02472829]
Average MAE  : 5.029324383638159


In [58]:
import pickle

# Persist the fitted linear-regression pipeline (preprocessing + regressor).
# - Raw string: the Windows backslashes are kept literally instead of being
#   parsed as (invalid) escape sequences, which triggered a warning.
# - `with`: the file handle is closed (and its contents flushed) as soon as the
#   dump finishes, rather than being left open.
with open(r'D:\Guvi projects\YoutubeAdRevProject\Models\LR_model.pkl', 'wb') as model_file:
    pickle.dump(LR_pipeline, model_file)

  pickle.dump(LR_pipeline, open('D:\Guvi projects\YoutubeAdRevProject\Models\LR_model.pkl', 'wb'))


#### Polynomial features

In [59]:
# Linear regression on degree-2 polynomial expansions of the preprocessed features.
poly_LR_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('poly', PolynomialFeatures(degree=2)),
                              ('regressor', LinearRegression())])

poly_LR_pipeline.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = poly_LR_pipeline.predict(x_train)
y_test_pred = poly_LR_pipeline.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9469071567580767
R2 score for test data 0.9497099587108688
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 4.840693958094443
MAE for test data 4.700797849703075
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 203.87561202699396
MSE for test data 192.68504788901478


In [60]:
# Linear regression on degree-3 polynomial expansions of the preprocessed features.
poly_LR_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('poly', PolynomialFeatures(degree=3)),
                              ('regressor', LinearRegression())])

poly_LR_pipeline.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = poly_LR_pipeline.predict(x_train)
y_test_pred = poly_LR_pipeline.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9476930668451704
R2 score for test data 0.9504881916540276
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 4.472456093307152
MAE for test data 4.322290780893967
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 200.85773070400057
MSE for test data 189.7032676383437


In [73]:
# Linear regression on degree-4 polynomial expansions of the preprocessed features.
# Kept under its own name because this is the variant that gets cross-validated
# and pickled in the following cells.
poly_LR_pipeline_4 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('poly', PolynomialFeatures(degree=4)),
                              ('regressor', LinearRegression())])

poly_LR_pipeline_4.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = poly_LR_pipeline_4.predict(x_train)
y_test_pred = poly_LR_pipeline_4.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9479971123617289
R2 score for test data 0.9505695696812843
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 4.343445091842412
MAE for test data 4.199945481925558
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 199.69020110891006
MSE for test data 189.3914697420385


In [74]:
# 5-fold cross-validation of the degree-4 polynomial pipeline on the training split.
# cross_validate fits the pipeline once per fold and evaluates both scorers on
# that fit, instead of running two separate cross_val_score passes that each
# refit the pipeline for every fold.
cv_results = cross_validate(poly_LR_pipeline_4, x_train, y_train, cv=5,
                            scoring=['r2', 'neg_mean_absolute_error'])

# R² scores across 5 folds
r2_scores = cv_results['test_r2']

print("R² scores per fold:", r2_scores)
print("Average R² score  :", r2_scores.mean())

# The scorer returns the negated MAE, so flip the sign to get the plain MAE.
MAE = -cv_results['test_neg_mean_absolute_error']
print("MAE :", MAE)
print("Average MAE  :", MAE.mean())

R² scores per fold: [0.94891825 0.94902705 0.94529422 0.94676152 0.94840879]
Average R² score  : 0.9476819640791472
MAE : [4.29938495 4.37091205 4.48382332 4.40257825 4.38238857]
Average MAE  : 4.387817428316929


In [75]:
# Persist the fitted degree-4 polynomial pipeline.
# - Raw string: the Windows backslashes are kept literally instead of being
#   parsed as (invalid) escape sequences, which triggered a warning.
# - `with`: the file handle is closed (and its contents flushed) as soon as the
#   dump finishes, rather than being left open.
with open(r'D:\Guvi projects\YoutubeAdRevProject\Models\poly_LR_model.pkl', 'wb') as model_file:
    pickle.dump(poly_LR_pipeline_4, model_file)

  pickle.dump(poly_LR_pipeline_4, open('D:\Guvi projects\YoutubeAdRevProject\Models\poly_LR_model.pkl', 'wb'))


In [67]:
# Linear regression on degree-5 polynomial expansions of the preprocessed features.
poly_LR_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('poly', PolynomialFeatures(degree=5)),
                              ('regressor', LinearRegression())])

poly_LR_pipeline.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = poly_LR_pipeline.predict(x_train)
y_test_pred = poly_LR_pipeline.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9481484592975943
R2 score for test data 0.9503936260408947
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 4.457216055120911
MAE for test data 4.324492103246021
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 199.10903145790144
MSE for test data 190.06559344337563


#### Support Vector Machines (SVM)

In [62]:
# SVM_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                               ('regressor', SVR(kernel='rbf'))])

# SVM_pipeline.fit(x_train, y_train)

# y_train_pred = SVM_pipeline.predict(x_train)
# y_test_pred = SVM_pipeline.predict(x_test)

# print('Evaluation:')
# print('R2 score for train data',r2_score(y_train,y_train_pred))
# print('R2 score for test data',r2_score(y_test,y_test_pred))
# print('---'*50)
# print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
# print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
# print('---'*50)
# print('MSE for train data',mean_squared_error(y_train,y_train_pred))
# print('MSE for test data',mean_squared_error(y_test,y_test_pred))

#### Decision Tree

In [63]:
# Unconstrained decision tree on the preprocessed features.
# random_state is fixed (same seed as the train/test split) so that re-running
# the cell reproduces the same tree and the same metrics.
DT_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', DecisionTreeRegressor(random_state=42))])

DT_pipeline.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = DT_pipeline.predict(x_train)
y_test_pred = DT_pipeline.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 1.0
R2 score for test data 0.8908090625870084
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 0.0
MAE for test data 6.531301359839592
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 0.0
MSE for test data 418.36237285046326


In [64]:
# Let's use GridSearchCV for hyperparameter tuning
# DT_param = {'max_depth': [5,7,8,9,10],
#             'min_samples_split': [10,20,30,40,50],
#             'min_samples_leaf': [4,5,10,15,20]
# }

# DT_hyper_param = GridSearchCV(DecisionTreeRegressor(), DT_param, cv=5)
# DT_hyper_param.fit(x_train_transformed, y_train)

# y_train_pred = DT_hyper_param.predict(x_train_transformed)
# y_test_pred = DT_hyper_param.predict(x_test_transformed)

# print('Evaluation:')
# print('R2 score for train data',r2_score(y_train,y_train_pred))
# print('R2 score for test data',r2_score(y_test,y_test_pred))
# print('---'*50)
# print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
# print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
# print('---'*50)
# print('MSE for train data',mean_squared_error(y_train,y_train_pred))
# print('MSE for test data',mean_squared_error(y_test,y_test_pred))

### Ensemble Method

In [65]:
# RandomForestRegressor
# Random forest with default hyperparameters on the preprocessed features.
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap samples, and therefore the reported metrics, are reproducible.

RF_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', RandomForestRegressor(random_state=42))])

RF_pipeline.fit(x_train, y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_train_pred = RF_pipeline.predict(x_train)
y_test_pred = RF_pipeline.predict(x_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
print('---'*50)
# This line scores the training split; it was previously labelled "test data".
print('MSE for train data',mean_squared_error(y_train,y_train_pred))
print('MSE for test data',mean_squared_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9921799848325362
R2 score for test data 0.9480086538978678
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 1.7204733958840188
MAE for test data 4.422824972244853
------------------------------------------------------------------------------------------------------------------------------------------------------
MSE for test data 30.028724795588904
MSE for test data 199.20355515136134


In [66]:
# GridSearchCV
# RF_param={'n_estimators':[30,50,100,],
#     'max_depth':[5,9,10],
#     'min_samples_split':[10,50],
#     'min_samples_leaf':[5,25]
# }

# RF_hyper_param = GridSearchCV(RandomForestRegressor(), RF_param, cv=5)
# RF_hyper_param.fit(x_train_transformed, y_train)

# y_train_pred = RF_hyper_param.predict(x_train_transformed)
# y_test_pred = RF_hyper_param.predict(x_test_transformed)

# print('Evaluation:')
# print('R2 score for train data',r2_score(y_train,y_train_pred))
# print('R2 score for test data',r2_score(y_test,y_test_pred))
# print('---'*50)
# print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
# print('MAE for test data',mean_absolute_error(y_test,y_test_pred))
# print('---'*50)
# print('MSE for train data',mean_squared_error(y_train,y_train_pred))
# print('MSE for test data',mean_squared_error(y_test,y_test_pred))