# Engagement Score Prediction  


Objective: <br>
The main objective of the problem is to develop the machine learning approach to predict the engagement score of the video on the user level.

</h4>

In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize

from sklearn.metrics import mean_squared_error, r2_score


from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

In [41]:
cn = pd.read_csv('train_0OECtn8.csv')

cn.shape

(89197, 10)

In [42]:
cn = cn.drop('row_id', axis=1)

In [43]:
# Categorize the Views column
cn.loc[cn['views'] <= 30, 'views'] = 1
cn.loc[(cn['views'] > 30) & (cn['views'] <= 229), 'views'] = 2
cn.loc[(cn['views'] > 229) & (cn['views'] <= 467), 'views'] = 3
cn.loc[(cn['views'] > 467) & (cn['views'] <= 714), 'views'] = 4
cn.loc[cn['views'] > 714, 'views'] = 5

print(cn.views.value_counts())

# Categorize the Views column
cn.loc[cn['age'] <= 18, 'age'] = 1
cn.loc[(cn['age'] > 18) & (cn['age'] <= 32), 'age'] = 2
cn.loc[cn['age'] > 32, 'age'] = 3

print(cn.age.value_counts())

4    23939
2    23869
3    21343
5    19879
1      167
Name: views, dtype: int64
2    42935
1    26488
3    19774
Name: age, dtype: int64


In [44]:
cn.corr()

Unnamed: 0,user_id,category_id,video_id,age,followers,views,engagement_score
user_id,1.0,0.002139,0.005232,0.001468,-0.002026,-0.002592,-0.0029
category_id,0.002139,1.0,0.55655,-0.004067,0.056432,0.112234,-0.094288
video_id,0.005232,0.55655,1.0,-0.003557,0.098778,-0.021715,-0.030552
age,0.001468,-0.004067,-0.003557,1.0,0.004733,0.010938,-0.207981
followers,-0.002026,0.056432,0.098778,0.004733,1.0,0.315671,0.000855
views,-0.002592,0.112234,-0.021715,0.010938,0.315671,1.0,-0.0909
engagement_score,-0.0029,-0.094288,-0.030552,-0.207981,0.000855,-0.0909,1.0


In [45]:
# Creating new column 'user_cat' which will represent info of users with interest in particular category videos

cn1 = cn.copy()

cn['user_cat'] = cn['user_id'].astype(str) + '_' + cn['category_id'].astype(str)
# cn['user_cat'] = cn['user_cat'].astype(float)

# Creating new column 'user_vid' which will represent info of users with interest in a particular video 
cn['user_vid'] = cn['user_id'].astype(str) + '_' + cn['video_id'].astype(str)
# cn['user_vid'] = cn['user_vid'].astype(float)

cn.head()

Unnamed: 0,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,user_cat,user_vid
0,19990,37,128,2,Male,Student,180,5,4.33,19990_37,19990_128
1,5304,32,132,1,Female,Student,330,4,1.79,5304_32,5304_132
2,1840,12,24,2,Male,Student,180,2,4.35,1840_12,1840_24
3,12597,23,112,2,Male,Student,220,4,3.77,12597_23,12597_112
4,13626,23,112,2,Male,Working Professional,220,4,3.13,13626_23,13626_112


In [46]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

cn.category_id = le.fit_transform(cn.category_id)
cn.gender = le.fit_transform(cn.gender)
cn.profession = le.fit_transform(cn.profession)
cn.user_id = le.fit_transform(cn.user_id)
cn.video_id = le.fit_transform(cn.video_id)
cn.followers = le.fit_transform(cn.followers)
cn.user_cat = le.fit_transform(cn.user_cat)
cn.user_vid = le.fit_transform(cn.user_vid)

In [47]:
cn.corr() # After adding new columns

Unnamed: 0,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,user_cat,user_vid
user_id,1.0,0.002139,0.005232,0.001468,-0.003807,-0.003439,-0.00235,-0.002592,-0.0029,-0.19041,-0.190009
category_id,0.002139,1.0,0.55655,-0.004067,-0.008274,0.002205,0.065904,0.112234,-0.094288,0.003293,0.003279
video_id,0.005232,0.55655,1.0,-0.003557,0.003251,-0.002929,0.117055,-0.021715,-0.030552,0.002547,0.002529
age,0.001468,-0.004067,-0.003557,1.0,-0.012114,-0.097866,0.005317,0.010938,-0.207981,-0.000549,-0.000555
gender,-0.003807,-0.008274,0.003251,-0.012114,1.0,0.004622,0.003852,-0.032858,0.408702,0.007873,0.007878
profession,-0.003439,0.002205,-0.002929,-0.097866,0.004622,1.0,-0.005352,0.000785,-0.049821,-0.00298,-0.002966
followers,-0.00235,0.065904,0.117055,0.005317,0.003852,-0.005352,1.0,0.291693,0.007187,0.004203,0.004197
views,-0.002592,0.112234,-0.021715,0.010938,-0.032858,0.000785,0.291693,1.0,-0.0909,-0.00793,-0.007939
engagement_score,-0.0029,-0.094288,-0.030552,-0.207981,0.408702,-0.049821,0.007187,-0.0909,1.0,-0.00283,-0.002829
user_cat,-0.19041,0.003293,0.002547,-0.000549,0.007873,-0.00298,0.004203,-0.00793,-0.00283,1.0,0.999999


In [39]:
# Normalize the data in order to see more effective scores

cn = normalize(cn)
cn = pd.DataFrame(cn)
cn.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.438261,0.000789,0.002784,4.4e-05,2.2e-05,2.2e-05,2.2e-05,0.00011,9.5e-05,0.439269,0.784195
1,0.06375,0.000373,0.001575,1.2e-05,0.0,1.2e-05,0.000168,4.8e-05,2.2e-05,0.487366,0.870866
2,0.053104,0.000318,0.000664,5.8e-05,2.9e-05,2.9e-05,2.9e-05,5.8e-05,0.000126,0.487466,0.871525
3,0.765585,0.001337,0.006747,0.000122,6.1e-05,6.1e-05,0.000304,0.000243,0.000229,0.316117,0.56027
4,0.675251,0.00109,0.005501,9.9e-05,5e-05,9.9e-05,0.000248,0.000198,0.000155,0.361786,0.64274


In [48]:
cn_x = cn.drop(['engagement_score'], axis =1)
cn_y = cn['engagement_score']

cn_x_train, cn_x_test, cn_y_train, cn_y_test = train_test_split(cn_x, cn_y, test_size= .2)

In [49]:

model_name_lst = []
Rsquare = []
R2_Score = []
mape_test = []
mse_test = []

#Scores = pd.DataFrame({'R2 Score' : R2_Score, 'Rsquare': Rsquare, 'MSE': mse_test, 'MAPE': mape_test}, index= [model_name])

def model_scores(model_name, model_obj):
    
    pred_y = model_obj.predict(cn_x_test)
    
    print(pred_y)
    
    err_test = cn_y_test - pred_y
    
    Rsquare.append(model_obj.score(cn_x_test, cn_y_test) )# Rsquare
    
    R2_Score.append( r2_score(cn_y_test, pred_y)) # R2 score
    
    mape_test.append(np.mean(np.abs(err_test * 100 / cn_y_test))) # MAPE
    
    mse_test.append( mean_squared_error(cn_y_test, pred_y)) # MSE
    
    model_name_lst.append(model_name)
    
    #print(Scores)
    

In [50]:
# Linear Regression Implementation

lr = LinearRegression()

lr.fit(cn_x_train, cn_y_train)

model_scores( 'LR',lr)

[3.83138533 3.02899388 2.87404137 ... 3.52289758 3.66607885 2.92717162]


In [65]:
# XGBoost Regressor Implementation

xgbr = XGBRegressor(verbosity = 0)

xgbr.fit(cn_x, cn_y)

model_scores('XGBoost', xgbr) 

[3.8693562 3.2434616 2.434934  ... 3.542472  3.3972754 2.4715118]


In [52]:
# Decision Tree Implementation

dt  = DecisionTreeRegressor()

dt.fit(cn_x_train, cn_y_train)

model_scores('Decision Tree', dt)

[3.77 1.95 1.37 ... 3.22 4.39 1.15]


In [53]:
# Random Forest Implementation

rf = RandomForestRegressor()

rf.fit(cn_x_train, cn_y_train)

model_scores('Random Forest', rf)

[3.739  2.8824 1.8694 ... 3.6411 3.8915 1.5259]


In [54]:
# AdaBoost Boosting Implementation 

ada = AdaBoostRegressor()

ada.fit(cn_x_train, cn_y_train)

model_scores('AdaBoost', ada)

[3.80962553 2.89326249 2.79873455 ... 3.36557183 3.36557183 2.79873455]


In [55]:
Scores = pd.DataFrame({'R2 Score' : R2_Score, 'Rsquare': Rsquare, 'MSE': mse_test, 'MAPE': mape_test}, index= [model_name_lst])
Scores   

Unnamed: 0,R2 Score,Rsquare,MSE,MAPE
LR,0.229489,0.229489,0.562459,inf
XGBoost,0.366677,0.366677,0.462315,inf
Decision Tree,-0.119847,-0.119847,0.817468,inf
Random Forest,0.328495,0.328495,0.490186,inf
AdaBoost,0.246119,0.246119,0.550319,inf


In [56]:
# Test Data Pre processing and Model Performance 

In [74]:
tf = pd.read_csv('test_1zqHu22.csv')
tf.head()
tf.shape

(11121, 9)

In [75]:
tf1 = tf.copy()
tf = tf.drop('row_id', axis=1)

In [76]:

tf['user_cat'] = tf['user_id'].astype(str) + '_' + tf['category_id'].astype(str)
# cn['user_cat'] = cn['user_cat'].astype(float)

# Creating new column 'user_vid' which will represent info of users with interest in a particular video 
tf['user_vid'] = tf['user_id'].astype(str) + '_' + tf['video_id'].astype(str)
# cn['user_vid'] = cn['user_vid'].astype(float)

tf.head()

Unnamed: 0,user_id,category_id,video_id,age,gender,profession,followers,views,user_cat,user_vid
0,7986,12,42,14,Male,Student,180,138,7986_12,7986_42
1,11278,34,115,14,Male,Student,230,840,11278_34,11278_115
2,17245,8,110,44,Female,Working Professional,280,628,17245_8,17245_110
3,9851,16,137,18,Male,Student,270,462,9851_16,9851_137
4,16008,34,96,47,Female,Other,230,840,16008_34,16008_96


In [77]:
tf.category_id = le.fit_transform(tf.category_id)
tf.gender = le.fit_transform(tf.gender)
tf.profession = le.fit_transform(tf.profession)
tf.user_id = le.fit_transform(tf.user_id)
tf.video_id = le.fit_transform(tf.video_id)
tf.followers = le.fit_transform(tf.followers)
tf.user_cat = le.fit_transform(tf.user_cat)
tf.user_vid = le.fit_transform(tf.user_vid)

In [78]:
# Categorize the Views column
tf.loc[tf['views'] <= 229, 'views'] = 1
tf.loc[(tf['views'] > 229) & (tf['views'] <= 714), 'views'] = 2
tf.loc[tf['views'] > 714, 'views'] = 3

tf.views.value_counts()

2    5411
1    3681
3    2029
Name: views, dtype: int64

In [79]:
# Categorize the Views column
tf.loc[tf['age'] <= 18, 'age'] = 1
tf.loc[(tf['age'] > 18) & (tf['age'] <= 32), 'age'] = 2
tf.loc[tf['age'] > 32, 'age'] = 3

tf.age.value_counts()

2    5296
1    3334
3    2491
Name: age, dtype: int64

In [80]:
tf.shape
cn_x.shape

(89197, 10)

In [81]:
y_pred = xgbr.predict(tf)
y_pred

array([4.1637754, 3.9865012, 2.859155 , ..., 4.4768076, 3.5806081,
       3.466711 ], dtype=float32)

In [25]:
import csv 
    
# field names 
fields = ['row_id', 'engagement_score'] 
    
# name of csv file 
filename = "Predicted_Values.csv"
    
# writing to csv file 
with open(filename, 'a', newline='') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile, dialect='excel') 
        
    # writing the fields 
    csvwriter.writerow(fields) 
        
    # writing the data rows 
    for w in range(0, len(y_pred)):
        csvwriter.writerow([tf1.row_id[w], y_pred[w]])

# Conclusion : 

<h3>R2 Score : 0.12659 </h3>
<h4>
1]  We have managed to get an R2 Score : 0.12659 using XGBoost and Label Encoder. <br><br>
2]  XGBoost has proved to be the best fit for predicting our numerical continuous target variable    <br><br>
2]  Since our Correlation between features initially shows a maximum of 0.4 units between Gender and Target Variable we cannot get higher R2 score that 0.16 </h4>