In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
import time
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, Ridge

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import tensorflow as tf
from tensorflow import keras

# Preparation

In [None]:
# Load the labelled training rows and the unlabelled test rows.
# NOTE(review): the train path is relative while the test path is absolute
# under /content -- confirm both files are reachable from one working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_0OECtn8.csv', '/content/test_1zqHu22.csv')
)

In [None]:
# Mean engagement per category, computed from the training rows only and then
# attached to both frames through the same lookup.
cat_mean_eng = train.groupby('category_id')['engagement_score'].mean().to_dict()
for frame in (train, test):
    frame['avg_cat_eng'] = frame['category_id'].map(cat_mean_eng)


In [None]:
# Mean engagement per profession, computed from the training rows only and
# then attached to both frames through the same lookup.
prof_mean_eng = train.groupby('profession')['engagement_score'].mean().to_dict()
for frame in (train, test):
    frame['avg_prof_eng'] = frame['profession'].map(prof_mean_eng)

In [None]:
# Mean engagement per gender, computed from the training rows only and then
# attached to both frames through the same lookup.
gender_mean_eng = train.groupby('gender')['engagement_score'].mean().to_dict()
for frame in (train, test):
    frame['avg_gender_eng'] = frame['gender'].map(gender_mean_eng)

In [None]:
# Mean engagement per user, computed from the training rows only and then
# attached to both frames through the same lookup.
user_mean_eng = train.groupby('user_id')['engagement_score'].mean().to_dict()
for frame in (train, test):
    frame['avg_user_eng'] = frame['user_id'].map(user_mean_eng)

In [None]:
# Mean engagement for every (category, profession) pair seen in training.
avg_category_prof_eng = train.groupby(['category_id','profession'])['engagement_score'].mean().to_dict()

# Strict [] lookup: a pair that is absent from the training table raises
# KeyError instead of silently producing a missing value.
train['avg_category_prof_eng'] = [
    avg_category_prof_eng[pair]
    for pair in zip(train['category_id'], train['profession'])
]
test['avg_category_prof_eng'] = [
    avg_category_prof_eng[pair]
    for pair in zip(test['category_id'], test['profession'])
]


In [None]:
# Mean engagement for every (category, gender) pair seen in training.
avg_category_gender_eng = train.groupby(['category_id','gender'])['engagement_score'].mean().to_dict()

# Strict [] lookup: a pair that is absent from the training table raises
# KeyError instead of silently producing a missing value.
train['avg_category_gender_eng'] = [
    avg_category_gender_eng[pair]
    for pair in zip(train['category_id'], train['gender'])
]
test['avg_category_gender_eng'] = [
    avg_category_gender_eng[pair]
    for pair in zip(test['category_id'], test['gender'])
]

In [None]:
# Mean / max / min engagement for every (category, user) pair seen in training,
# computed with a single groupby instead of three separate passes.
category_user_stats = train.groupby(['category_id','user_id'])['engagement_score'].agg(['mean', 'max', 'min'])

# One lookup per statistic, each under its own name.  Previously the name
# `avg_category_user_eng` was rebound to the max and then the min dictionary,
# so after this cell it no longer held the averages.
avg_category_user_eng = category_user_stats['mean'].to_dict()
max_category_user_eng = category_user_stats['max'].to_dict()
min_category_user_eng = category_user_stats['min'].to_dict()

for frame in (train, test):
    # Strict [] lookup: a (category, user) pair that never occurs in training
    # raises KeyError, exactly as the original per-row loops did.
    pairs = list(zip(frame['category_id'], frame['user_id']))
    frame['avg_category_user_eng'] = [avg_category_user_eng[pair] for pair in pairs]
    frame['max_category_user_eng'] = [max_category_user_eng[pair] for pair in pairs]
    frame['min_category_user_eng'] = [min_category_user_eng[pair] for pair in pairs]

In [None]:
# Spread between the highest and lowest engagement recorded for the row's
# (category, user) pair.
for frame in (train, test):
    frame['user_cat_eng_range_dif'] = frame['max_category_user_eng'].sub(frame['min_category_user_eng'])

In [None]:
# Mean engagement for every (category, age) pair seen in training.
avg_category_age_eng = train.groupby(['category_id','age'])['engagement_score'].mean().to_dict()

# Strict [] lookup: a pair that is absent from the training table raises
# KeyError instead of silently producing a missing value.
train['avg_category_age_eng'] = [
    avg_category_age_eng[pair]
    for pair in zip(train['category_id'], train['age'])
]
test['avg_category_age_eng'] = [
    avg_category_age_eng[pair]
    for pair in zip(test['category_id'], test['age'])
]

In [None]:
# Mean engagement for every (video, age) pair seen in training.
avg_video_age_eng = train.groupby(['video_id','age'])['engagement_score'].mean().to_dict()

# Training rows index the lookup directly; a missing pair raises KeyError.
train['avg_video_age_eng'] = [
    avg_video_age_eng[pair]
    for pair in zip(train['video_id'], train['age'])
]

# Test rows may contain pairs that never occur in training.  Those become NaN
# via dict.get (replacing a broad `except Exception`) and use `np.nan`,
# because the `np.NaN` alias was removed in NumPy 2.0.
test['avg_video_age_eng'] = [
    avg_video_age_eng.get(pair, np.nan)
    for pair in zip(test['video_id'], test['age'])
]

# Unseen pairs are imputed with the mean of the values that could be looked up
# for the test frame, then the column is stored as float32.
test['avg_video_age_eng'] = test['avg_video_age_eng'].fillna(test['avg_video_age_eng'].mean())
test['avg_video_age_eng'] = test['avg_video_age_eng'].astype('float32')

In [None]:
# Mean engagement for every (profession, age) pair seen in training.
avg_prof_age_eng = train.groupby(['profession','age'])['engagement_score'].mean().to_dict()

# Training rows index the lookup directly; a missing pair raises KeyError.
train['avg_prof_age_eng'] = [
    avg_prof_age_eng[pair]
    for pair in zip(train['profession'], train['age'])
]

# Test rows may contain pairs that never occur in training.  Those become NaN
# via dict.get (replacing a broad `except Exception`) and use `np.nan`,
# because the `np.NaN` alias was removed in NumPy 2.0.
test['avg_prof_age_eng'] = [
    avg_prof_age_eng.get(pair, np.nan)
    for pair in zip(test['profession'], test['age'])
]

# Unseen pairs are imputed with the mean of the values that could be looked up
# for the test frame, then the column is stored as float32.
test['avg_prof_age_eng'] = test['avg_prof_age_eng'].fillna(test['avg_prof_age_eng'].mean())
test['avg_prof_age_eng'] = test['avg_prof_age_eng'].astype('float32')

In [None]:
# Mean engagement for every (video, gender) pair seen in training.
avg_video_gender_eng = train.groupby(['video_id','gender'])['engagement_score'].mean().to_dict()

# Strict [] lookup: a pair that is absent from the training table raises
# KeyError instead of silently producing a missing value.
train['avg_video_gender_eng'] = [
    avg_video_gender_eng[pair]
    for pair in zip(train['video_id'], train['gender'])
]
test['avg_video_gender_eng'] = [
    avg_video_gender_eng[pair]
    for pair in zip(test['video_id'], test['gender'])
]

In [None]:
# Number of training rows for every (category, gender) pair, i.e. how large
# each gender's audience is within a category.
cat_gender_aud = train.groupby('category_id')['gender'].value_counts().to_dict()

# Strict [] lookup: a pair that is absent from the training counts raises
# KeyError instead of silently producing a missing value.
train['cat_gender_aud'] = [
    cat_gender_aud[pair]
    for pair in zip(train['category_id'], train['gender'])
]
test['cat_gender_aud'] = [
    cat_gender_aud[pair]
    for pair in zip(test['category_id'], test['gender'])
]

In [None]:
# Binary-encode gender in place: 1 for 'Female', 0 for every other value.
# This must run after the gender-keyed lookups above, which index on the raw labels.
for frame in (train, test):
    frame['gender'] = frame['gender'].eq('Female').astype('int64')

In [None]:
# Ratio of a row's views to its followers.
for frame in (train, test):
    frame['views_per_follower'] = frame['views'].div(frame['followers'])

In [None]:
# Number of distinct categories each user appears with in the training rows,
# attached to both frames through the same lookup.
user_category_count = train.groupby('user_id')['category_id'].nunique().to_dict()
for frame in (train, test):
    frame['user_number_of_cat'] = frame['user_id'].map(user_category_count)

In [None]:
# Flag rows whose age is strictly under 18 (1) versus everything else (0).
for frame in (train, test):
    frame['is_below18'] = frame['age'].lt(18).astype('int64')

In [None]:
# Mean engagement per video, computed from the training rows only and then
# attached to both frames through the same lookup.
video_mean_eng = train.groupby('video_id')['engagement_score'].mean().to_dict()
for frame in (train, test):
    frame['avg_video_eng'] = frame['video_id'].map(video_mean_eng)

In [None]:
# The twelve mean-encoded columns that are blended into one feature.
expected_eng_components = [
    'avg_cat_eng', 'avg_prof_eng', 'avg_gender_eng', 'avg_user_eng',
    'avg_category_prof_eng', 'avg_category_gender_eng',
    'avg_category_user_eng', 'avg_category_age_eng',
    'avg_user_gender_eng', 'avg_video_cat_eng',
    'avg_video_gender_eng', 'avg_video_eng',
]

# `avg_user_gender_eng` and `avg_video_cat_eng` are read here and in the final
# column selection, but no cell in this notebook creates them, so a fresh
# Restart & Run All fails with KeyError.  Build them when they are absent.
# NOTE(review): the grouping keys below are inferred from the column names and
# from the pattern of the other pairwise features -- confirm they match the
# definitions used in the original run.
inferred_pair_features = {
    'avg_user_gender_eng': ['user_id', 'gender'],
    'avg_video_cat_eng': ['video_id', 'category_id'],
}
for feature, (first_key, second_key) in inferred_pair_features.items():
    if feature in train.columns and feature in test.columns:
        continue
    pair_mean = train.groupby([first_key, second_key])['engagement_score'].mean().to_dict()
    for frame in (train, test):
        if feature not in frame.columns:
            # Pairs without a training mean become NaN rather than raising.
            frame[feature] = [
                pair_mean.get(pair, np.nan)
                for pair in zip(frame[first_key], frame[second_key])
            ]

# Plain average of the component columns.  They are added one by one, in the
# original left-to-right order, so a NaN in any component still yields NaN.
for frame in (train, test):
    component_total = 0
    for column in expected_eng_components:
        component_total = component_total + frame[column]
    frame['expected_avg_user_eng'] = component_total / len(expected_eng_components)
                                               

In [None]:
# Keep only the first training row for each user_id.
# NOTE(review): this discards every later row of a user after the aggregate
# features were computed from all rows -- confirm that is intended.
train = train.drop_duplicates(subset='user_id', keep='first')

In [None]:
# One-hot encode the remaining categorical columns of each frame.  The frames
# are encoded independently of each other.
train, test = (pd.get_dummies(frame) for frame in (train, test))

In [None]:
# Model inputs, listed once so train and test always share the same columns in
# the same order.
feature_columns = [
    'avg_category_user_eng', 'min_category_user_eng',
    'max_category_user_eng', 'expected_avg_user_eng', 'avg_user_eng',
    'avg_user_gender_eng', 'avg_video_gender_eng',
    'avg_category_gender_eng', 'avg_video_age_eng', 'avg_category_prof_eng',
    'avg_category_age_eng', 'avg_gender_eng', 'gender',
    'avg_video_cat_eng',
]

# The training frame keeps the target as its first column; test has no target.
train = train[['engagement_score'] + feature_columns]
test = test[feature_columns]

# Models

In [None]:
# Hold out 20% of the training rows for validation; random_state=0 keeps the
# split reproducible across runs.
train1, val = train_test_split(train, test_size=0.2, random_state=0)

In [None]:
# Separate the target column from the model inputs for both partitions.
y_train = train1['engagement_score']
x_train = train1.drop(columns='engagement_score')

y_val = val['engagement_score']
x_val = val.drop(columns='engagement_score')

In [None]:
# Learn the standardisation statistics from the training partition only, then
# apply the same transform to train, validation and test.  The three names are
# rebound to the scaled outputs, so this cell must not be re-run on its own.
scaler = StandardScaler().fit(x_train)
x_train, x_val, test = (scaler.transform(part) for part in (x_train, x_val, test))

  "X does not have valid feature names, but"


In [None]:
def build_models(X,y):
    """Fit the baseline regressors on ``(X, y)`` and time each fit.

    The original body built ``models`` and ``training_times`` but never
    returned them, so ``models, training_dur = build_models(...)`` failed when
    unpacking ``None``.

    Parameters
    ----------
    X : array-like
        Training features, passed unchanged to every estimator's ``fit``.
    y : array-like
        Training target, passed unchanged to every estimator's ``fit``.

    Returns
    -------
    models : dict
        Fitted estimators keyed by display name.
    training_dur : pandas.DataFrame
        Indexed by the same display names, with one "Training Time" column
        holding each fit's wall-clock duration rounded to whole seconds.  The
        column name matches the header of the recorded evaluation table, and
        the index lets ``model_evaluation`` merge it onto the metrics.

    Notes
    -----
    The recorded outputs in this notebook also list 'Elastic Net',
    'Random Forest' and 'XG Random Forest'; this function does not train them.
    """
    # Display name -> unfitted estimator, in the original training order.
    estimators = {
        "Linear Regression": LinearRegression(),
        "SGD": SGDRegressor(early_stopping=True),
        "GB": GradientBoostingRegressor(),
        "Ridge": Ridge(alpha=0.5),
        "XG Boost": XGBRegressor(),
    }

    models = {}
    training_times = {}
    for name, estimator in estimators.items():
        start_time = time.time()
        estimator.fit(X, y)
        # Whole seconds, as in the original timedelta-based bookkeeping.
        training_times[name] = round(time.time() - start_time)
        models[name] = estimator

    training_dur = pd.DataFrame({"Training Time": pd.Series(training_times)})
    return models, training_dur
    


def model_evaluation(models, X_test, y_test, training_dur=None, nn=False):
    """Score every model on a held-out set and return one row per model.

    Parameters
    ----------
    models : dict or iterable
        With ``nn=False`` (default), a mapping of display name -> fitted
        estimator.  With ``nn=True``, each item yielded by iterating ``models``
        is itself used as the predictor and as the row label.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True target values.
    training_dur : pandas.DataFrame or named Series, optional
        Training durations indexed like the metrics table.  When given, it is
        inner-joined on the index.  When omitted, only the metrics are returned
        (the original passed ``None`` straight to ``pd.merge``, which raises).
    nn : bool, default False
        Selects how the predictor is obtained from ``models`` (see above).

    Returns
    -------
    pandas.DataFrame
        Columns "EVS", "R2 Score", "Neg MAG", "Neg MSE", "Neg RMSE", plus the
        columns of ``training_dur`` when supplied.  Despite the "Neg" prefix
        the error columns hold the plain, non-negated MAE / MSE / RMSE.  The
        labels are kept unchanged so existing outputs stay comparable.
    """
    scores = {}

    for model in models:
        predictor = model if nn else models[model]
        pred = predictor.predict(X_test)

        EVS = explained_variance_score(y_test, pred)
        R2_Score = r2_score(y_test, pred)
        neg_MAG = mean_absolute_error(y_test, pred)
        neg_MSE = mean_squared_error(y_test, pred)
        # RMSE is derived from the MSE already computed above.
        neg_RMSE = np.sqrt(neg_MSE)

        scores[model] = [EVS, R2_Score, neg_MAG, neg_MSE, neg_RMSE]

    metric_df = pd.DataFrame(scores, index=["EVS","R2 Score","Neg MAG","Neg MSE", "Neg RMSE"]).T

    if training_dur is None:
        return metric_df

    return pd.merge(metric_df, training_dur, left_index=True, right_index=True)

In [None]:
# Fit every baseline model on the scaled training partition; the call is
# unpacked into the fitted models and their training durations.
models, training_dur = build_models(x_train, y_train)



In [None]:
# Display the model objects returned by build_models.
models

{'Elastic Net': ElasticNet(),
 'GB': GradientBoostingRegressor(),
 'Linear Regression': LinearRegression(),
 'Random Forest': RandomForestRegressor(),
 'Ridge': Ridge(alpha=0.5),
 'SGD': SGDRegressor(early_stopping=True),
 'XG Boost': XGBRegressor(),
 'XG Random Forest': XGBRFRegressor()}

In [None]:
# Score every model on the validation partition and show the metrics together
# with the training durations.
model_evaluation(models, x_val, y_val, training_dur)

Unnamed: 0,EVS,R2 Score,Neg MAG,Neg MSE,Neg RMSE,Training Time
Linear Regression,0.365403,-14.773794,3.392402,11.990796,3.462773,0
SGD,-0.012511,-18.768757,3.775973,15.027655,3.876552,0
GB,0.49655,-12.509782,3.144382,10.269757,3.204646,25
Ridge,0.335228,-15.033043,3.417971,12.18787,3.491113,0
Elastic Net,0.538795,-0.950822,1.102769,1.48296,1.217769,0
XG Boost,0.496521,-12.500094,3.143193,10.262393,3.203497,6
XG Random Forest,0.056263,-7.768488,2.451749,6.665559,2.581774,4
Random Forest,0.471597,-12.537585,3.144765,10.290892,3.207942,116


In [None]:
# Load the sample submission file; its engagement_score column is overwritten
# with model predictions further down.
submisson = pd.read_csv('/content/sample_submission_JPlpRcN.csv')

In [None]:
# Inspect the submission template before it is filled in.
submisson

Unnamed: 0,row_id,engagement_score
0,89198,5.0
1,89199,5.0
2,89200,5.0
3,89201,5.0
4,89202,5.0
...,...,...
11116,100314,5.0
11117,100315,5.0
11118,100316,5.0
11119,100317,5.0


In [None]:
# Fill the submission with the SGD model's predictions on the scaled test set.
# NOTE(review): in the recorded validation table SGD has the lowest R2 Score of
# all listed models -- confirm it is the model intended for submission.
submisson['engagement_score'] = models['SGD'].predict(test)


In [None]:
# Write the filled-in submission to disk without the DataFrame index column.
submisson.to_csv('submission_1.csv',index=False)

In [None]:
# Persist the fitted SGD model.  The context manager closes the file handle
# even if pickling fails; the original passed an unclosed open() to pickle.dump.
# Only load this file back from a trusted location: unpickling can run code.
filename = 'SGD 48.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(models['SGD'], model_file)