### Import Required Files

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder,Normalizer
from sklearn.model_selection import cross_val_score
import category_encoders as ce

### Read train & test files

In [48]:
data=pd.read_csv('train_0OECtn8.csv')
test_data=pd.read_csv('test_1zqHu22.csv')

### Drop row_id and video_id column

In [49]:
### row_id is unique so it don't show any realtionship
### video_id also don't show any specific realtionship with engagement_score 
### and also it also show correlation with category_id

data=data.drop(['row_id','video_id'],axis=1)

## Save row_id for prediction
row_id=test_data['row_id']
test_data=test_data.drop(['row_id','video_id'],axis=1)

In [50]:
data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views,engagement_score
0,19990,37,24,Male,Student,180,1000,4.33
1,5304,32,14,Female,Student,330,714,1.79
2,1840,12,19,Male,Student,180,138,4.35
3,12597,23,19,Male,Student,220,613,3.77
4,13626,23,27,Male,Working Professional,220,613,3.13


## Add new column for age group

In [51]:
### Function to create multiple groups of ages
### As we shown that younger age person video showing more engagement_score compare to others
### We can create 3 different groups

def get_age_group(value):
    ### Teenager
    if value<18:
        return 1
    ### Younger
    elif value>=18 and value < 30:
    ### Mid Age Person
        return 2
    elif value>=30 and value < 50:
        return 3
    ### Older
    else:
        return 4

In [52]:
data['age_group']=list(map(get_age_group,data['age']))
test_data['age_group']=list(map(get_age_group,test_data['age']))

## Perform Encoding for categorical columns

In [53]:
cate_dict=data.groupby(['category_id'])['engagement_score'].mean().to_dict()

def get_cat_dict(key):
    return round(cate_dict[key],2)

data['category_id']=list(map(get_cat_dict,data['category_id']))
test_data['category_id']=list(map(get_cat_dict,test_data['category_id']))

In [54]:
### Final targte column value if very low we can scale the value so modelcan learn small
### changes easily later we convert to intial format
data['engagement_score']=data['engagement_score']*100
data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views,engagement_score,age_group
0,19990,3.31,24,Male,Student,180,1000,433.0,2
1,5304,3.44,14,Female,Student,330,714,179.0,1
2,1840,3.57,19,Male,Student,180,138,435.0,2
3,12597,3.96,19,Male,Student,220,613,377.0,2
4,13626,3.96,27,Male,Working Professional,220,613,313.0,2


In [55]:
### Encoding for gender column
data['gender_new']=pd.get_dummies(data['gender'],drop_first=True)
test_data['gender_new']=pd.get_dummies(test_data['gender'],drop_first=True)

In [56]:
### Encoding for profession column
data=pd.concat([data,pd.get_dummies(data['profession'],drop_first=True)],axis=1)
test_data=pd.concat([test_data,pd.get_dummies(test_data['profession'],drop_first=True)],axis=1)

In [57]:
### Remove extra columns
data=data.drop(['gender','profession'],axis=1)
test_data=test_data.drop(['gender','profession'],axis=1)

## Add duplicate data


In [58]:
### our model is not learning all the relationship properly
### due to lack ok features and data
### so we can add some duplicate data

data=pd.concat([data,data])

## Train Test Split

In [59]:
y=data['engagement_score']
x=data.drop(['engagement_score'],axis=1)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [67]:
X_test.shape

(44599, 9)

## Perform Scaling

In [68]:
scaler = Normalizer()

In [69]:
X_train=scaler.fit_transform(X_train)
X_train=pd.DataFrame(X_train,columns=x.columns)

In [70]:
X_train.head()

Unnamed: 0,user_id,category_id,age,followers,views,age_group,gender_new,Student,Working Professional
0,0.932159,0.001159,0.008727,0.095585,0.349092,0.000831,0.0,0.000416,0.0
1,0.998277,0.000191,0.001213,0.010393,0.057741,0.000115,5.8e-05,5.8e-05,0.0
2,0.998708,0.000266,0.001182,0.020686,0.046397,7.4e-05,7.4e-05,7.4e-05,0.0
3,0.999894,0.000123,0.000439,0.008776,0.011592,3.7e-05,3.7e-05,3.7e-05,0.0
4,0.999724,0.000156,0.001009,0.011844,0.020266,8.8e-05,4.4e-05,4.4e-05,0.0


In [71]:
X_test=scaler.transform(X_test)
X_test=pd.DataFrame(X_test,columns=x.columns)

## Train Model & Check Performance

In [42]:
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor     
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor,   GradientBoostingRegressor                                                 


In [72]:
model =LGBMRegressor(bagging_fraction=0.8, bagging_freq=6, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.7,
              importance_type='split', learning_rate=0.15, max_depth=-1,
              min_child_samples=26, min_child_weight=0.001, min_split_gain=0.5,
              n_estimators=1000, n_jobs=-1, num_leaves=200, objective=None,
              random_state=5272, reg_alpha=0.1, reg_lambda=5, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [73]:
model.fit(X_train, y_train)



LGBMRegressor(bagging_fraction=0.8, bagging_freq=6, feature_fraction=0.7,
              learning_rate=0.15, min_child_samples=26, min_split_gain=0.5,
              n_estimators=1000, num_leaves=200, random_state=5272,
              reg_alpha=0.1, reg_lambda=5)

In [74]:
predictions=model.predict(X_test)

In [75]:
r2_score(y_test, predictions)

ValueError: Found input variables with inconsistent numbers of samples: [44599, 133795]

In [78]:
X_test.shape

(133795, 9)