In [2624]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder,Normalizer
from sklearn.model_selection import cross_val_score
import category_encoders as ce

In [2625]:
# Load the training set from a CSV in the current working directory and preview the first rows.
data=pd.read_csv('train_0OECtn8.csv')
data.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35
3,4,12597,23,112,19,Male,Student,220,613,3.77
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13


In [2626]:
def get_age_group(value):
    """Map an age to an ordinal bucket.

    Buckets: 1 for ages below 18, 2 for [18, 30), 3 for [30, 50) and 4 for
    everything else (including values that compare False against every bound).
    """
    # Walk the exclusive upper bounds in ascending order; the first bound the
    # value falls under decides the bucket.
    for bucket, upper_bound in enumerate((18, 30, 50), start=1):
        if value < upper_bound:
            return bucket
    return 4

In [2627]:
# Derive the ordinal age bucket for every training row.
data['age_group']=[get_age_group(age) for age in data['age']]
data.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,age_group
0,1,19990,37,128,24,Male,Student,180,1000,4.33,2
1,2,5304,32,132,14,Female,Student,330,714,1.79,1
2,3,1840,12,24,19,Male,Student,180,138,4.35,2
3,4,12597,23,112,19,Male,Student,220,613,3.77,2
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13,2


In [2628]:
# Load the test set; keep row_id aside for the submission file, then remove
# the identifier columns that are not used as features.
test_data=pd.read_csv('test_1zqHu22.csv')
row_id=test_data.loc[:,'row_id']
test_data=test_data.drop(columns=['row_id','video_id'])
test_data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views
0,7986,12,14,Male,Student,180,138
1,11278,34,14,Male,Student,230,840
2,17245,8,44,Female,Working Professional,280,628
3,9851,16,18,Male,Student,270,462
4,16008,34,47,Female,Other,230,840


In [2629]:
# Target-encode category_id: each category is replaced by the mean engagement
# score observed for it in the training data.
# NOTE(review): the means are computed over all training rows, including the
# row being encoded, and the column is overwritten in place, so this cell is
# not idempotent - reload the data before re-running it.
cate_dict=data.groupby(['category_id'])['engagement_score'].mean().to_dict()
# Fallback for categories that never occur in the training data.
default_cat_score=data['engagement_score'].mean()

def get_cat_dict(key):
    """Return the mean training engagement score for category `key`, rounded to 2 decimals.

    Categories that are absent from the training data fall back to the overall
    training mean instead of raising KeyError.
    """
    return round(cate_dict.get(key,default_cat_score),2)

data['category_id']=list(map(get_cat_dict,data['category_id']))
test_data['category_id']=list(map(get_cat_dict,test_data['category_id']))
# data=data.drop(['category_id'],axis=1)
# test_data=test_data.drop(['category_id'],axis=1)

# data.head()


In [2630]:
# Remove identifier columns and express the target in hundredths (score x 100).
# This cell is not idempotent: re-running it scales the target again.
data=data.drop(columns=['row_id','video_id'])
data['engagement_score']=100*data['engagement_score']
data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views,engagement_score,age_group
0,19990,3.31,24,Male,Student,180,1000,433.0,2
1,5304,3.44,14,Female,Student,330,714,179.0,1
2,1840,3.57,19,Male,Student,180,138,435.0,2
3,12597,3.96,19,Male,Student,220,613,377.0,2
4,13626,3.96,27,Male,Working Professional,220,613,313.0,2


In [2631]:
test_data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views
0,7986,3.57,14,Male,Student,180,138
1,11278,2.79,14,Male,Student,230,840
2,17245,3.6,44,Female,Working Professional,280,628
3,9851,3.56,18,Male,Student,270,462
4,16008,2.79,47,Female,Other,230,840


In [2632]:
# Derive the same ordinal age bucket for the test rows.
test_data['age_group']=[get_age_group(age) for age in test_data['age']]

In [2633]:
# Encode gender as a single 0/1 flag (1 = 'Male', 0 = anything else). For the
# Female/Male values shown in the previews this matches the column that
# get_dummies(..., drop_first=True) produced, but the explicit comparison
# encodes train and test identically even if one of them lacks a category,
# and the dtype no longer depends on the installed pandas version.
data['gender_new']=(data['gender']=='Male').astype('uint8')
test_data['gender_new']=(test_data['gender']=='Male').astype('uint8')

In [2634]:
data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views,engagement_score,age_group,gender_new
0,19990,3.31,24,Male,Student,180,1000,433.0,2,1
1,5304,3.44,14,Female,Student,330,714,179.0,1,0
2,1840,3.57,19,Male,Student,180,138,435.0,2,1
3,12597,3.96,19,Male,Student,220,613,377.0,2,1
4,13626,3.96,27,Male,Working Professional,220,613,313.0,2,1


In [2635]:
test_data.head()

Unnamed: 0,user_id,category_id,age,gender,profession,followers,views,age_group,gender_new
0,7986,3.57,14,Male,Student,180,138,1,1
1,11278,2.79,14,Male,Student,230,840,1,1
2,17245,3.6,44,Female,Working Professional,280,628,3,0
3,9851,3.56,18,Male,Student,270,462,2,1
4,16008,2.79,47,Female,Other,230,840,3,0


In [2636]:
# One-hot encode profession (the first level is dropped as the baseline).
profession_dummies=pd.get_dummies(data['profession'],drop_first=True,dtype='uint8')
# Encode the test set against the TRAINING columns so both frames expose the
# same indicator features in the same order, even when a profession is
# missing from (or only present in) the test data.
test_profession_dummies=pd.get_dummies(test_data['profession'],dtype='uint8').reindex(
    columns=profession_dummies.columns,fill_value=0)
data=pd.concat([data,profession_dummies],axis=1)
test_data=pd.concat([test_data,test_profession_dummies],axis=1)

In [2637]:
# data=pd.concat([data,pd.get_dummies(data['age_group'],drop_first=True)],axis=1)
# test_data=pd.concat([test_data,pd.get_dummies(test_data['age_group'],drop_first=True)],axis=1)

In [2638]:
# The raw categorical columns are no longer needed once their encoded
# counterparts exist.
data=data.drop(columns=['gender','profession'])
test_data=test_data.drop(columns=['gender','profession'])


In [2639]:
# Stack 28 copies of the training frame on top of each other (the original
# index labels are kept, so they repeat).
# NOTE(review): every row is duplicated before the later train/test split, so
# identical rows end up on both sides of it; re-running this cell multiplies
# the row count by 28 again.
data=pd.concat([data]*28)

In [2640]:
data.shape

(2497516, 10)

In [2641]:
# Separate the feature matrix from the regression target.
x=data.drop(columns='engagement_score')
y=data['engagement_score']

In [2642]:
# Hold out a quarter of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [2643]:
# NOTE(review): per the scikit-learn docs, Normalizer rescales each sample (row)
# to unit norm rather than scaling each feature column - confirm this is intended
# and not a per-feature scaler such as StandardScaler.
scaler = Normalizer()

In [2644]:
# X_train_s=scaler.fit_transform(X_train)
# X_train_s=pd.DataFrame(X_train_s,columns=X_train.columns)

# X_test_s=scaler.transform(X_test)
# X_test_s=pd.DataFrame(X_test_s,columns=X_test.columns)

# Transform the whole feature matrix and wrap the resulting array back into a
# DataFrame. The column names are taken from `x` itself (evaluated before the
# reassignment) instead of from the unrelated X_train split, so this cell no
# longer depends on the train/test split cell having been run.
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

In [2645]:
x.head()

Unnamed: 0,user_id,category_id,age,followers,views,age_group,gender_new,Student,Working Professional
0,0.99871,0.000165,0.001199,0.008993,0.04996,0.0001,5e-05,5e-05,0.0
1,0.989178,0.000642,0.002611,0.061544,0.133159,0.000186,0.0,0.000186,0.0
2,0.992433,0.001926,0.010248,0.097086,0.074432,0.001079,0.000539,0.000539,0.0
3,0.998665,0.000314,0.001506,0.017441,0.048597,0.000159,7.9e-05,7.9e-05,0.0
4,0.998858,0.00029,0.001979,0.016127,0.044936,0.000147,7.3e-05,0.0,7.3e-05


In [2646]:
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor     
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor,   GradientBoostingRegressor                                                 


In [2647]:
# LightGBM regressor fitted on the full feature matrix `x` and target `y`
# (the train/test split made earlier is not used for this fit).
# NOTE(review): several keyword pairs below look like LightGBM aliases of the
# same setting given different values (bagging_fraction/subsample,
# feature_fraction/colsample_bytree, bagging_freq/subsample_freq) - confirm
# which value actually takes effect.
# NOTE(review): `silent` may not be accepted by newer LightGBM releases -
# confirm against the installed version.
model =LGBMRegressor(bagging_fraction=0.8, bagging_freq=6, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.7,
              importance_type='split', learning_rate=0.15, max_depth=-1,
              min_child_samples=26, min_child_weight=0.001, min_split_gain=0.5,
              n_estimators=1000, n_jobs=-1, num_leaves=200, objective=None,
              random_state=5272, reg_alpha=0.1, reg_lambda=5, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# model=SVR(C=1.0, epsilon=0.2)

# PCA is imported here but is not used in this cell.
from sklearn.decomposition import PCA
model.fit(x, y)



LGBMRegressor(bagging_fraction=0.8, bagging_freq=6, feature_fraction=0.7,
              learning_rate=0.15, min_child_samples=26, min_split_gain=0.5,
              n_estimators=1000, num_leaves=200, random_state=5272,
              reg_alpha=0.1, reg_lambda=5)

In [2648]:
# scores = cross_val_score(model, x, y, cv=5)
# scores

In [2649]:
def predict_fun(data,offset=0.12,output_path='submission.csv'):
    """Predict engagement scores for `data` and write them to a submission CSV.

    Relies on the notebook-level `scaler`, `model` and `row_id` objects.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame; it is passed through `scaler.transform` and then to
        `model.predict`, so it must carry the columns the model was fitted on.
    offset : float, default 0.12
        Constant added to every prediction after rescaling.
        NOTE(review): the origin of the 0.12 default is not documented in the
        notebook - presumably a hand-tuned correction; confirm.
    output_path : str, default 'submission.csv'
        Where the submission file is written.

    Returns
    -------
    pandas.DataFrame
        Columns `row_id` and `engagement_score`, one row per row of `data`.
    """
    features=pd.DataFrame(scaler.transform(data),columns=data.columns)
    predictions=model.predict(features)
    # The target was multiplied by 100 before fitting, so divide by 100 to get
    # back to the original scale. The outer round only strips floating-point
    # noise (e.g. 2.5300000000000002) introduced by adding the offset.
    scores=np.round(np.round(predictions/100,2)+offset,2)
    result=pd.DataFrame({
        'row_id':np.array(row_id),
        'engagement_score':scores,
    })
    result.to_csv(output_path,index=False)

    return result

In [2623]:
predict_fun(test_data)

Unnamed: 0,row_id,engagement_score
0,89198,4.21
1,89199,4.09
2,89200,2.53
3,89201,3.13
4,89202,2.24
...,...,...
11116,100314,4.06
11117,100315,3.30
11118,100316,2.91
11119,100317,3.95


In [2449]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11121 entries, 0 to 11120
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               11121 non-null  int64  
 1   category_id           11121 non-null  float64
 2   age                   11121 non-null  int64  
 3   followers             11121 non-null  int64  
 4   views                 11121 non-null  int64  
 5   age_group             11121 non-null  int64  
 6   gender_new            11121 non-null  uint8  
 7   Student               11121 non-null  uint8  
 8   Working Professional  11121 non-null  uint8  
dtypes: float64(1), int64(5), uint8(3)
memory usage: 554.0 KB


In [2345]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1248758 entries, 0 to 89196
Data columns (total 10 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   user_id               1248758 non-null  int64  
 1   category_id           1248758 non-null  float64
 2   age                   1248758 non-null  int64  
 3   followers             1248758 non-null  int64  
 4   views                 1248758 non-null  int64  
 5   engagement_score      1248758 non-null  float64
 6   age_group             1248758 non-null  int64  
 7   gender_new            1248758 non-null  uint8  
 8   Student               1248758 non-null  uint8  
 9   Working Professional  1248758 non-null  uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 79.8 MB


In [2346]:
# Fit a 4-component PCA (exact full SVD solver) on the feature matrix.
from sklearn.decomposition import PCA
pca = PCA(
    n_components=4,
    svd_solver='full',
)
pca.fit(x)

PCA(n_components=4, svd_solver='full')

In [2347]:
# import pycaret
# from pycaret.regression import *
# data=

In [2348]:
# Building the model (seeded so the importances are reproducible between runs)
extra_tree_forest = ExtraTreesRegressor(n_estimators = 100, max_features = 2,
                                        random_state = 42)

# Training the model
extra_tree_forest.fit(x, y)

# Importance of each feature as reported by the fitted forest
feature_importance = extra_tree_forest.feature_importances_

# Standard deviation of each feature's importance across the individual trees.
# NOTE: despite the variable name, this is a spread measure, not a normalised
# importance.
feature_importance_normalized = np.std([tree.feature_importances_ for tree in
                                        extra_tree_forest.estimators_],
                                        axis = 0)

In [2351]:
feature_importance

array([0.10208494, 0.12208896, 0.10856477, 0.10154061, 0.10808285,
       0.10153806, 0.21098258, 0.09491624, 0.050201  ])

In [2350]:
extra_tree_forest.feature_names_in_

array(['user_id', 'category_id', 'age', 'followers', 'views', 'age_group',
       'gender_new', 'Student', 'Working Professional'], dtype=object)