In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-2.3.6-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 5.2 MB/s 
[?25hCollecting mlxtend>=0.17.0
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 40.3 MB/s 
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting mlflow
  Downloading mlflow-1.23.1-py3-none-any.whl (15.6 MB)
[K     |████████████████████████████████| 15.6 MB 520 kB/s 
[?25hCollecting imbalanced-learn==0.7.0
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 47.3 MB/s 
[?25hCollecting Boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 2.9 MB/s 
Collecting umap-learn
  Downloading umap-learn-0.5.2.tar.gz (86 kB)
[K     |████████████████████████████████| 86 kB 3.7 MB/s 
Collecting lightgbm>=2.3.1
  Downloading lightgbm-3.3.2-py3-none-manylinux1

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,Normalizer

In [3]:
data=pd.read_csv('train.csv')
data.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35
3,4,12597,23,112,19,Male,Student,220,613,3.77
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13


In [4]:
def get_age_group(value):
    if value<18:
        return 1
    elif value>=18 and value < 30:
        return 2
    elif value>=30 and value < 50:
        return 3
    else:
        return 4
data['age_group']=list(map(get_age_group,data['age']))

data=data.drop(['row_id'],axis=1)
data['gender_new']=pd.get_dummies(data['gender'],drop_first=True)

data=pd.concat([data,pd.get_dummies(data['profession'],drop_first=True)],axis=1)
data=data.drop(['gender','profession'],axis=1)

y=data['engagement_score']
x=data.drop(['engagement_score'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
scaler = Normalizer()

X_train_s=scaler.fit_transform(X_train)
X_train_s=pd.DataFrame(X_train_s,columns=X_train.columns)

X_test_s=scaler.transform(X_test)
X_test_s=pd.DataFrame(X_test_s,columns=X_test.columns)


In [5]:
import pycaret
# from pycaret.regression import *
from pycaret.regression import *

  defaults = yaml.load(f)


In [6]:
clf1 = setup(data = data, target = 'engagement_score')
# compare models
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.5228,0.4805,0.6931,0.3573,0.1896,0.248,0.468
rf,Random Forest Regressor,0.5208,0.4891,0.6993,0.3456,0.1894,0.2414,25.233
gbr,Gradient Boosting Regressor,0.5356,0.4956,0.7039,0.3372,0.1924,0.2539,6.654
lr,Linear Regression,0.5616,0.5326,0.7297,0.2876,0.1981,0.2648,0.699
ridge,Ridge Regression,0.5616,0.5327,0.7297,0.2876,0.1982,0.265,0.074
br,Bayesian Ridge,0.5616,0.5326,0.7297,0.2876,0.1982,0.265,0.173
lar,Least Angle Regression,0.562,0.5338,0.7305,0.2861,0.1983,0.2652,0.089
omp,Orthogonal Matching Pursuit,0.579,0.5627,0.75,0.2474,0.2034,0.2718,0.067
ada,AdaBoost Regressor,0.6044,0.5757,0.7587,0.2299,0.2003,0.2626,1.538
et,Extra Trees Regressor,0.5762,0.6071,0.7791,0.1878,0.2087,0.2596,18.828


In [7]:
# train model
dt = create_model('gbr')

# tune model
tuned_dt = tune_model(dt)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.518,0.4676,0.6838,0.3759,0.1862,0.2445
1,0.515,0.468,0.6841,0.3641,0.1859,0.2191
2,0.5227,0.4847,0.6962,0.3643,0.1918,0.2481
3,0.5248,0.4746,0.6889,0.3514,0.1883,0.2644
4,0.5177,0.4719,0.687,0.3686,0.1881,0.2386
5,0.5316,0.4973,0.7052,0.3545,0.1926,0.2385
6,0.5161,0.4762,0.6901,0.3755,0.1913,0.2007
7,0.5238,0.486,0.6972,0.3516,0.1888,0.3303
8,0.5121,0.4611,0.679,0.3515,0.1834,0.2369
9,0.5262,0.4791,0.6922,0.3662,0.1897,0.241


In [8]:
tuned_dt

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls',
                          max_depth=11, max_features=1.0, max_leaf_nodes=None,
                          min_impurity_decrease=0.005, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=7,
                          min_weight_fraction_leaf=0.0, n_estimators=90,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=4070, subsample=0.3, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
tuned_dt

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls',
                          max_depth=11, max_features=1.0, max_leaf_nodes=None,
                          min_impurity_decrease=0.005, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=7,
                          min_weight_fraction_leaf=0.0, n_estimators=90,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=4070, subsample=0.3, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)