In [None]:
! pip install catboost

### Importing all the libraries and models

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn import tree
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
import xgboost
import lightgbm as lgb
from xgboost.sklearn import XGBRegressor
import catboost as cat

#### Loading the training and testing data

In [None]:
# All competition CSVs live in one Google Drive folder; keeping the folder in a
# single constant means the location only has to be edited in one place.
DATA_DIR = "/content/drive/MyDrive/Analytics_jobthakon_data"

TrainData = pd.read_csv(f"{DATA_DIR}/train_0OECtn8.csv")  # training data file
TestData = pd.read_csv(f"{DATA_DIR}/test_1zqHu22.csv")  # testing data file
SampleData = pd.read_csv(f"{DATA_DIR}/sample_submission_JPlpRcN.csv")  # sample submission file

In [None]:
# Keep the label column (engagement_score) in its own variable.
Target = TrainData.engagement_score

#### Separating numerical and categorical columns

In [None]:
# Columns that are label-encoded before modelling.
categorical_columns = ["gender", "profession"]

# Columns that are used as they come from the CSV files.
numerical_columns = [
    "row_id",
    "user_id",
    "category_id",
    "video_id",
    "age",
    "followers",
    "views",
]

# Every feature column followed by the label column.
all_columns = [*numerical_columns, *categorical_columns, "engagement_score"]

#### Encoding categorical columns of both train and test data

In [None]:
# Encode train and test together so both share one label mapping per column.
# The test rows get a placeholder engagement_score of -1 on a *copy*
# (DataFrame.assign), so the raw TestData frame is left untouched and this
# cell can be re-run safely.
n_train_rows = len(TrainData)

# Concatenating both train and test data.
Data = pd.concat(
    [TrainData, TestData.assign(engagement_score=-1)]
).reset_index(drop=True)

for feat in categorical_columns:
  lbl_enc = preprocessing.LabelEncoder()
  # Missing values become their own "NONE" category before encoding.
  temp_col = Data[feat].fillna("NONE").astype(str).values

  # fit_transform is enough here: train and test are already combined, so
  # there is no extra data that would need a separate transform.
  # Assign with Data[feat] = ... (not Data.loc[:, feat] = ...) so the column is
  # replaced outright and takes the encoder's integer dtype instead of keeping
  # the original column's dtype.
  Data[feat] = lbl_enc.fit_transform(temp_col)

# Split back by row position rather than by the -1 placeholder value, so a
# training row whose real score happened to be -1 can never end up in `test`.
train = Data.iloc[:n_train_rows].reset_index(drop=True)
test = Data.iloc[n_train_rows:].reset_index(drop=True)


In [None]:
# The placeholder engagement_score column was only added to the test rows for
# the joint encoding step; remove it again. Rebinding `test` (instead of
# inplace=True) together with errors="ignore" keeps this cell re-runnable
# without a KeyError once the column is already gone.
test = test.drop(columns="engagement_score", errors="ignore")

In [None]:
# Persist the encoded train frame as a CSV file (index column omitted).
train.to_csv(
    "/content/drive/MyDrive/Analytics_jobthakon_data/encodedTrainData.csv",
    index=False,
)
# Persist the encoded test frame the same way.
test.to_csv(
    "/content/drive/MyDrive/Analytics_jobthakon_data/encodedTestData.csv",
    index=False,
)

#### Creating folds of encoded Training Data

In [None]:
FOLD_SEED = 42  # fixed so the shuffle, and therefore the fold assignment, is reproducible

# `train1` was never defined before its first use. Presumably it is meant to be
# the encoded training data saved earlier in this notebook, so it is loaded
# from that file here -- NOTE(review): confirm this is the intended source.
train1 = pd.read_csv("/content/drive/MyDrive/Analytics_jobthakon_data/encodedTrainData.csv")

# -1 marks "no fold assigned yet"; every row is overwritten in the loop below.
train1['kfold'] = -1

# Shuffle the rows once, then let KFold cut the shuffled frame into 5 parts.
train1 = train1.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True)

kf = model_selection.KFold(n_splits=5)

for fold, (trn_, val_) in enumerate(kf.split(X=train1)):
  # val_ holds the row positions of this fold's validation part.
  train1.loc[val_, 'kfold'] = fold

train1.to_csv("/content/drive/MyDrive/Analytics_jobthakon_data/encodedfoldTrainData", index=False)


#### Hyperparameter tuning of different models using the 5-fold training data

In [None]:
# Load the encoded 5-fold data and remove the row_id column from it.
# The frame is bound to the lower-case name `traindata` because that is the
# name the `run` tuning function reads; the previous `trainData` spelling left
# `traindata` undefined.
traindata = (
    pd.read_csv("/content/drive/MyDrive/Analytics_jobthakon_data/encodedfoldTrainData")
    .drop(columns='row_id')
)

In [None]:
# function for hyperparameter tuning of different models.

def run(fold, model, params, n_iter=20, random_state=None):
  """Tune `model` with a randomized search and score it on one held-out fold.

  Rows of the module-level `traindata` frame whose `kfold` value differs from
  `fold` are used for the search (itself 5-fold cross-validated and scored by
  R2); rows whose `kfold` equals `fold` are held out and scored afterwards.
  The best cross-validated score, the chosen value of every searched parameter
  and the held-out R2 are printed. Nothing is returned.

  Args:
    fold: value of the `kfold` column to hold out for validation.
    model: estimator handed to RandomizedSearchCV.
    params: mapping of parameter name to candidate values.
    n_iter: number of parameter settings the search samples (default 20).
    random_state: seed forwarded to RandomizedSearchCV; the default None
      leaves the search unseeded.
  """
  is_holdout = traindata.kfold == fold
  df_train = traindata[~is_holdout].reset_index(drop=True)
  df_valid = traindata[is_holdout].reset_index(drop=True)

  # The label and the fold marker are not features.
  non_features = ["engagement_score", "kfold"]

  x_train = df_train.drop(columns=non_features).values
  y_train = df_train.engagement_score.values

  x_valid = df_valid.drop(columns=non_features).values
  y_valid = df_valid.engagement_score.values

  cf = model_selection.RandomizedSearchCV(
      estimator=model,
      param_distributions=params,
      n_iter=n_iter,
      scoring='r2',
      verbose=3,
      n_jobs=3,
      cv=5,
      random_state=random_state,
  )

  cf.fit(x_train, y_train)

  # best_score_ is a score, not a parameter set, so it gets its own label.
  print(f"Best cross-validated R2 score: {cf.best_score_}")
  print("Best parameters set:")
  best_parameters = cf.best_estimator_.get_params()
  for param_name in sorted(params.keys()):
    print(f"\t{param_name}:{best_parameters[param_name]}")

  # The metric computed here is R2, so it is reported under that name.
  preds = cf.predict(x_valid)
  r2 = metrics.r2_score(y_valid, preds)
  print(f"Fold = {fold}, R2 = {r2}")

##### 1. Hyperparameter tuning of the RandomForestRegressor model

In [None]:
# The estimator is named `model1` because that is the name passed to `run`
# below; it was previously bound to `model`, leaving `model1` undefined.
model1 = RandomForestRegressor()

# Parameter grid of the RandomForestRegressor model.
param1 = {
    'n_estimators': [700],
    'max_depth': [10, 20, 30, 40],
    # 1.0 (use all features) stands in for the old 'auto' alias, which recent
    # scikit-learn releases no longer accept.
    'max_features': [1.0, 'sqrt'],
    # An integer min_samples_split must be at least 2, so 1 is not a candidate.
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 5],
}

run(1, model1, param1)  # hyperparameter tuning with fold 1 held out

##### 2. Hyperparameter tuning of LGBMRegressor model

In [None]:
# Estimator to tune.
model2 = lgb.LGBMRegressor()

# Parameter grid of the LGBMRegressor model, one parameter per line.
param2 = {
    'num_leaves': [20, 40, 60, 80, 100],
    'min_child_samples': [5, 10, 15],
    'max_depth': [-1, 5, 10, 20],
    'learning_rate': [0.05, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.03],
    'n_iter': [2000, 2500, 3000, 3500, 4000, 4500],
}

# Hyperparameter tuning with fold 1 held out.
run(1, model2, param2)

##### 3. Hyperparameter tuning for XGBRegressor model

In [None]:
# Estimator to tune.
model3 = XGBRegressor()

# Parameter grid of the XGBRegressor model.
param3 = {
    "learning_rate": [0.1, 0.01, 0.05],
    "colsample_bytree": [0.6, 0.8, 0.9, 1.0],
    "subsample": [0.6, 0.8, 0.9, 1.0],
    "max_depth": [2, 3, 4],
    "n_estimators": [400, 500, 600, 700, 800],
    "reg_lambda": [1, 1.5, 2],
    "gamma": [0, 0.1, 0.3],
    "min_child_weight": [1, 3, 5, 7],
}

# This call has to start at column 0: the stray leading space it used to have
# makes the cell fail with an IndentationError.
run(1, model3, param3)

##### 4. Hyperparameter tuning for the CatBoostRegressor model

In [None]:
# Estimator to tune.
model4 = cat.CatBoostRegressor()

# Parameter grid of the CatBoostRegressor model, one parameter per line.
param4 = {
    'depth': [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
    'iterations': [1000, 1500, 2000, 2500, 3000],
    'learning_rate': [0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
    'l2_leaf_reg': [3, 1, 5, 10, 100],
}

# Hyperparameter tuning with fold 1 held out.
run(1, model4, param4)

#### Training of models and Prediction from test data

In [None]:
# Build the model inputs from the label-encoded frames (`train` / `test`), not
# from the raw TrainData / TestData: the raw frames still have un-encoded
# categorical columns, and TestData may also carry the -1 placeholder score.
non_feature_columns = ['row_id', 'engagement_score']

# dtype=float guarantees a purely numeric matrix even if an encoded column is
# still stored with object dtype.
df_train = train.drop(columns=non_feature_columns).to_numpy(dtype=float)

# errors='ignore': `test` may or may not still contain the placeholder
# engagement_score column at this point.
df_test = test.drop(columns=non_feature_columns, errors='ignore').to_numpy(dtype=float)

##### 1. RandomForestRegressor model

In [None]:
# Fit a RandomForestRegressor on the training matrix and predict the test rows.
model = RandomForestRegressor(
    n_estimators=800,
    max_depth=40,
    min_samples_leaf=1,
    min_samples_split=10,
)
# The label Series is called `Target` (capital T); `target` is not defined.
model.fit(df_train, Target)
y = model.predict(df_test)

# Fill the sample submission with the predictions and save it.
submission = pd.read_csv("/content/drive/MyDrive/Analytics_jobthakon_data/sample_submission_JPlpRcN.csv")
# Bracket assignment always writes the column; attribute assignment would
# silently create a plain attribute if the column were missing.
submission['engagement_score'] = y

submission.to_csv("/content/drive/MyDrive/Analytics_jobthakon_data/Submission8.csv", index=False)

##### 2. XGBoost regressor model

In [None]:
# Keyword arguments for the XGBRegressor that is trained on the full data.
paramXGb = dict(
    colsample_bytree=1.0,
    gamma=0,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=7,
    n_estimators=3850,
    reg_lambda=1.5,
    subsample=0.8,
)

In [None]:
# Fit an XGBRegressor with the chosen parameters and predict the test rows.
model = XGBRegressor(**paramXGb)
# The label Series is called `Target` (capital T); `target` is not defined.
model.fit(df_train, Target)
y = model.predict(df_test)

# Fill the sample submission with the predictions and save it.
submission = pd.read_csv("/content/drive/MyDrive/Analytics_jobthakon_data/sample_submission_JPlpRcN.csv")
# Bracket assignment always writes the column; attribute assignment would
# silently create a plain attribute if the column were missing.
submission['engagement_score'] = y

submission.to_csv("/content/drive/MyDrive/Analytics_jobthakon_data/Submission6.csv", index=False)

##### 3. LightGBM regressor model

In [None]:
# Keyword arguments for the LGBMRegressor that is trained on the full data.
paralgb = dict(
    learning_rate=0.05,
    max_depth=-1,
    min_child_samples=10,
    n_iter=2500,
    num_leaves=60,
    reg_alpha=0,
)

In [None]:
# Fit an LGBMRegressor with the chosen parameters and predict the test rows.
model = lgb.LGBMRegressor(**paralgb)
# The label Series is called `Target` (capital T); `target` is not defined.
model.fit(df_train, Target)
y = model.predict(df_test)

# Fill the sample submission with the predictions and save it.
submission = pd.read_csv("/content/drive/MyDrive/Analytics_jobthakon_data/sample_submission_JPlpRcN.csv")
# Bracket assignment always writes the column; attribute assignment would
# silently create a plain attribute if the column were missing.
submission['engagement_score'] = y

submission.to_csv("/content/drive/MyDrive/Analytics_jobthakon_data/Submission9.csv", index=False)

##### 4. CatBoost regressor model

In [None]:
# Keyword arguments for the CatBoostRegressor that is trained on the full data.
paracat = dict(
    depth=8,
    iterations=3500,
    l2_leaf_reg=5,
    learning_rate=0.1,
)

In [None]:
# Fit a CatBoostRegressor with the chosen parameters and predict the test rows.
model = cat.CatBoostRegressor(**paracat)
# The label Series is called `Target` (capital T); `target` is not defined.
model.fit(df_train, Target)
y = model.predict(df_test)

# Fill the sample submission with the predictions and save it.
submission = pd.read_csv("/content/drive/MyDrive/Analytics_jobthakon_data/sample_submission_JPlpRcN.csv")
# Bracket assignment always writes the column; attribute assignment would
# silently create a plain attribute if the column were missing.
submission['engagement_score'] = y

submission.to_csv("/content/drive/MyDrive/Analytics_jobthakon_data/Submission10.csv", index=False)

### Averaging ensemble of the models' predictions

In [None]:
# Read the six saved submission files that are averaged below.
# NOTE(review): Submission7.csv and Submission2.csv are not written by any cell
# visible in this notebook -- confirm both files exist before running this cell.
(
    submission1,
    submission2,
    submission3,
    submission4,
    submission5,
    submission6,
) = map(
    pd.read_csv,
    [
        "/content/drive/MyDrive/Analytics_jobthakon_data/Submission10.csv",
        "/content/drive/MyDrive/Analytics_jobthakon_data/Submission9.csv",
        "/content/drive/MyDrive/Analytics_jobthakon_data/Submission8.csv",
        "/content/drive/MyDrive/Analytics_jobthakon_data/Submission7.csv",
        "/content/drive/MyDrive/Analytics_jobthakon_data/Submission6.csv",
        "/content/drive/MyDrive/Analytics_jobthakon_data/Submission2.csv",
    ],
)

In [None]:
# Remove the row_id column from every loaded submission, in place.
for _frame in (
    submission1,
    submission2,
    submission3,
    submission4,
    submission5,
    submission6,
):
  _frame.drop(labels='row_id', inplace=True, axis=1)

In [None]:
# Add the six prediction arrays in their original order, then divide by six
# to get the plain average.
_prediction_arrays = [
    frame.values
    for frame in (
        submission1,
        submission2,
        submission3,
        submission4,
        submission5,
        submission6,
    )
]
submission = _prediction_arrays[0]
for _array in _prediction_arrays[1:]:
  submission = submission + _array
submission = submission / 6

In [None]:
# Put the averaged predictions into the sample submission's engagement_score
# column and save the result as Submission12.csv.
sample_submission = pd.read_csv(
    "/content/drive/MyDrive/Analytics_jobthakon_data/sample_submission_JPlpRcN.csv"
).assign(engagement_score=submission)
sample_submission.to_csv(
    "/content/drive/MyDrive/Analytics_jobthakon_data/Submission12.csv",
    index=False,
)