In [None]:
import numpy as np
import pandas as pd
import re

import catboost
import lightgbm as lgb
from xgboost.sklearn import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.decomposition import PCA

## Load the train and test dataset

In [None]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('Data_Train.csv', 'Data_Test.csv')
)
# Preview the first rows of the training frame.
train_data.head()

## Check the shape of the training dataset

In [None]:
# (rows, columns) of the raw training frame.
train_data.shape

## Train a label encoder for the column Genre. This label encoder will then be used to transform the test set. 

In [None]:
# Cast Genre to a categorical dtype and learn one integer code per genre from
# the training rows only.
# NOTE(review): the test set is later passed through le_genre.transform; a genre
# that never occurs in the training data would presumably be rejected there - confirm.
train_data['Genre'] = train_data['Genre'].astype('category')
le_genre = preprocessing.LabelEncoder().fit(train_data['Genre'])
train_data['Genre'] = le_genre.transform(train_data['Genre'])
train_data.head()

## Fit a label encoder for the column Name on the names from both the train and test sets, so that the same encoder can later transform the Name column of the test set.

In [None]:
# Cast Name to a categorical dtype, then fit the encoder on the training names
# followed by the test names so that every name in either file gets a code.
train_data['Name'] = train_data['Name'].astype('category')
le_name = preprocessing.LabelEncoder().fit([*train_data['Name'], *test_data['Name']])
train_data['Name'] = le_name.transform(train_data['Name'])
train_data.head()

## Extract the Year and month from the timestamp column

In [None]:
# Parse the Timestamp column once and derive the calendar year and month from it.
train_timestamps = pd.to_datetime(train_data.Timestamp)
train_data['Year'] = train_timestamps.apply(lambda ts: ts.year)
train_data['Month'] = train_timestamps.apply(lambda ts: ts.month)

In [None]:
# Preview the frame with the new Year and Month columns.
train_data.head()

## Define a function process_decimal that strips commas and other non-numeric characters and expands the suffixes K (shorthand for thousand) and M (shorthand for million) into the corresponding decimal values.

## Applied the above function on Popularity and Likes columns.

In [None]:
def process_decimal(x):
    """Convert a human-formatted count such as '1,234', '1.5K' or '2M' to a number.

    Every character other than digits and '.' is removed from the text. A
    trailing 'k' or 'm' (either case) multiplies the parsed value by 1,000 or
    1,000,000 respectively.

    Parameters
    ----------
    x : str or number or None
        The raw cell value. Strings are parsed; numbers are returned unchanged.

    Returns
    -------
    number
        The parsed value, or NaN when the cell is missing (None/NaN) or
        contains no digits at all.

    Raises
    ------
    ValueError
        If the remaining digits do not form a valid number (e.g. '1.2.3').
    """
    if not isinstance(x, str):
        # Missing cells arrive as None/NaN and already-numeric cells as numbers;
        # indexing either with x[-1] would raise.
        return float("nan") if pd.isna(x) else x

    # Surrounding whitespace would otherwise hide the K/M suffix ('1.5K ').
    text = x.strip()
    digits = re.sub("[^0-9.]", "", text)
    if not digits:
        # Empty string, or text without a single digit: nothing to parse.
        return float("nan")

    multiplier = {"k": 1000, "m": 1000000}.get(text[-1].lower(), 1)
    return pd.to_numeric(digits) * multiplier
    
# Convert the text-formatted Popularity and Likes columns to numeric values.
for column in ('Popularity', 'Likes'):
    train_data[column] = pd.to_numeric(train_data[column].apply(process_decimal))

In [None]:
# Preview the frame after Popularity and Likes were converted to numbers.
train_data.head()

## Calculated the mean and variance of Views(Target Variable) with respect to each Genre

In [None]:
# Per-genre variance and mean of the target (Views), indexed by Genre.
stats_genre = pd.DataFrame()
genre_views = train_data.groupby(['Genre']).Views
stats_genre['genre_var'] = genre_views.var()
stats_genre['genre_mean'] = genre_views.mean()
# Replace missing statistics with the column average. The result is assigned
# back to the frame: calling fillna(inplace=True) on a selected column is
# chained assignment, which does not update the frame when pandas
# Copy-on-Write is active.
stats_genre = stats_genre.fillna(stats_genre.mean())

## Calculated the mean and variance of Views(Target Variable) with respect to each Name

In [None]:
# Per-name mean and variance of the target (Views), indexed by Name.
stats_name = pd.DataFrame()
name_views = train_data.groupby(['Name']).Views
stats_name['name_mean'] = name_views.mean()
stats_name['name_var'] = name_views.var()
# Replace missing statistics with the column average. The result is assigned
# back to the frame: calling fillna(inplace=True) on a selected column is
# chained assignment, which does not update the frame when pandas
# Copy-on-Write is active.
stats_name = stats_name.fillna(stats_name.mean())

## Calculated the mean and variance of Views(Target Variable) with respect to each Year

In [None]:
# Per-year mean and variance of the target (Views), indexed by Year.
stats_year = pd.DataFrame()
year_views = train_data.groupby(['Year']).Views
stats_year['year_mean'] = year_views.mean()
stats_year['year_var'] = year_views.var()
# Replace missing statistics with the column average. The result is assigned
# back to the frame: calling fillna(inplace=True) on a selected column is
# chained assignment, which does not update the frame when pandas
# Copy-on-Write is active.
stats_year = stats_year.fillna(stats_year.mean())

In [None]:
# Sanity check: first rows, shape, and total count of remaining NaN cells.
stats_genre.head(), stats_genre.shape, stats_genre.isna().sum().sum()

In [None]:
# Sanity check: first rows, shape, and total count of remaining NaN cells.
stats_name.head(), stats_name.shape, stats_name.isna().sum().sum()

In [None]:
# Sanity check: first rows, shape, and total count of remaining NaN cells.
stats_year.head(), stats_year.shape, stats_year.isna().sum().sum()

## Merge the calculated stats columns to the main dataset

In [None]:
# Attach the per-genre, per-name and per-year target statistics to every
# training row; left joins keep all training rows.
join = (
    train_data
    .merge(stats_genre, on='Genre', how='left')
    .merge(stats_name, on='Name', how='left')
    .merge(stats_year, on='Year', how='left')
)

## Check for the nan cells

In [None]:
# Shape of the merged frame and the total number of NaN cells it contains.
join.shape, join.isna().sum().sum()

## Drop the columns which are not required

In [None]:
# Identifier, country, raw timestamp and song title are not used as model inputs.
train_data = join.drop(columns=['Unique_ID', 'Country', 'Timestamp', 'Song_Name'])

## Separate the dependent column from the independent columns

In [None]:
# Target: the Views column.
Y = train_data.Views
# Features: every remaining column. The axis is named through `columns=`
# because pandas 2.0 no longer accepts it as a positional argument to drop().
X = train_data.drop(columns=['Views'])

In [None]:
# Preview the feature matrix.
X.head()

## Added two more composite features

In [None]:
# feature_1: square root of Likes x Comments.
# feature_2: square root of the per-name mean Views x Comments.
X = X.assign(
    feature_1=np.sqrt(X['Likes'] * X['Comments']),
    feature_2=np.sqrt(X['name_mean'] * X['Comments']),
)

In [None]:
# Preview the feature matrix with the two composite columns.
X.head()

## Apply PCA on the dataset

In [None]:
# Fit a 3-component PCA on the feature matrix and append the projected
# components as extra columns. The new columns take the integer labels of the
# intermediate DataFrame; rows are aligned on the index.
pca = PCA(n_components=3)
pca_X = pca.fit_transform(X)
print(pca_X.shape, X.shape)
X = pd.merge(X, pd.DataFrame(pca_X), how='left', left_index=True, right_index=True)
print(X.shape)

## Check the explained variance ratio in order to decide how many components we need.

In [None]:
# Fraction of the total variance captured by each of the fitted components.
pca.explained_variance_ratio_

In [None]:
# Preview the feature matrix with the appended PCA columns.
X.head()

# Train the model

# LightGBM

In [None]:
# Gradient-boosted trees (GBDT) trained through the native LightGBM API, with
# RMSE monitored on a 10% hold-out split.
# NOTE(review): `verbose_eval` may not be accepted by every LightGBM release -
# confirm against the installed version.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric=['rmse'],
    max_depth=16,
    learning_rate=0.01,
    verbose=0,
    early_stopping_round=1000,
    num_leaves=4096,
    max_bin=2048,
)
n_estimators = 3000

# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.10, random_state=1)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]

model = lgb.train(
    params,
    d_train,
    num_boost_round=n_estimators,
    valid_sets=watchlist,
    verbose_eval=1,
)

# Root-mean-squared error on the hold-out rows.
preds = model.predict(x_valid)
residuals = np.asarray(preds) - np.asarray(y_valid)
print("Val Loss:", np.sqrt(np.mean(np.square(residuals))))
model_2 = model

# Random Forest Regressor

In [None]:
# Hyper-parameter tuning: the grid that was searched originally.
# param_grid = {
#           'n_estimators':[25, 50],
#           'criterion':['mae', 'mse'],
#           'max_depth':[8],
#           'max_leaf_nodes':[50],
#           'random_state':[1]
#             }

# Best configuration found by the search. 'criterion' is left at the estimator
# default (squared error): the winning value was spelled 'mse', a name that
# scikit-learn 1.0 renamed to 'squared_error' and that 1.2 no longer accepts,
# so passing it explicitly makes the cell fail on current releases.
param_grid = {'n_estimators': [25], 'random_state': [1]}

model = RandomForestRegressor()
# 5-fold cross-validation scored with R^2 and negative MSE; the final estimator
# is refit on all of X using the negative-MSE ranking.
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring=['r2', 'neg_mean_squared_error'],
                    verbose=1,
                    n_jobs=4,
                    refit='neg_mean_squared_error',
                    cv=5
                   )
grid_result = grid.fit(X, Y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
model_3 = grid.best_estimator_
# Recorded from the original run:
# Best Score:  -500903439391.60547
# Best Params:  {'criterion': 'mse', 'n_estimators': 25, 'random_state': 1}

# KNN Regressor

In [None]:
# Hyper-parameter tuning: the grid that was searched originally.
# param_grid = {
#                 'n_neighbors':[10, 20, 30, 50, 100],
#                 'algorithm':['auto', 'ball_tree', 'kd_tree'],
#                 'p':[1, 2, 3, 4],
#                 'weights':['uniform','distance'],
#                 'n_jobs':[4]
#              }

# Single-point grid holding the selected configuration.
param_grid = dict(
    algorithm=['auto'],
    n_jobs=[4],
    n_neighbors=[10],
    p=[2],
    weights=['distance'],
)

model = KNeighborsRegressor()
# 5-fold cross-validation scored with R^2 and negative MSE; the final estimator
# is refit on all of X using the negative-MSE ranking.
grid = GridSearchCV(
    model,
    param_grid,
    scoring=['r2', 'neg_mean_squared_error'],
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=4,
    verbose=1,
)
grid_result = grid.fit(X, Y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
model_4 = grid.best_estimator_
# Recorded from the original run:
# Best Score:  -9453498861626.154
# Best Params:  {'algorithm': 'auto', 'n_jobs': 4, 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}

# GOSS

In [None]:
# LightGBM with boosting_type 'goss', trained through the native API, with
# RMSE monitored on a 10% hold-out split.
# NOTE(review): `verbose_eval` may not be accepted by every LightGBM release -
# confirm against the installed version.
params = dict(
    boosting_type='goss',
    objective='regression',
    metric=['rmse'],
    max_depth=16,
    learning_rate=0.01,
    verbose=0,
    early_stopping_round=1000,
    num_leaves=2048,
    max_bin=2048,
)
n_estimators = 2000

# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.10, random_state=1)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]

model = lgb.train(
    params,
    d_train,
    num_boost_round=n_estimators,
    valid_sets=watchlist,
    verbose_eval=1,
)

# Root-mean-squared error on the hold-out rows.
preds = model.predict(x_valid)
residuals = np.asarray(preds) - np.asarray(y_valid)
print("Val Loss:", np.sqrt(np.mean(np.square(residuals))))
model_5 = model

# CatBoost Regressor

In [None]:
# Hyper-parameter tuning: the grid that was searched originally.
# param_grid = {'depth':[13,10],
#           'iterations':[1000],
#           'learning_rate':[0.01, 0.1],
#           'l2_leaf_reg':[5,10],
#           'border_count':[ 300],
#           'ctr_border_count':[ 300],
#           'random_state':[1]}

# Single-point grid holding the selected configuration.
param_grid = dict(
    border_count=[200],
    ctr_border_count=[200],
    depth=[12],
    iterations=[1000],
    l2_leaf_reg=[5],
    learning_rate=[0.01],
    random_state=[1],
)

# Positions of the columns of X that CatBoost should treat as categorical.
# NOTE(review): these are positional indices, so they depend on the column
# order of X - verify which columns 0, 1, 6 and 7 actually are.
cat_feats = [0, 1, 6, 7]

model = catboost.CatBoostRegressor(cat_features=cat_feats)
# 5-fold cross-validation scored with R^2 and negative MSE; the final estimator
# is refit on all of X using the negative-MSE ranking.
grid = GridSearchCV(
    model,
    param_grid,
    scoring=['r2', 'neg_mean_squared_error'],
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=4,
    verbose=1,
)
grid_result = grid.fit(X, Y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
model_1 = grid.best_estimator_
# Recorded from the original run:
# Best Score:  -2186169917327.6897
# Best Params:  {'border_count': 200, 'ctr_border_count': 200, 'depth': 12, 'iterations': 1000, 'l2_leaf_reg': 5, 'learning_rate': 0.01, 'random_state': 1}

# XG Boost Regressor

In [None]:
# Hyper-parameter tuning: the grid that was searched originally.
# param_grid = {'min_child_weight':[1, 2, 4,5], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,11)],
# 'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4,5,8,11], 'random_state':[1]}

# Single-point grid holding the selected configuration.
param_grid={'colsample_bytree': [0.7], 'gamma': [0.3], 'max_depth': [11], 'min_child_weight': [4], 'random_state': [1], 'subsample': [0.8]}

model = XGBRegressor()
# 5-fold cross-validation scored with R^2 and negative MSE; the final estimator
# is refit on all of X using the negative-MSE ranking.
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring=['r2', 'neg_mean_squared_error'],
                    verbose=1,
                    n_jobs=4,
                    refit='neg_mean_squared_error',
                    cv=5
                   )
grid_result = grid.fit(X, Y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
model_6 = grid.best_estimator_
# best_estimator_ has been refit on all of X, which includes the x_valid rows
# split off in an earlier cell, so scoring it on x_valid would report a
# training error rather than a validation loss. Report the cross-validated
# RMSE (square root of the mean held-out-fold MSE) instead; this also removes
# the dependency on x_valid/y_valid left over from another cell.
print("CV RMSE:", np.sqrt(-grid_result.best_score_))
# Recorded from the original run:
# Best Score:  -359030292377.03534
# Best Params:  {'colsample_bytree': 0.7, 'gamma': 0.3, 'max_depth': 11, 'min_child_weight': 4, 'random_state': 1, 'subsample': 0.8}

# Test Data 

In [None]:
# Preview the raw test frame.
test_data.head()

In [None]:
# (rows, columns) of the raw test frame.
test_data.shape

## Transform the Genre column

In [None]:
# Encode the test genres with the encoder that was fitted on the training genres.
test_data['Genre'] = le_genre.transform(test_data['Genre'].astype('category'))
test_data.head()

## Transform the Name column

In [None]:
# Encode the test names with the encoder that was fitted on train and test names.
test_data['Name'] = le_name.transform(test_data['Name'].astype('category'))
test_data.head()

In [None]:
# Preview the test frame after Genre and Name were label-encoded.
test_data.head()

## Extract the Year & Month from timestamp column

In [None]:
# Parse the Timestamp column once and derive the calendar year and month from it.
test_timestamps = pd.to_datetime(test_data.Timestamp)
test_data['Year'] = test_timestamps.apply(lambda ts: ts.year)
test_data['Month'] = test_timestamps.apply(lambda ts: ts.month)

In [None]:
# Preview the test frame with the new Year and Month columns.
test_data.head()

## Process the columns Likes & Popularity 

In [None]:
# Convert the text-formatted Popularity and Likes columns to numeric values.
for column in ('Popularity', 'Likes'):
    test_data[column] = pd.to_numeric(test_data[column].apply(process_decimal))

In [None]:
# Preview the test frame after Popularity and Likes were converted to numbers.
test_data.head()

## Append the stats columns calculated with the training data

In [None]:
# Attach the target statistics computed on the training data to every test
# row; left joins keep all test rows, and keys without a training statistic
# get NaN.
test_data = (
    test_data
    .merge(stats_genre, on='Genre', how='left')
    .merge(stats_name, on='Name', how='left')
    .merge(stats_year, on='Year', how='left')
)

## Remove the columns which aren't required

In [None]:
# Keep the identifiers for the submission file before dropping the columns
# that are not used as model inputs.
test_ids = test_data['Unique_ID']
test_data = test_data.drop(columns=['Unique_ID', 'Country', 'Timestamp', 'Song_Name'])
test_data.head()

## Replace the nan with 0

In [None]:
# Report how many cells are missing, then replace every one of them with 0.
# NOTE(review): this also zero-fills the merged statistic columns for keys that
# have no training statistic (e.g. names seen only in the test file), whereas
# missing training statistics were filled with the column mean - confirm that
# 0 is the intended value here.
print(test_data.isna().sum().sum())
test_data = test_data.fillna(0)

In [None]:
# Total number of NaN cells left in the test frame after the fill.
test_data.isna().sum().sum()

## Add the two new compound columns

In [None]:
# Same two composite features as built for the training matrix.
test_data = test_data.assign(
    feature_1=np.sqrt(test_data['Likes'] * test_data['Comments']),
    feature_2=np.sqrt(test_data['name_mean'] * test_data['Comments']),
)

In [None]:
# Preview the test frame with the two composite columns.
test_data.head()

## Apply PCA on the test set

In [None]:
# Project the test rows with the PCA fitted on the training features (no
# re-fitting) and append the components as extra columns, aligned on the index.
pca_X = pca.transform(test_data)
print(pca_X.shape, test_data.shape)
test_data = pd.merge(test_data, pd.DataFrame(pca_X), how='left', left_index=True, right_index=True)
print(test_data.shape)

In [None]:
# Preview the test frame with the appended PCA columns.
test_data.head()

In [None]:
# Column dtypes of the final test frame.
test_data.dtypes

In [None]:
# (rows, columns) of the final test frame.
test_data.shape

## Predict the Views for the test set

In [None]:
# Predictions from the other fitted models, kept for reference but disabled:
# y_test_1 = model_1.predict(test_data)
# y_test_2 = model_2.predict(test_data)
# y_test_3 = model_3.predict(test_data)
# y_test_4 = model_4.predict(test_data)
# y_test_5 = model_5.predict(test_data)
# Only model_6 (the estimator selected in the XGBoost cell) is used for the submission.
y_test_6 = model_6.predict(test_data)

## Generate Excel file to submit the results

In [None]:
# Build the submission table: one row per test Unique_ID with its predicted Views.
results = pd.DataFrame({'Unique_ID': test_ids})
results['Views'] = y_test_6
# Negative predictions are replaced by their absolute value.
results['Views'] = results['Views'].apply(lambda views: np.abs(views))
results.to_excel('results.xlsx', index=False)

# Rank: 34 
# LeaderBoard Score: 674507.59773