## Loading required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from mlxtend.regressor import StackingCVRegressor

  import pandas.util.testing as tm


In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user, then build a PyDrive client that
# reuses the application-default credentials obtained by that login.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced again in the visible cells - confirm it is still needed.
drive = GoogleDrive(gauth)

## Reading input data

In [None]:
# Load both competition files from the Colab working directory in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Train.csv", "/content/Test.csv")
)

In [None]:
# The row identifier carries no predictive signal, so remove it from both frames.
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])

In [None]:
# Per-column cardinality of both splits, side by side and labelled.
# (The previous `a, print(b)` form printed the test counts first, then showed
# a tuple ending in `None`, which made it easy to confuse the two splits.)
# A NaN in the `test` column marks a column that only exists in `train`.
pd.DataFrame({'train': train.nunique(), 'test': test.nunique()})

CONSOLE             17
YEAR                25
CATEGORY            12
PUBLISHER          136
RATING               5
CRITICS_POINTS     995
USER_POINTS       1110
dtype: int64


(CONSOLE              17
 YEAR                 23
 CATEGORY             12
 PUBLISHER           204
 RATING                6
 CRITICS_POINTS     1683
 USER_POINTS        2187
 SalesInMillions    3506
 dtype: int64, None)

### EDA and feature engineering

In [None]:
# Pairwise plots of the two columns that are dropped later (RATING, PUBLISHER)
# against the target.
# NOTE(review): RATING and PUBLISHER hold string labels, not numbers - confirm
# that pairplot renders them meaningfully with the installed seaborn version.
sns.pairplot(train, vars=["RATING","PUBLISHER", "SalesInMillions"])

In [None]:
# Distinct RATING labels in each split, keyed by split name so the two sets
# cannot be mixed up (the previous `a, print(b)` form printed the test set
# first and left a stray `None` in the displayed tuple).
{'train': set(train['RATING']), 'test': set(test['RATING'])}

{'E10+', 'M', 'AO', 'E', 'T'}


({'E', 'E10+', 'K-A', 'M', 'RP', 'T'}, None)

In [None]:
# Stack train on top of test so both receive identical feature encoding.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with
# `ignore_index=True` yields the same fresh 0..n-1 index that the former
# append + reset_index + drop('index') sequence produced.
# Test rows have no SalesInMillions, so that column is NaN for them.
data = pd.concat([train, test], ignore_index=True, sort=False)

# RATING and PUBLISHER are excluded from the feature set.
# NOTE(review): presumably because their label sets differ between the two
# files (see the RATING sets printed above) - confirm the rationale.
data = data.drop(columns=['RATING', 'PUBLISHER'])

In [None]:
# Distinct genre labels across the combined train + test frame.
set(data['CATEGORY'].unique())

{'action',
 'adventure',
 'fighting',
 'misc',
 'platform',
 'puzzle',
 'racing',
 'role-playing',
 'shooter',
 'simulation',
 'sports',
 'strategy'}

In [None]:
# Genre label -> string code "1".."12", numbered by position in this list.
category = {
    genre: str(code)
    for code, genre in enumerate(
        [
            'action',
            'adventure',
            'fighting',
            'misc',
            'platform',
            'puzzle',
            'racing',
            'role-playing',
            'shooter',
            'simulation',
            'sports',
            'strategy',
        ],
        start=1,
    )
}

# Replace each genre label with its string code; labels missing from
# `category` become NaN (Series.map semantics).
data = data.assign(CATEGORY=data['CATEGORY'].map(category))

In [None]:
# One-hot encode the two categorical columns. The dtype is pinned to uint8
# because pandas >= 2.0 defaults to bool indicator columns, and a frame that
# mixes bool with numeric columns converts to an object array under np.array().
cat = pd.get_dummies(data[['CATEGORY', 'CONSOLE']], dtype=np.uint8)

# Everything except CONSOLE. The original code assigned `num` twice and only
# this second assignment ever took effect, so the dead first one is removed.
# `num` therefore still carries the raw CATEGORY column; the next cell drops
# it from `data`.
num = data.drop(['CONSOLE'], axis=1)

# Indicator columns first, then the remaining columns, aligned on the row index.
data = pd.concat([cat, num], axis=1)

In [None]:
#data.head()
# Remove the raw CATEGORY codes; their one-hot CATEGORY_* columns stay in place.
data = data.drop(columns=['CATEGORY'])

### Model building


In [None]:
# The combined frame holds the train rows first, followed by the test rows,
# so a positional split at len(train) recovers the two parts.
n_train = len(train)
tr = data.iloc[:n_train]
te = data.iloc[n_train:]

# Separate the features from the regression target.
y = tr['SalesInMillions']
x = tr.drop(columns='SalesInMillions')

In [None]:
# Base learners that feed the stacked ensemble.

lgb = LGBMRegressor(objective='regression', num_leaves=100, learning_rate=0.1,
                    n_estimators=1200, random_state=111)

# NOTE(review): `seed=27` and `random_state=10` are both supplied; which one
# takes effect depends on the installed xgboost version - confirm and keep one.
# NOTE(review): `verbose` does not appear to be a documented XGBRegressor
# argument (the documented one is `verbosity`) - TODO confirm.
xgb = XGBRegressor(learning_rate=0.1,
                   n_estimators=1700, max_depth=12, min_child_weight=1, gamma=0, reg_alpha=2e-5,
                   subsample=0.8, colsample_bytree=0.8,
                   nthread=4, scale_pos_weight=1, seed=27, verbose=True, random_state=10)

# `criterion='mse'` and `max_features='auto'` were removed from scikit-learn
# (renamed to 'squared_error' and 1.0). Both are the RandomForestRegressor
# defaults, so they are omitted: same model, and it runs on old and new
# scikit-learn releases alike.
rf = RandomForestRegressor(n_estimators=500, random_state=111, max_depth=100)

model = CatBoostRegressor(iterations=100, depth=10, learning_rate=0.1, loss_function='RMSE')

In [None]:
# Stack the four base learners; `xgb` is also used as the meta-regressor, and
# `use_features_in_secondary=True` feeds it the original features alongside
# the base-model predictions.
# `random_state` is fixed so the cross-validation folds used to build the
# meta-features are the same on every run.
avg = StackingCVRegressor(
    regressors=(lgb, xgb, rf, model),
    meta_regressor=xgb,
    use_features_in_secondary=True,
    random_state=111,
)

In [None]:
# Convert the training features and target to plain NumPy arrays before fitting.
X, Y = np.array(x), np.array(y)

In [None]:
# Fit the stacked ensemble on the full training matrix; the fitted estimator's
# repr is the cell output.
avg.fit(X,Y)



StackingCVRegressor(cv=5,
                    meta_regressor=XGBRegressor(base_score=0.5,
                                                booster='gbtree',
                                                colsample_bylevel=1,
                                                colsample_bynode=1,
                                                colsample_bytree=1, gamma=0,
                                                importance_type='gain',
                                                learning_rate=0.05,
                                                max_delta_step=0, max_depth=3,
                                                min_child_weight=1,
                                                missing=None, n_estimators=1500,
                                                n_jobs=1, nthread=None,
                                                objective='reg:linear',
                                                random_state=111, reg_alpha=0,
                                       

In [None]:
# The test rows only carry a NaN placeholder target; strip it before predicting.
te_features = te.drop(columns='SalesInMillions')
Predict = avg.predict(np.array(te_features))

In [None]:
# Wrap the predictions in a single-column frame named after the target.
Submission = pd.DataFrame(data=Predict, columns=['SalesInMillions'])

In [None]:
# Write the predictions to an Excel workbook in the Colab working directory;
# index=False keeps the DataFrame index out of the sheet.
Submission.to_excel('/content/sub1.xlsx',index=False)

##### For this problem statement and the given data, the CatBoost model on its own performed well and gave better results than the other models stacked together.

In [None]:
# Preview the first five predictions.
Submission.head(n=5)

Unnamed: 0,SalesInMillions
0,1.392137
1,2.91767
2,2.629541
3,1.79435
4,1.909511
