In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# CSV files stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Load the training set from the mounted Drive. The absolute path matches the
# mount point passed to drive.mount(), so the read no longer depends on the
# notebook's current working directory.
train = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/BF_train.csv')

In [4]:
# Load the test set from the mounted Drive. The absolute path matches the
# mount point passed to drive.mount(), so the read no longer depends on the
# notebook's current working directory.
test = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/BF_test.csv')

In [5]:
# Load the sample submission file from the mounted Drive; its 'Purchase'
# column is overwritten with model predictions further down. The absolute path
# matches the mount point passed to drive.mount(), so the read no longer
# depends on the notebook's current working directory.
submission = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/BF_sample_submission.csv')

In [6]:
# Tag every row with the frame it came from, then stack both frames so the
# preprocessing below is applied to train and test rows alike.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin
data = pd.concat([train, test], sort=True, ignore_index=True)

In [7]:
# Preview the first five rows of the combined train/test frame.
data.head(n=5)

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID,source
0,0-17,A,F,0,10,3,,,P00069042,8370.0,2,1000001,train
1,0-17,A,F,0,10,1,6.0,14.0,P00248942,15200.0,2,1000001,train
2,0-17,A,F,0,10,12,,,P00087842,1422.0,2,1000001,train
3,0-17,A,F,0,10,12,14.0,,P00085442,1057.0,2,1000001,train
4,55+,C,M,0,16,8,,,P00285442,7969.0,4+,1000002,train


In [8]:
# User_ID preprocessing: remove the constant offset, e.g. 1000002 -> 2.
USER_ID_OFFSET = 1000000
data['User_ID'] = data['User_ID'].sub(USER_ID_OFFSET)

In [9]:
# Product_ID preprocessing, e.g. P00069042 -> 69042.
# Remove the literal 'P00' marker and convert to a numeric dtype explicitly, so
# StandardScaler receives floats instead of relying on implicit
# string-to-number coercion of an object column.
product_number = data['Product_ID'].str.replace('P00', '', regex=False).astype(float)

# Standardise the numeric product id; ravel() turns the (n, 1) scaler output
# back into a 1-D array before it is stored as a column.
scaler = StandardScaler()
data['Product_ID'] = scaler.fit_transform(product_number.values.reshape(-1, 1)).ravel()

In [10]:
# Count missing values per column before imputation.
data.isna().sum()

Age                                0
City_Category                      0
Gender                             0
Marital_Status                     0
Occupation                         0
Product_Category_1                 0
Product_Category_2            245982
Product_Category_3            545809
Product_ID                         0
Purchase                      233599
Stay_In_Current_City_Years         0
User_ID                            0
source                             0
dtype: int64

In [11]:
# Impute the missing Product_Category_2 / Product_Category_3 values with each
# column's mean. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in recent pandas releases and does not update the frame when
# Copy-on-Write is enabled.
for category_col in ('Product_Category_2', 'Product_Category_3'):
    data[category_col] = data[category_col].fillna(data[category_col].mean())

In [12]:
# Re-check missing values per column after imputation.
data.isna().sum()

Age                                0
City_Category                      0
Gender                             0
Marital_Status                     0
Occupation                         0
Product_Category_1                 0
Product_Category_2                 0
Product_Category_3                 0
Product_ID                         0
Purchase                      233599
Stay_In_Current_City_Years         0
User_ID                            0
source                             0
dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes; the single encoder is
# refitted on every column in turn.
cat_col = ['Gender', 'City_Category', 'Age', 'Stay_In_Current_City_Years']
le = LabelEncoder()
for col in cat_col:
    encoded = le.fit_transform(data[col])
    data[col] = encoded

In [14]:
# One-hot encode the integer-coded categorical columns.
dummy_cols = ['Gender', 'City_Category', 'Age', 'Stay_In_Current_City_Years']
data = pd.get_dummies(data, columns=dummy_cols)

In [15]:
data.head()

Unnamed: 0,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,User_ID,source,Gender_0,Gender_1,City_Category_0,City_Category_1,City_Category_2,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4
0,0,10,3,9.844506,12.668605,-1.027277,8370.0,1,train,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0,10,1,6.0,14.0,0.728995,15200.0,1,train,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
2,0,10,12,9.844506,12.668605,-0.843742,1422.0,1,train,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0,10,12,14.0,12.668605,-0.867172,1057.0,1,train,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,0,16,8,9.844506,12.668605,1.085326,7969.0,2,train,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1


In [16]:
# Remove the dummy variable trap: drop the '_0' indicator of each one-hot group.
baseline_cols = [
    'Stay_In_Current_City_Years_0',
    'Age_0',
    'City_Category_0',
    'Gender_0',
]
data = data.drop(columns=baseline_cols)

In [17]:
# Standardise the numerical columns one at a time, refitting the scaler on
# each column.
num_col = ['Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
scaler = StandardScaler()
for col in num_col:
    column_2d = data[col].values.reshape(-1, 1)
    data[col] = scaler.fit_transform(column_2d)

In [18]:
# Preview the first five rows after encoding and scaling.
data.head(n=5)

Unnamed: 0,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,User_ID,source,Gender_1,City_Category_1,City_Category_2,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4
0,0,0.294486,-0.610134,4.213976e-16,7.815565e-16,-1.027277,8370.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0.294486,-1.125843,-0.9120158,0.5857835,0.728995,15200.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0.294486,1.710556,4.213976e-16,7.815565e-16,-0.843742,1422.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0.294486,1.710556,0.9857904,7.815565e-16,-0.867172,1057.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1.214421,0.679138,4.213976e-16,7.815565e-16,1.085326,7969.0,2,train,1,0,1,0,0,0,0,0,1,0,0,0,1


In [19]:
# Same preview as the previous cell (first five rows of the processed frame).
data.head(n=5)

Unnamed: 0,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,User_ID,source,Gender_1,City_Category_1,City_Category_2,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4
0,0,0.294486,-0.610134,4.213976e-16,7.815565e-16,-1.027277,8370.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0.294486,-1.125843,-0.9120158,0.5857835,0.728995,15200.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0.294486,1.710556,4.213976e-16,7.815565e-16,-0.843742,1422.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0.294486,1.710556,0.9857904,7.815565e-16,-0.867172,1057.0,1,train,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1.214421,0.679138,4.213976e-16,7.815565e-16,1.085326,7969.0,2,train,1,0,1,0,0,0,0,0,1,0,0,0,1


In [20]:
# Split the combined frame back into train and test rows using the 'source' tag.
source_tag = data['source']
train = data.loc[source_tag == 'train']
test = data.loc[source_tag == 'test']

In [21]:
# Features are every column except the target and the bookkeeping 'source' tag;
# the target is kept as a one-column frame.
non_feature_cols = ['Purchase', 'source']
X = train.drop(columns=non_feature_cols)
y = train[['Purchase']]

# Hold out 20% of the labelled rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, shuffle=True, random_state=42
)

In [22]:
# Test features: drop the (empty) target and the 'source' tag, as for X.
X_test = test.drop(columns=['Purchase', 'source'])

**Validating Decision Tree**

In [23]:
# Fit a decision tree on the training split and report RMSE on the validation
# split. The split criterion is left at its default (squared error): the
# explicit 'mse' spelling was renamed to 'squared_error' and later removed from
# scikit-learn, whereas the default selects the same criterion in every release.
dec_reg = DecisionTreeRegressor(min_samples_split=150, random_state=0)
dec_reg.fit(X_train, y_train)
y_pred = dec_reg.predict(X_val)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print (dec_reg, rmse)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=150,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best') 2759.6341924542494


In [24]:
# Refit the tree on all labelled rows (training + validation) to improve
# accuracy, then predict the test set. fit() returns the estimator itself.
predict = dec_reg.fit(X, y).predict(X_test)

In [None]:
from google.colab import files

# Store the current predictions in the submission frame, write it to CSV
# without the index, and trigger a browser download of that file.
dt_submission_csv = 'bfs_DT.csv'
submission['Purchase'] = predict
submission.to_csv(dt_submission_csv, index=False)
files.download(dt_submission_csv)

**Validating the XGB model**

In [26]:
# Fit a gradient-boosted regressor on the training split and report RMSE on
# the validation split. The RNG seed is passed as random_state, the
# scikit-learn-style parameter, instead of the deprecated seed= keyword.
xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, random_state=0)

xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print (xgb_reg, rmse)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=1.0, max_delta_step=0,
             max_depth=6, min_child_weight=40, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=None,
             subsample=1, verbosity=1) 2591.197359480536


In [27]:
# Refit the XGB model on all labelled rows (training + validation) to improve
# accuracy, then predict the test set. fit() returns the estimator itself.
predict = xgb_reg.fit(X, y).predict(X_test)



In [28]:
from google.colab import files

# Store the current predictions in the submission frame, write it to CSV
# without the index, and trigger a browser download of that file.
xgb_submission_csv = 'bfs_XGB.csv'
submission['Purchase'] = predict
submission.to_csv(xgb_submission_csv, index=False)
files.download(xgb_submission_csv)
#Leaderboard Rank: 595
#Leaderboard Score: 2583.03

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>