In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
from matplotlib import pyplot as plt
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline 

from sklearn.model_selection import train_test_split 
from catboost import CatBoostRegressor , Pool,metrics ,cv


### 1. Load our Dataset 

In [2]:
# Read the three CSV inputs (train, test, sub) from the working directory,
# in that order.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sub.csv')
)

In [3]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
# Preview the first five rows of `sub` (Purchase, User_ID, Product_ID).
# NOTE(review): presumably the submission template — confirm.
sub.head()

Unnamed: 0,Purchase,User_ID,Product_ID
0,100,1000004,P00128942
1,100,1000009,P00113442
2,100,1000010,P00288442
3,100,1000010,P00145342
4,100,1000011,P00053842


In [5]:
# Preview the first five rows of the test frame; it has the same feature
# columns as train but no Purchase column.
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [6]:
# Inspect the dtype of every column in train; the 'object' columns hold
# strings such as Product_ID, Gender and Age.
train.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [7]:
# Count the missing values in each column of the training frame.
train.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [8]:
# Total number of cells in train (rows x columns), not the number of rows.
train.size

6600816

In [9]:
# Product_Category_2 and Product_Category_3 are the only columns with missing
# values (see the null counts above), so they are imputed with the column mean.
#
# The fill values are computed from the TRAINING frame only and then applied to
# both frames. Imputing test with its own means would encode "missing" with a
# different number at inference time than the one the model was trained on.
#
# NOTE(review): these columns look like category codes, so the mean is a value
# that does not correspond to any real category. A dedicated sentinel for
# "missing" may be a better choice — confirm before changing, since it alters
# the trained model.
impute_cols = ['Product_Category_2', 'Product_Category_3']
fill_values = train[impute_cols].mean()  # Series indexed by column name

# fillna with a Series fills each column with the value under its own label.
train[impute_cols] = train[impute_cols].fillna(fill_values)
test[impute_cols] = test[impute_cols].fillna(fill_values)

# Confirm that no missing values remain in either frame.
print(train.isnull().sum())
print(test.isnull().sum())

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
dtype: int64


In [10]:
# Re-check the first rows after imputation: the formerly missing
# Product_Category_2 / Product_Category_3 entries now hold the column means.
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,9.842329,12.668243,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,9.842329,12.668243,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,12.668243,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,9.842329,12.668243,7969


In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns in train.
train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,550068.0,550068.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,4.207852,2.27183,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,8.0,12.668243,5823.0
50%,1003077.0,7.0,0.0,5.0,9.842329,12.668243,8047.0
75%,1004478.0,14.0,1.0,8.0,14.0,12.668243,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [12]:
# Summary of the object-dtype columns: count, number of unique values, most
# frequent value and its frequency.
train.describe(include=[object])

Unnamed: 0,Product_ID,Gender,Age,City_Category,Stay_In_Current_City_Years
count,550068,550068,550068,550068,550068
unique,3631,2,7,3,5
top,P00265242,M,26-35,B,1
freq,1880,414259,219587,231173,193821


In [13]:
# Cast User_ID to object dtype in both frames so that it is picked up by the
# object-dtype selection of categorical features further down, instead of
# being treated as a number.
for frame in (train, test):
    frame['User_ID'] = frame['User_ID'].astype('object')

In [14]:
# Split the training frame into the target vector (Purchase) and the feature
# matrix (every other column).
y = train['Purchase']
X = train.drop(columns='Purchase')

In [15]:
# Positional indices of the object-dtype columns of X; these are later passed
# to CatBoost as the categorical features.
is_object_column = (X.dtypes == 'object').to_numpy()
categorical_feature_indices = np.flatnonzero(is_object_column)
print(categorical_feature_indices)

[0 1 2 3 5 6]


In [16]:
# Hold out 20% of the labelled rows as a validation set; the fixed
# random_state keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=23,
)

# X_test is just another name for the test frame (same object, not a copy).
X_test = test

In [17]:
# Model training, 1st iteration: a CatBoost regressor optimised for RMSE,
# fitted on the training split and evaluated on the validation split.
model_params = {
    'loss_function': 'RMSE',
    'random_seed': 45,
    'logging_level': 'Silent',
}
model = CatBoostRegressor(**model_params)

# plot=True renders the interactive training-curve widget in the notebook.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_feature_indices,
    eval_set=(X_validation, y_validation),
    plot=True,
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1ba804cb8b0>

In [18]:
# Take the parameters of the model built above so the cross-validation below
# runs with the same settings.
cv_params = model.get_params()
print(cv_params)

{'loss_function': 'RMSE', 'random_seed': 45, 'logging_level': 'Silent'}


In [19]:
# 3-fold cross-validation over the full labelled data (X, y), using the same
# parameters as the fitted model and 1500 boosting iterations.
full_pool = Pool(X, y, cat_features=categorical_feature_indices)

cv_data = cv(
    full_pool,
    cv_params,
    nfold=3,
    iterations=1500,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [20]:
# First rows of the cross-validation results: one row per iteration with the
# mean and std of the train/test RMSE.
cv_data.head()

Unnamed: 0,iterations,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,10266.906946,3.215959,10266.920275,1.681774
1,1,10000.969402,2.919199,10000.997279,1.679937
2,2,9745.007799,2.661842,9745.03545,1.677935
3,3,9490.287503,2.556411,9490.314843,1.658702
4,4,9245.314458,2.614347,9245.341574,1.625901


In [None]:

# Report the best cross-validation result.
# RMSE is an error metric, so the best iteration is the one with the LOWEST
# mean test RMSE (argmin, not argmax). The column name is 'test-RMSE-mean';
# the previous 'test-RMSE-MEAN' spelling raised a KeyError.
best_step = int(np.argmin(cv_data['test-RMSE-mean'].to_numpy()))

print(
    'Best validation RMSE score is {:.2f} ± {:.2f} on step {}'.format(
        cv_data['test-RMSE-mean'].iloc[best_step],
        cv_data['test-RMSE-std'].iloc[best_step],
        best_step,
    )
)