In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, mean_squared_error

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
#preprocess data
def categorical_to_numerical(train, test):    
    train_ = train.copy()
    test_ = test.copy()  
    encoders_train = {}
    encoders_test = {} 
    for column in train.columns[:-1]:
        if train[column].dtype == np.object:
            label = LabelEncoder()
            encoders_train[column] = label.fit_transform(train[column])
            train_[column] = label.transform(train[column])
            
            label = LabelEncoder()
            encoders_test[column] = label.fit_transform(test[column])
            test_[column] = label.transform(test[column])
    train_.fillna(0,inplace=True)
    test_.fillna(0,inplace=True)   
    return train_, test_, encoders_train, encoders_test

In [5]:
#calculate rmse
def RMSE(true_y, preds):
    return np.sqrt(mean_squared_error(true_y, preds))
rmse_scorer = make_scorer(RMSE, greater_is_better=False) #create own scorer

In [6]:
# get file with predictions
def get_file(test,predictions, file_name):
    sample = pd.read_csv('Sample_Submission_Tm9Lura.csv')
    sample['User_ID'] = test.User_ID.values
    sample['Product_ID'] = test.Product_ID.values
    sample['Purchase'] = predictions
    sample.to_csv(file_name, index=False)

In [7]:
# read train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_, test_, encoder_train, encoder_test = categorical_to_numerical(train, test)
X_train = train_.drop(['Purchase'],axis=1)
y_train = train_.Purchase
train_X, test_X,train_y, test_y = train_test_split(X_train, y_train,random_state=100, test_size = 0.2)

In [28]:
# check correlations
train_.corr()['Purchase'].sort_values()

Product_Category_1           -0.343703
Product_ID                   -0.109345
Marital_Status               -0.000463
User_ID                       0.004716
Stay_In_Current_City_Years    0.005422
Age                           0.015839
Occupation                    0.020833
Product_Category_2            0.052288
Gender                        0.060346
City_Category                 0.061914
Product_Category_3            0.288501
Purchase                      1.000000
Name: Purchase, dtype: float64

In [14]:
# create gradient boosting model
gb = GradientBoostingRegressor(criterion='mse')

In [9]:
# fiting the model
gb.fit(train_X,train_y)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [10]:
# calculate rmse on the own train set
RMSE(gb.predict(train_X),train_y)

2914.0118246371117

In [11]:
#calculate rmse on theown test set
RMSE(gb.predict(test_X),test_y)

2925.419906845364

In [15]:
# refit the model with all data
gb.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
# score on all train data
RMSE(gb.predict(X_train),y_train) #score 2960.13046112878  for test on public leaderboard

2914.7300303534244

In [17]:
#get predictions
pred = gb.predict(test_)
get_file(test,pred,'not_tuned_gbr.csv')

In [18]:
#create classifier with new parameters
gb1 = GradientBoostingRegressor(criterion='mse', max_depth=5, n_estimators=500)

In [20]:
#fitting the classifier
gb1.fit(train_X,train_y)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
#calculate rmse on the own train set
RMSE(gb1.predict(train_X),train_y)

2594.9793626784835

In [22]:
#calculate rmse on the own test set
RMSE(gb1.predict(test_X),test_y) # may be overfitting

2637.585266635931

In [23]:
# refit with all data
gb1.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
# score in all train data
RMSE(gb1.predict(X_train),y_train) #score 2856.8337879410 on public leaderboard. Overfitting?

2603.684460804529

In [27]:
#get predictions
pred = gb1.predict(test_)
get_file(test,pred,'not_tuned_gbr1.csv')

In [30]:
# create model with new parameters
gb2 = GradientBoostingRegressor(criterion='mse', max_depth=5, n_estimators=500, min_samples_leaf=3)

In [32]:
# fit the model
gb2.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [33]:
# score on all train data
RMSE(gb2.predict(X_train),y_train)

2598.6896603112705

In [34]:
#get predictions
pred = gb2.predict(test_)
get_file(test,pred,'gbr2_depth5_est500_minsamplesleaf3.csv')

In [35]:
#create model with new parameters
gb3 = GradientBoostingRegressor(criterion='mse', max_depth=5, n_estimators=2000, min_samples_leaf=3)

In [36]:
# fit the model
gb3.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=2000, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [37]:
# calculate score on all train data
RMSE(gb3.predict(X_train),y_train)

2417.47559539084

In [38]:
#get predictions
pred = gb3.predict(test_)
get_file(test,pred,'gbr3_depth5_2000estims_minsamplesleaf3.csv')