In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, mean_squared_error

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
#preprocess data
def categorical_to_numerical(train, test):    
    train_ = train.copy()
    test_ = test.copy()  
    encoders_train = {}
    encoders_test = {} 
    for column in train.columns[:-1]:
        if train[column].dtype == np.object:
            label = LabelEncoder()
            encoders_train[column] = label.fit_transform(train[column])
            train_[column] = label.transform(train[column])
            
            label = LabelEncoder()
            encoders_test[column] = label.fit_transform(test[column])
            test_[column] = label.transform(test[column])
    train_.fillna(0,inplace=True)
    test_.fillna(0,inplace=True)   
    return train_, test_, encoders_train, encoders_test

In [5]:
#calculate rmse
def RMSE(true_y, preds):
    return np.sqrt(mean_squared_error(true_y, preds))
rmse_scorer = make_scorer(RMSE, greater_is_better=False) #create own scorer

In [6]:
# get file with predictions
def get_file(test,predictions, file_name):
    sample = pd.read_csv('Sample_Submission_Tm9Lura.csv')
    sample['User_ID'] = test.User_ID.values
    sample['Product_ID'] = test.Product_ID.values
    sample['Purchase'] = predictions
    sample.to_csv(file_name, index=False)

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_, test_, encoder_train, encoder_test = categorical_to_numerical(train, test)
X_train = train_.drop(['Purchase'],axis=1)
y_train = train_.Purchase
train_X, test_X,train_y, test_y = train_test_split(X_train, y_train,random_state=100, test_size = 0.2)

In [14]:
gb = GradientBoostingRegressor(criterion='mse')

In [9]:
gb.fit(train_X,train_y)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [10]:
RMSE(gb.predict(train_X),train_y)

2914.0118246371117

In [11]:
RMSE(gb.predict(test_X),test_y)

2925.419906845364

In [15]:
# fit with all data
gb.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
RMSE(gb.predict(X_train),y_train) #score 2960.13046112878 on public leaderboard

2914.7300303534244

In [17]:
#get predictions
pred = gb.predict(test_)
get_file(test,pred,'not_tuned_gbr.csv')

In [18]:
gb1 = GradientBoostingRegressor(criterion='mse', max_depth=5, n_estimators=500)

In [20]:
gb1.fit(train_X,train_y)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
RMSE(gb1.predict(train_X),train_y)

2594.9793626784835

In [22]:
RMSE(gb1.predict(test_X),test_y) # may be overfitting

2637.585266635931

In [23]:
# fit with all data
gb1.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
RMSE(gb1.predict(X_train),y_train) #score 2856.8337879410 on public leaderboard. Overfitting?

2603.684460804529

In [27]:
#get predictions
pred = gb1.predict(test_)
get_file(test,pred,'not_tuned_gbr1.csv')