In [57]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, mean_squared_error

In [103]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
# data preprocessing
def categorical_to_numerical(train, test):    
    train_ = train.copy()
    test_ = test.copy()  
    encoders_train = {}
    encoders_test = {} 
    for column in train.columns[:-1]:
        if train[column].dtype == np.object:
            label = LabelEncoder()
            encoders_train[column] = label.fit_transform(train[column])
            train_[column] = label.transform(train[column])
            
            label = LabelEncoder()
            encoders_test[column] = label.fit_transform(test[column])
            test_[column] = label.transform(test[column])
    train_.fillna(0,inplace=True)
    test_.fillna(0,inplace=True)   
    return train_, test_, encoders_train, encoders_test

In [61]:
#calculate RMSE
def RMSE(true_y, preds):
    return np.sqrt(mean_squared_error(true_y, preds))
rmse_scorer = make_scorer(RMSE, greater_is_better=False) #make own scorer 

In [71]:
# get file with predictions
def get_file(test,predictions, file_name):
    sample = pd.read_csv('Sample_Submission_Tm9Lura.csv')
    sample['User_ID'] = test.User_ID.values
    sample['Product_ID'] = test.Product_ID.values
    sample['Purchase'] = predictions
    sample.to_csv(file_name, index=False)

In [77]:
#read train data
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [78]:
#read test data
test = pd.read_csv('test.csv')

In [79]:
#preprocess train and test data
train_, test_, encoder_train, encoder_test = categorical_to_numerical(train, test)

In [80]:
X_train = train_.drop(['Purchase'],axis=1)
y_train = train_.Purchase

In [81]:
#make own test dataset from the train data
train_X, test_X,train_y, test_y = train_test_split(X_train, y_train,random_state=100, test_size = 0.2)

In [82]:
#create RF model
rf = RandomForestRegressor(criterion='mse',max_depth=6,n_estimators=200)

In [83]:
#fit the model
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [84]:
# calculate RMSE on train set
RMSE(rf.predict(X_train),y_train) #got 2997.43250390268 on public leaderboard

2939.5915834694156

In [86]:
#predict purchase 
pred = rf.predict(test_)

In [88]:
# get file with predictions
get_file(test,pred,'not_tuned_rf.csv')

In [91]:
# make grid for grid search
grid = {
    'n_estimators': [200,500,1000,2000],
    'max_depth': [6,8,10],
}
#make grid search model to tude RF regressor
tuned_rf = GridSearchCV(RandomForestRegressor(criterion='mse'),grid,scoring=rmse_scorer,cv=3)

In [92]:
#fit GS model
tuned_rf.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 500, 1000, 2000], 'max_depth': [6, 8, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(RMSE, greater_is_better=False), verbose=0)

In [93]:
#check best estimator 
tuned_rf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [94]:
#calculate rmse fon train set for tuned model
RMSE(tuned_rf.predict(X_train),y_train)

2764.3589185298524

In [99]:
# create another random forest model
rf1 = RandomForestRegressor(criterion='mse',max_depth=6,n_estimators=2000)

In [100]:
# fit it with own train set
rf1.fit(train_X,train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [101]:
# rmse on the own train set
RMSE(rf1.predict(train_X),train_y)

2919.432000345406

In [102]:
# rmse on the own test set
RMSE(rf1.predict(test_X),test_y)

2931.5996032973003