In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [12]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# prepare data
def categorical_to_numerical(train, test):    
    train_ = train.copy()
    test_ = test.copy()  
    encoders_train = {}
    encoders_test = {} 
    for column in train.columns[:-1]:
        if train[column].dtype == np.object:
            label = LabelEncoder()
            encoders_train[column] = label.fit_transform(train[column])
            train_[column] = label.transform(train[column])
            
            label = LabelEncoder()
            encoders_test[column] = label.fit_transform(test[column])
            test_[column] = label.transform(test[column])
    train_.fillna(0,inplace=True)
    test_.fillna(0,inplace=True)   
    return train_, test_, encoders_train, encoders_test

In [4]:
# get rmse score
def RMSE(true_y, preds):
    return np.sqrt(mean_squared_error(true_y, preds))

In [5]:
# get file with predictions for the submission
def get_file(test,predictions, file_name):
    sample = pd.read_csv('Sample_Submission_Tm9Lura.csv')
    sample['User_ID'] = test.User_ID.values
    sample['Product_ID'] = test.Product_ID.values
    sample['Purchase'] = predictions
    sample.to_csv(file_name, index=False)

In [6]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [7]:
test = pd.read_csv('test.csv')
train_, test_, encoder_train, encoder_test = categorical_to_numerical(train, test)
X_train = train_.drop(['Purchase'],axis=1)
y_train = train_.Purchase
train_X, test_X,train_y, test_y = train_test_split(X_train, y_train,random_state=100, test_size = 0.2)

In [15]:
ada = AdaBoostRegressor(RandomForestRegressor(criterion='mse',max_depth=6,n_estimators=100),n_estimators=50)

In [16]:
ada.fit(X_train,y_train)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=1.0, loss='linear', n_estimators=50,
         random_state=None)

In [17]:
RMSE(ada.predict(X_train),y_train) #give 3056.89656162323 on public leaderboard 

3009.3150419397853

In [18]:
pred = ada.predict(test_)
get_file(test,pred,'ada_rf.csv')