In [241]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

In [197]:
df_train = pd.read_csv('./data/initial_feature_selection.csv', index_col=0)

In [198]:
df_test = pd.read_csv('./data/test_set.csv', index_col=0)

In [199]:
df_train.columns

Index(['date', 'cookie_id', 'visit_number', 'user_id', 'device_Mobile_Phone',
       'channel_Display', 'channel_Email', 'channel_Push_Notifications',
       'entry_page_account_dashboard', 'entry_page_alert_results',
       ...
       'page_views_search_filter_car_places', 'page_views_search_filter_color',
       'page_views_search_filter_fuel',
       'page_views_search_filter_region_level2',
       'page_views_search_filter_transmission',
       'page_views_search_filter_user_role_id',
       'page_views_search_filter_version',
       'page_views_send_buyer_phone_to_seller', 'label', 'confidence'],
      dtype='object', length=135)

In [200]:
df_test.columns

Index(['device_Mobile_Phone', 'device_Television', 'channel_Display',
       'channel_Email', 'channel_Paid_Search', 'channel_Push_Notifications',
       'entry_page_account_dashboard', 'entry_page_ad_deletion_confirmation',
       'entry_page_ad_deletion_form', 'entry_page_ad_insertion_add_photo',
       ...
       'page_views_search_filter_car_places', 'page_views_search_filter_color',
       'page_views_search_filter_fuel',
       'page_views_search_filter_region_level2',
       'page_views_search_filter_transmission',
       'page_views_search_filter_user_role_id',
       'page_views_search_filter_version',
       'page_views_send_buyer_phone_to_seller',
       'page_views_send_buyer_phone_to_seller_later',
       'page_views_terms_and_conditions'],
      dtype='object', length=239)

### Dropping confidence and date column

In [201]:
df_train.drop(['confidence', 'date'], axis=1, inplace=True)

### Dropping columns from df_test so that they match df_train

In [202]:
df_train.columns.difference(df_test.columns)

Index(['cookie_id', 'label', 'user_id', 'visit_number'], dtype='object')

In [203]:
df_test.columns.difference(df_train.columns).tolist()

['Visits_c_detail_save_favorite_top_bar',
 'Visits_c_menu_my_ads',
 'Visits_c_menu_my_favorites',
 'Visits_c_menu_new_search',
 'Visits_c_my_ads_no_action_status',
 'Visits_c_my_alerts_disable_alert_emails',
 'Visits_c_my_alerts_enable_alert_push_notif',
 'Visits_c_notifications_configuration_personalized_alerts_email',
 'channel_Paid_Search',
 'device_Television',
 'e2_CallPhone_0',
 'entry_page_ad_deletion_confirmation',
 'entry_page_ad_deletion_form',
 'entry_page_ad_insertion_add_photo',
 'entry_page_ad_insertion_brand',
 'entry_page_ad_insertion_car_body',
 'entry_page_ad_insertion_color',
 'entry_page_ad_insertion_confirmation',
 'entry_page_ad_insertion_error',
 'entry_page_ad_insertion_km',
 'entry_page_ad_insertion_model',
 'entry_page_ad_insertion_photo_gallery',
 'entry_page_ad_insertion_price',
 'entry_page_ad_insertion_step1',
 'entry_page_ad_insertion_version',
 'entry_page_ad_modification_add_photo',
 'entry_page_ad_modification_cancel',
 'entry_page_ad_modification_car_

In [113]:
df_test_dropped = df_test.drop(df_test.columns.difference(df_train.columns).tolist(), axis=1)

In [120]:
df_train.columns.difference(df_test_dropped.columns).tolist()

['cookie_id', 'label', 'user_id', 'visit_number']

In [121]:
df_train_dropped = df_train.drop(['cookie_id', 'user_id', 'visit_number'], axis=1)

### Separating train set into X and y

In [122]:
X = df_train_dropped.drop('label', axis=1)
y = df_train_dropped.label

### Using only the train set with a 75:25 balanced split

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

### Running a grid search for parameter selection for a Random Forest Classifier model

In [124]:
param_grid = {
    'n_estimators': [150, 300],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [125]:
RFC = RandomForestClassifier(n_jobs=-1)

In [126]:
GS = GridSearchCV(RFC, param_grid, cv=5)

In [127]:
GS.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [128]:
GS.best_params_

{'max_features': 'auto', 'n_estimators': 150}

In [129]:
RFC = RandomForestClassifier(max_features='auto', n_estimators=150)

In [130]:
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [131]:
y_pred = RFC.predict(X_test)

In [132]:
RFC.score(X_test, y_test)

0.912

In [139]:
f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')

0.9109694717023722

In [153]:
RFC.feature_importances_

array([2.34626995e-03, 7.99034685e-04, 5.23656841e-03, 2.87767506e-02,
       1.26127977e-03, 3.35061891e-02, 1.25741232e-04, 1.23042202e-03,
       0.00000000e+00, 6.25892824e-05, 3.56149093e-02, 1.38087838e-04,
       1.56957123e-04, 1.61205359e-04, 4.48244472e-02, 1.67400889e-04,
       4.83286348e-03, 6.13280271e-04, 8.94668258e-03, 1.46178993e-03,
       4.13125252e-04, 1.53857083e-03, 6.63018737e-04, 4.60705779e-05,
       1.38626795e-03, 5.74244907e-04, 3.01689709e-04, 2.23194566e-03,
       3.63637296e-03, 1.62939245e-04, 1.58675731e-04, 6.35343063e-05,
       1.20286011e-01, 2.68009465e-04, 1.35757241e-03, 1.03816895e-03,
       2.52804602e-03, 4.56824771e-04, 1.07070014e-04, 9.29905126e-04,
       1.13083380e-03, 2.19563353e-04, 5.96414945e-04, 2.27328652e-05,
       3.05464808e-04, 1.55090949e-04, 1.16152606e-04, 7.25910081e-04,
       5.39846948e-05, 3.70687797e-04, 2.86206678e-04, 5.33662454e-03,
       4.29248177e-04, 1.74014357e-03, 7.02895011e-05, 3.22465873e-04,
      

### Checking score against actual test set

In [137]:
actual_X_test = df_test_dropped

In [138]:
actual_y_pred = RFC.predict(actual_X_test)

In [142]:
actual_X_test.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            249990, 249991, 249992, 249993, 249994, 249995, 249996, 249997,
            249998, 249999],
           dtype='int64', name='case_id', length=250000)

In [144]:
len(actual_y_pred)

250000

In [146]:
pred_df = pd.DataFrame(zip(actual_X_test.index, actual_y_pred), columns=['case_id', 'prediction'], index=)

In [152]:
pred_df.to_csv('./data/RFC1.csv')

F1 score = '0.67441'

### Using new df with label 5 added from extra training set for training

In [160]:
df_5 = pd.read_csv('./data/df_with_5.csv', index_col=0)

In [164]:
df_5.drop(['user_id', 'confidence'], axis=1, inplace=True)

In [169]:
X = df_5.drop('label', axis=1)
y = df_5.label

In [170]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [207]:
param_grid = {
    'n_estimators': [500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10],
    'min_samples_leaf': [50, 75, 100]
}

In [208]:
RFC = RandomForestClassifier(n_jobs=-1)

In [209]:
GS = GridSearchCV(RFC, param_grid, cv=5, verbose=3)

In [210]:
GS.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.599, total=   2.0s
[CV] max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV]  max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.649, total=   0.5s
[CV] max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV]  max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.659, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.646, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.649, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.602, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.626, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.638, total=   0

[CV]  max_depth=5, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.697, total=   0.5s
[CV] max_depth=5, max_features=auto, min_samples_leaf=50, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.697, total=   0.4s
[CV] max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.637, total=   0.5s
[CV] max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.643, total=   0.4s
[CV] max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.650, total=   0.5s
[CV] max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.676, total=   0

[CV]  max_depth=10, max_features=auto, min_samples_leaf=50, n_estimators=500, score=0.688, total=   0.4s
[CV] max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.652, total=   0.4s
[CV] max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.655, total=   0.4s
[CV] max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.647, total=   0.4s
[CV] max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.694, total=   0.4s
[CV] max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=75, n_estimators=500, score=0.673,

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  1.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [211]:
GS.best_params_

{'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 50,
 'n_estimators': 500}

In [229]:
RFC2 = RandomForestClassifier(n_estimators = 500, max_depth =  10, max_features = 'log2', min_samples_leaf = 50)

In [230]:
RFC2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=50, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [231]:
y_pred = RFC2.predict(X_test)

In [232]:
f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')

0.6384706608578523

In [220]:
df_test = pd.read_csv('./data/test_set.csv', index_col=0)

In [228]:
df_test = df_test[df_5.columns.tolist()[:-1]]

In [233]:
actual_y_pred = RFC2.predict(df_test)

In [234]:
pred_df = pd.DataFrame(zip(df_test.index, actual_y_pred), columns=['case_id', 'prediction'])

In [None]:
pred_df.to_csv('./data/RFC2.csv')

### Using KNN

In [246]:
transformer = Normalizer().fit(X_train)
norm_X = transformer.transform(X_train)

In [254]:
transformer = Normalizer().fit(X_test)
norm_X_test = transformer.transform(X_test)

In [247]:
KNN = KNeighborsClassifier()

In [248]:
param_grid = {'n_neighbors': [3,5,7,9,11],
              'weights': ['uniform', 'distance'],
              'leaf_size': [30, 45, 60, 75]}

In [249]:
GS = GridSearchCV(KNN, param_grid, cv=5, verbose=3)

In [250]:
GS.fit(norm_X, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] leaf_size=30, n_neighbors=3, weights=uniform ....................
[CV]  leaf_size=30, n_neighbors=3, weights=uniform, score=0.573, total=   0.1s
[CV] leaf_size=30, n_neighbors=3, weights=uniform ....................
[CV]  leaf_size=30, n_neighbors=3, weights=uniform, score=0.605, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=uniform ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV]  leaf_size=30, n_neighbors=3, weights=uniform, score=0.596, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=uniform ....................
[CV]  leaf_size=30, n_neighbors=3, weights=uniform, score=0.580, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=uniform ....................
[CV]  leaf_size=30, n_neighbors=3, weights=uniform, score=0.571, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.614, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.617, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.611, total=   0.0s
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.619, total=   0.0s
[CV] leaf_size=30

[CV]  leaf_size=45, n_neighbors=5, weights=uniform, score=0.611, total=   0.0s
[CV] leaf_size=45, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=45, n_neighbors=5, weights=uniform, score=0.605, total=   0.1s
[CV] leaf_size=45, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=45, n_neighbors=5, weights=uniform, score=0.553, total=   0.1s
[CV] leaf_size=45, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=45, n_neighbors=5, weights=uniform, score=0.568, total=   0.1s
[CV] leaf_size=45, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=45, n_neighbors=5, weights=distance, score=0.608, total=   0.0s
[CV] leaf_size=45, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=45, n_neighbors=5, weights=distance, score=0.635, total=   0.0s
[CV] leaf_size=45, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=45, n_neighbors=5, weights=distance, score=0.614, total=   0.0s
[CV] leaf_size=45,

[CV]  leaf_size=60, n_neighbors=5, weights=distance, score=0.635, total=   0.0s
[CV] leaf_size=60, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=60, n_neighbors=5, weights=distance, score=0.614, total=   0.0s
[CV] leaf_size=60, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=60, n_neighbors=5, weights=distance, score=0.586, total=   0.0s
[CV] leaf_size=60, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=60, n_neighbors=5, weights=distance, score=0.607, total=   0.0s
[CV] leaf_size=60, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=60, n_neighbors=7, weights=uniform, score=0.573, total=   0.1s
[CV] leaf_size=60, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=60, n_neighbors=7, weights=uniform, score=0.591, total=   0.1s
[CV] leaf_size=60, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=60, n_neighbors=7, weights=uniform, score=0.591, total=   0.0s
[CV] leaf_size=60

[CV]  leaf_size=75, n_neighbors=7, weights=uniform, score=0.544, total=   0.1s
[CV] leaf_size=75, n_neighbors=7, weights=distance ...................
[CV]  leaf_size=75, n_neighbors=7, weights=distance, score=0.617, total=   0.0s
[CV] leaf_size=75, n_neighbors=7, weights=distance ...................
[CV]  leaf_size=75, n_neighbors=7, weights=distance, score=0.611, total=   0.0s
[CV] leaf_size=75, n_neighbors=7, weights=distance ...................
[CV]  leaf_size=75, n_neighbors=7, weights=distance, score=0.611, total=   0.0s
[CV] leaf_size=75, n_neighbors=7, weights=distance ...................
[CV]  leaf_size=75, n_neighbors=7, weights=distance, score=0.583, total=   0.0s
[CV] leaf_size=75, n_neighbors=7, weights=distance ...................
[CV]  leaf_size=75, n_neighbors=7, weights=distance, score=0.598, total=   0.0s
[CV] leaf_size=75, n_neighbors=9, weights=uniform ....................
[CV]  leaf_size=75, n_neighbors=9, weights=uniform, score=0.576, total=   0.1s
[CV] leaf_size=7

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    9.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'leaf_size': [30, 45, 60, 75],
                         'n_neighbors': [3, 5, 7, 9, 11],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [251]:
GS.best_params_

{'leaf_size': 45, 'n_neighbors': 3, 'weights': 'distance'}

In [252]:
KNN = KNeighborsClassifier(leaf_size=45, n_neighbors=3, weights='distance')

In [258]:
KNN.fit(norm_X, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=45, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

In [259]:
y_pred = KNN.predict(norm_X_test)

In [260]:
f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')

0.6216973369315215

### Using new df

In [262]:
df = pd.read_csv('./data/df_final.csv', index_col=0)

In [265]:
X = df.drop('label', axis=1)

In [266]:
y = df.label

In [267]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [270]:
RFC = RandomForestClassifier(n_jobs=-1)

In [269]:
param_grid = {
    'n_estimators': [500, 750],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2, 4, 8, 16],
    'min_samples_leaf': [2, 5, 10]
}

In [None]:
GS = GridSearchCV(RFC, param_grid, cv=5)