In [1]:
#import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection  import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier 
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE 
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import os
from sklearn.metrics import roc_curve, auc
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [4]:
#Treat a missing buy_freq as 0: overwrite only the null entries
ads_train.loc[ads_train['buy_freq'].isna(), 'buy_freq'] = 0

In [24]:
#Peek at the first five rows (the trailing semicolon suppresses the cell output)
ads_train.head(n=5);

In [6]:
#Remove the isbuyer, last_buy and multiple_buy columns
#NOTE(review): the previous note here also listed "last visit", but no such column is dropped - confirm intent
ads_train = ads_train.drop(columns=['isbuyer', 'last_buy', 'multiple_buy'])

In [7]:
#Floor both expected-time columns at zero: any negative value becomes 0
ads_train['expected_time_buy'] = ads_train['expected_time_buy'].clip(lower=0)
ads_train['expected_time_visit'] = ads_train['expected_time_visit'].clip(lower=0)

In [25]:
#Frequency of each expected_time_buy value (output suppressed by the semicolon)
ads_train.loc[:, 'expected_time_buy'].value_counts();

In [9]:
#Outlier threshold for sv_interval: mean plus two standard deviations,
#measured on ads_train as it stands when this cell runs
mean_sv, std_sv = ads_train['sv_interval'].mean(), ads_train['sv_interval'].std()
cutoff_sv = std_sv * 2 + mean_sv

In [10]:
#Outlier threshold for num_checkins: mean plus two standard deviations,
#measured on ads_train as it stands when this cell runs
mean_ck, std_ck = ads_train['num_checkins'].mean(), ads_train['num_checkins'].std()
cutoff_ck = std_ck * 2 + mean_ck

In [11]:
#Discard every row whose uniq_urls equals -1
ads_train = ads_train.loc[ads_train['uniq_urls'] != -1]

In [12]:
#Keep only rows whose sv_interval is strictly below cutoff_sv
ads_train = ads_train.loc[ads_train['sv_interval'] < cutoff_sv]

In [13]:
#Keep only rows whose num_checkins is strictly below cutoff_ck
ads_train = ads_train.loc[ads_train['num_checkins'] < cutoff_ck]

In [26]:
#Peek at the first five rows after filtering (semicolon suppresses the output)
ads_train.head(n=5);

In [15]:
#score function
def score(y_test,y_pred):
    """Print the confusion matrix, then accuracy, precision and recall as percentages.

    y_test holds the true labels and y_pred the predicted labels; both are
    passed unchanged to the sklearn.metrics functions. Nothing is returned.
    """
    print(metrics.confusion_matrix(y_test, y_pred))
    # (label, metric) pairs in the order they are reported
    reported = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
    )
    for label, metric in reported:
        percent = metric(y_test, y_pred) * 100
        print(f"{label}: {percent!s} %")

In [16]:
#Add a constant intercept column, then separate the target (y_buy) from the features
ads_train['intercept'] = 1.0
y = ads_train['y_buy']
X = ads_train.drop(columns=['y_buy'])

In [17]:
#SMOTE
smt = SMOTE(random_state=42)
X_sm, y_sm = smt.fit_resample(X, y)

In [18]:
#split 
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20,random_state=109) 

In [19]:
#Random forest: 100 entropy-criterion trees capped at depth 23, every feature
#considered at each split (max_features=None); fitted on the training split
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=23,
    max_features=None,
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    bootstrap=True,
    random_state=42,
).fit(X_train, y_train)

In [20]:
#Predict the X_test rows with the fitted forest and print the scores against y_test
y_predRF = rf.predict(X_test)
score(y_test=y_test, y_pred=y_predRF)

[[6839   83]
 [  38 6868]]
Accuracy: 99.12496384148105 %
Precision: 98.80592720471874 %
Recall: 99.44975383724298 %


In [21]:
#Hold-out split of the original (non-resampled) X and y: 20% test, seed 109
X_trainO, X_testO, y_trainO, y_testO = train_test_split(
    X, y, test_size=0.2, random_state=109
)

In [22]:
#Score the forest on the 20% test split of the original data
#NOTE(review): this is only an unbiased estimate if no X_testO row was part of the data rf was fitted on - confirm
y_predRF = rf.predict(X_testO)
score(y_test=y_testO, y_pred=y_predRF)

[[6896   19]
 [   3   28]]
Accuracy: 99.6832709473078 %
Precision: 59.57446808510638 %
Recall: 90.32258064516128 %


In [23]:
#Score the forest on every row of the original data set
#NOTE(review): X includes rows used to fit rf, so these figures are likely optimistic
y_predRF = rf.predict(X)
score(y_test=y, y_pred=y_predRF)

[[34466   104]
 [   12   147]]
Accuracy: 99.66598519968902 %
Precision: 58.56573705179283 %
Recall: 92.45283018867924 %
