# Problem One - Determining Online Shoppers Purchasing Intention

### Import Required Libraries

In [1]:
# Reading / Writing Files & Encoding
import pandas as pd
import numpy as np

# Pre-Processing
from sklearn.model_selection import train_test_split

# Sampling
from imblearn.over_sampling import SMOTE

# Clustering
from sklearn.cluster import KMeans

# Feature Selection / Extraction
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Ensembles & Functions
from vecstack import stacking
from sklearn.svm import LinearSVC as svc
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, silhouette_score

# Miscellaneous
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

### Load in Dataset | Perform Basic Preparation

In [2]:
df = pd.read_csv("./data/assignment_five/online_shoppers_intention.csv")
df['Weekend'] = df['Weekend'].astype(int)

df = pd.get_dummies(df[['VisitorType','Month']]).merge(
    df, left_index=True, right_index=True).drop(['VisitorType','Month'], axis=1)

### Construct a Train-Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'Revenue'], df['Revenue'], train_size=0.85, random_state=42)

# K-Means Clustering

In [4]:
# Testing different number of clusters (pseudo hyperparameter tuning)
clusters = [i*10 for i in range(1,5)]
for c in clusters:
    kmeans = KMeans(n_clusters=c, init='random', algorithm='elkan')
    kmeans.fit(X_train)
    print(f"Silhouette Score for {c} Clusters: {round(silhouette_score(X_train, kmeans.labels_),6)}")

Silhouette Score for 10 Clusters: 0.49137
Silhouette Score for 20 Clusters: 0.446808
Silhouette Score for 30 Clusters: 0.403592
Silhouette Score for 40 Clusters: 0.387498


In [5]:
kmeans = KMeans(n_clusters=30, init='random', algorithm='elkan')
kmeans.fit(X_train)
predicted_clusters_train = kmeans.predict(X_train)
kmeans.fit(X_test)
predicted_clusters_test = kmeans.predict(X_test)

In [6]:
X_train['Cluster'] = predicted_clusters_train
X_test['Cluster'] = predicted_clusters_test
X_train.head()

Unnamed: 0,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,...,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType,Weekend,Cluster
2498,0,0,1,0,0,0,0,0,0,1,...,0.04,0.08,0.0,0.0,2,2,2,13,0,16
5431,0,0,1,0,0,0,0,0,0,1,...,0.015385,0.021953,0.0,0.0,2,2,1,6,1,6
9471,0,0,1,0,0,0,0,0,0,0,...,0.016667,0.022222,0.0,0.0,1,2,1,2,1,6
3753,0,0,1,0,0,0,0,0,0,1,...,0.0,0.007692,0.0,0.0,2,2,7,4,1,6
4122,0,0,1,0,0,0,0,0,0,1,...,0.0,0.025,45.4895,0.8,1,1,6,4,0,6


# Feature Selection

In [7]:
selector = SelectKBest(score_func=chi2, k=20)
selector.fit_transform(X_train, y_train)
cols = selector.get_support(indices=True)

selected_features_df = X_train.iloc[:,cols]
selected_features_df.head()

Unnamed: 0,VisitorType_New_Visitor,VisitorType_Returning_Visitor,Month_Dec,Month_Feb,Month_Mar,Month_May,Month_Nov,Month_Oct,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Browser,Weekend
2498,0,1,0,0,0,1,0,0,0,0.0,0,0.0,5,42.0,0.04,0.08,0.0,0.0,2,0
5431,0,1,0,0,0,1,0,0,0,0.0,0,0.0,28,608.522727,0.015385,0.021953,0.0,0.0,2,1
9471,0,1,0,0,0,0,1,0,0,0.0,0,0.0,12,748.25,0.016667,0.022222,0.0,0.0,2,1
3753,0,1,0,0,0,1,0,0,0,0.0,0,0.0,14,655.166667,0.0,0.007692,0.0,0.0,2,1
4122,0,1,0,0,0,1,0,0,0,0.0,0,0.0,8,710.0,0.0,0.025,45.4895,0.8,1,0


# Predict Target Value Using Classifiers

In [8]:
def accuracy_output(model):
    print("=== ACCURACY ===")
    print(round(model.score(X_test, y_test),6))

In [9]:
def confusion_output(y_test,X_test_prediction):
    print("=== CONFUSION MATRIX ===")
    print(confusion_matrix(y_test, X_test_prediction))

### 1. Decision Tree Classifier

In [10]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_prediction = decision_tree.predict(X_test)

In [11]:
accuracy_output(decision_tree)
confusion_output(y_test, decision_tree_prediction)

=== ACCURACY ===
0.860541
=== CONFUSION MATRIX ===
[[1431  121]
 [ 137  161]]


### 2. Random Forest Classifier

In [12]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_prediction = random_forest.predict(X_test)

In [13]:
accuracy_output(random_forest)
confusion_output(y_test, random_forest_prediction)

=== ACCURACY ===
0.902162
=== CONFUSION MATRIX ===
[[1504   48]
 [ 133  165]]


### 2.1 Random Forest Classifier - Hyperparameter Tuning

In [14]:
random_parameters = {
    'min_samples_leaf' : [i for i in range(1,5)],
    'max_depth': [i*5 for i in range(10,20)],
    'n_estimators':[i*5 for i in range(5,15)]
}

In [15]:
random_forest_random = RandomizedSearchCV(random_forest,random_parameters,n_iter=50,cv=5,n_jobs=-1,verbose=False)
random_forest_random.fit(X_train, y_train)
random_forest_random_params = random_forest_random.best_params_
random_forest_random_params

{'n_estimators': 50, 'min_samples_leaf': 4, 'max_depth': 55}

In [16]:
random_forest = RandomForestClassifier(**random_forest_random_params)
random_forest.fit(X_train, y_train)
random_forest_prediction = random_forest.predict(X_test)

In [17]:
accuracy_output(random_forest)
confusion_output(y_test, random_forest_prediction)

=== ACCURACY ===
0.893514
=== CONFUSION MATRIX ===
[[1495   57]
 [ 140  158]]


### 3. Linear Support Vector Classifier

In [18]:
support_vector_machines = svc()
support_vector_machines.fit(X_train, y_train)
support_vector_prediction = support_vector_machines.predict(X_test)

In [19]:
accuracy_output(support_vector_machines)
confusion_output(y_test, support_vector_prediction)

=== ACCURACY ===
0.82973
=== CONFUSION MATRIX ===
[[1499   53]
 [ 262   36]]


### 4. K-Nearest Neighbors Classifier

In [20]:
k_neighbors = KNeighborsClassifier()
k_neighbors.fit(X_train, y_train)
k_neighbors_prediction = k_neighbors.predict(X_test)

In [21]:
accuracy_output(k_neighbors)
confusion_output(y_test, k_neighbors_prediction)

=== ACCURACY ===
0.854595
=== CONFUSION MATRIX ===
[[1505   47]
 [ 222   76]]


### 5. Multilayer Perceptron Classifier

In [22]:
multilayer_perceptron = MLPClassifier()
multilayer_perceptron.fit(X_train, y_train)
multilayer_perceptron_prediction = multilayer_perceptron.predict(X_test)

In [23]:
accuracy_output(multilayer_perceptron)
confusion_output(y_test, multilayer_perceptron_prediction)

=== ACCURACY ===
0.878378
=== CONFUSION MATRIX ===
[[1524   28]
 [ 197  101]]


### 5.1 Multilayer Perceptron Classifier - Hyperparameter Tuning

In [24]:
m_layer_parameters = {
    'activation':['identity', 'logistic', 'tanh', 'relu'],
    'learning_rate_init':[i/1000 for i in range(1,5)],
    'max_iter':[i*50 for i in range(2,6)],
    'tol':[i/10000 for i in range(2,5)]
}

In [25]:
m_layer_random = RandomizedSearchCV(multilayer_perceptron,m_layer_parameters,n_iter=10,cv=5,n_jobs=-1,verbose=False)
m_layer_random.fit(X_train,y_train)
m_layer_random_params = m_layer_random.best_params_
m_layer_random_params

{'tol': 0.0003,
 'max_iter': 250,
 'learning_rate_init': 0.003,
 'activation': 'tanh'}

In [26]:
multilayer_perceptron = MLPClassifier(**m_layer_random_params)
multilayer_perceptron.fit(X_train, y_train)
multilayer_perceptron_prediction = multilayer_perceptron.predict(X_test)

### 6. Gradient Boosting Classifier

In [27]:
gradient_boosting = GradientBoostingClassifier()
gradient_boosting.fit(X_train, y_train)
gradient_boosting_prediction = gradient_boosting.predict(X_test)

In [28]:
accuracy_output(gradient_boosting)
confusion_output(y_test, gradient_boosting_prediction)

=== ACCURACY ===
0.896757
=== CONFUSION MATRIX ===
[[1480   72]
 [ 119  179]]


### 6.1 Gradient Boosting Classifier - Hyperparameter Tuning

In [29]:
g_boosting_parameters = {
    'criterion':['friedman_mse','mse','mae'],
    'min_samples_split':[i*2 for i in range(1,5)]
}

In [30]:
g_boosting_random = RandomizedSearchCV(gradient_boosting,g_boosting_parameters,n_iter=5,cv=5,n_jobs=-1,verbose=False)
g_boosting_random.fit(X_train,y_train)
g_boosting_random_params = g_boosting_random.best_params_
g_boosting_random_params

{'min_samples_split': 8, 'criterion': 'mse'}

In [31]:
gradient_boosting = GradientBoostingClassifier(**g_boosting_random_params)
gradient_boosting.fit(X_train, y_train)
gradient_boosting_prediction = gradient_boosting.predict(X_test)

In [32]:
accuracy_output(gradient_boosting)
confusion_output(y_test, gradient_boosting_prediction)

=== ACCURACY ===
0.897297
=== CONFUSION MATRIX ===
[[1481   71]
 [ 119  179]]


### 7. Gradient Descent Classifier

In [33]:
gradient_descent = SGDClassifier()
gradient_descent.fit(X_train, y_train)
gradient_descent_prediction = gradient_descent.predict(X_test)

In [34]:
accuracy_output(gradient_descent)
confusion_output(y_test, gradient_descent_prediction)

=== ACCURACY ===
0.859459
=== CONFUSION MATRIX ===
[[1544    8]
 [ 252   46]]


### 7.1 Gradient Descent Classifier - Hyperparameter Tuning

In [35]:
g_descent_parameters = {
    'tol':[i/10000 for i in range(1,5)],
    'penalty':['l2', 'l1', 'elasticnet'],
    'max_iter':[i*100 for i in range(2,6)]
}

In [36]:
g_descent_random = RandomizedSearchCV(gradient_descent, g_descent_parameters, n_iter=50, cv=5, n_jobs=-1, verbose=False)
g_descent_random.fit(X_train, y_train)
g_descent_random_params = g_descent_random.best_params_
g_descent_random_params

{'tol': 0.0002, 'penalty': 'l1', 'max_iter': 200}

In [37]:
gradient_descent = SGDClassifier(**g_descent_random_params)
gradient_descent.fit(X_train, y_train)
gradient_descent_prediction = gradient_descent.predict(X_test)

In [38]:
accuracy_output(gradient_descent)
confusion_output(y_test, gradient_descent_prediction)

=== ACCURACY ===
0.875676
=== CONFUSION MATRIX ===
[[1492   60]
 [ 170  128]]


# Stacked Model With Hyperparameter Tuning

In [39]:
models = [
    RandomForestClassifier(**random_forest_random_params),
    GradientBoostingClassifier(**g_boosting_random_params),
    SGDClassifier(**g_descent_random_params)
]

stack_train, stack_test = stacking(models,                   
    X_train, y_train, df,   
    regression=True, 
    mode='oof_pred_bag', 
    needs_proba=False,
    save_dir=None,
    n_folds=10, 
    stratified=True,
    shuffle=True,  
    random_state=0)

In [40]:
results = {
    'r_forest':RandomForestClassifier(**random_forest_random_params).fit(stack_train,y_train).predict(stack_test),
    'mlp':MLPClassifier(**m_layer_random_params).fit(stack_train,y_train).predict(stack_test),
    'g_boost':GradientBoostingClassifier(**g_boosting_random_params).fit(stack_train,y_train).predict(stack_test),
    'g_desc':SGDClassifier(**g_descent_random_params).fit(stack_train,y_train).predict(stack_test)
}

results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,r_forest,mlp,g_boost,g_desc
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
12325,False,False,False,False
12326,False,False,False,False
12327,False,False,False,False
12328,False,False,False,False


# Accuracy Check

### Random Forest Classifier

In [41]:
print(f"Accuracy Score: {round(accuracy_score(df['Revenue'], results_df['r_forest']),6)}")
confusion_output(df['Revenue'], results_df['r_forest'])

Accuracy Score: 0.929035
=== CONFUSION MATRIX ===
[[10286   136]
 [  739  1169]]


### Multilayer Perceptron

In [42]:
print(f"Accuracy Score: {round(accuracy_score(df['Revenue'], results_df['mlp']),6)}")
confusion_output(df['Revenue'], results_df['mlp'])

Accuracy Score: 0.921817
=== CONFUSION MATRIX ===
[[10045   377]
 [  587  1321]]


### Gradient Boosting

In [43]:
print(f"Accuracy Score: {round(accuracy_score(df['Revenue'], results_df['g_boost']),6)}")
confusion_output(df['Revenue'], results_df['g_boost'])

Accuracy Score: 0.929035
=== CONFUSION MATRIX ===
[[10286   136]
 [  739  1169]]


### Gradient Descent

In [44]:
print(f"Accuracy Score: {round(accuracy_score(df['Revenue'], results_df['g_desc']),6)}")
confusion_output(df['Revenue'], results_df['g_desc'])

Accuracy Score: 0.916626
=== CONFUSION MATRIX ===
[[10067   355]
 [  673  1235]]
