In [40]:
#import packages
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [41]:
#Load the session records (trainingData.csv) and the per-session labels (trainingLabels.csv).
#Neither file is read with a header row, so column names are supplied explicitly.
colnames_train = ['session_id', 'start_time', 'end_time', 'product_list']
#NOTE: despite the "_test" suffix, these are the training labels, not a held-out test set.
colnames_test = ['label']
#Timestamp parser for the ISO-like format '%Y-%m-%dT%H:%M:%S'. It is not passed to read_csv below
#(parse_dates handles the conversion); it is kept so any later cell that calls it keeps working.
#Uses the stdlib datetime class: the pd.datetime alias was deprecated in pandas 1.0 and removed in 2.0.
dateparse = lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')
df = pd.read_csv("./data/trainingData.csv", names=colnames_train, parse_dates=['start_time', 'end_time'], header=None)
df_test = pd.read_csv("./data/trainingLabels.csv", names = colnames_test)

In [42]:
#Feature engineering: derive numeric and calendar features from the raw session columns.
#Session length in seconds.
df['session_duration'] = (df['end_time'] - df['start_time']).dt.total_seconds()
#Number of 'A' characters in the raw product string, used as the number of products viewed.
#assumes every product entry contains exactly one 'A' - TODO confirm against the data format
df['product_count'] = df['product_list'].str.count('A')
#NOTE(review): a session with product_count == 0 would produce inf/NaN here - confirm none exist.
df['Average_duration_per_product'] = df['session_duration'] / df['product_count']
#Calendar features; all taken from end_time except start_hour.
df['day_of_week'] = df['end_time'].dt.dayofweek
df['date'] = df['end_time'].dt.day  #day of month
df['month'] = df['end_time'].dt.month
df['start_hour'] = df['start_time'].dt.hour
df['end_hour'] = df['end_time'].dt.hour

#Standardise product_count (zero mean, unit variance).
#NOTE(review): the scaler is fitted on ALL rows, before the train/test split further down,
#so test-set statistics leak into this feature - consider fitting on the training rows only.
sc = StandardScaler()
prd_cnt = df['product_count'].values.astype(float).reshape(-1, 1)  #sklearn expects a 2-D float array
df['product_count'] = sc.fit_transform(prd_cnt).ravel()  #flatten (n, 1) -> (n,) before assigning to one column

#Categorical columns to be one-hot encoded in the next cell. They are selected by name rather than by
#position (previously df.iloc[:,7:12]) so that adding or reordering columns cannot pick the wrong ones.
df_dummy = df[['day_of_week', 'date', 'month', 'start_hour', 'end_hour']]

In [43]:
#One-hot encode the categorical time features: day_of_week, date, month, start_hour, end_hour.
#dtype is pinned to uint8 (the historical pandas default). Since pandas 2.0 get_dummies returns bool
#columns, which would turn the mixed feature matrix into an object array when .values is taken later.
dummy = pd.get_dummies(
    df_dummy,
    columns=['day_of_week', 'date', 'month', 'start_hour', 'end_hour'],
    dtype=np.uint8,
)
#Append the indicator columns to the main frame; the raw categorical columns are dropped in a later cell.
df = pd.concat([df, dummy], axis = 1)

In [44]:
#Prepare product_list for counting the number of nodes at each hierarchy level:
#  1. turn the '/' separators into spaces,
#  2. strip every character that is not a letter, digit, comma or whitespace,
#  3. split on whitespace so each session holds a list of node ids.
#regex=True is passed explicitly because pandas 2.0 changed the str.replace default to literal matching,
#which would leave the pattern unapplied; the pattern is a raw string so '\s' is not an invalid escape.
product_tokens = (
    df['product_list']
    .str.replace('/', ' ', regex=False)
    .str.replace(r'[^A-Za-z0-9,\s]+', '', regex=True)
    .str.split()
)
df['product_list'] = product_tokens
#Helper column 'new': the distinct node ids of the session joined into one comma-separated string.
df['new'] = product_tokens.map(lambda ids: ','.join(set(ids)))

In [45]:
#Number of distinct nodes per session at each hierarchy level.
#Each count is the number of occurrences of the level letter in the de-duplicated id string 'new'.
#assumes every node id contains its level letter exactly once - TODO confirm
for level in ('A', 'B', 'C', 'D'):
    df[level + '_unique_count'] = df['new'].str.count(level)

In [46]:
#Remove the raw categorical variables that have been dummy-encoded, plus the helper column 'new'.
cols_to_remove = ['day_of_week', 'date', 'month', 'start_hour', 'end_hour', 'new']
df.drop(cols_to_remove, axis=1, inplace=True)

In [47]:
#Display the column index of the engineered frame as a sanity check after the encoding and drop steps
df.columns

Index(['session_id', 'start_time', 'end_time', 'product_list',
       'session_duration', 'product_count', 'Average_duration_per_product',
       'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'date_1', 'date_2',
       'date_3', 'date_5', 'date_6', 'date_7', 'date_8', 'date_9', 'date_12',
       'date_13', 'date_14', 'date_15', 'date_16', 'date_17', 'date_18',
       'date_19', 'date_20', 'date_21', 'date_22', 'date_23', 'date_25',
       'date_26', 'date_27', 'date_28', 'date_29', 'date_30', 'month_11',
       'month_12', 'start_hour_0', 'start_hour_1', 'start_hour_2',
       'start_hour_3', 'start_hour_4', 'start_hour_5', 'start_hour_6',
       'start_hour_7', 'start_hour_8', 'start_hour_9', 'start_hour_10',
       'start_hour_11', 'start_hour_12', 'start_hour_13', 'start_hour_14',
       'start_hour_15', 'start_hour_16', 'start_hour_17', 'start_hour_18',
       'start_hour_19', 'start_hour_20', 'start_ho

In [48]:
#Multi-hot encode the products/hierarchical nodes: one indicator column per distinct node id,
#with rows kept aligned to df through its index.
mlb = MultiLabelBinarizer()
node_indicators = mlb.fit_transform(df['product_list'])
df_product = pd.DataFrame(node_indicators, index=df.index, columns=mlb.classes_)

In [49]:
#Drop the products/hierarchical nodes that appear in fewer than 3 sessions.
#Uses a vectorised mask instead of Series.iteritems(), which was removed in pandas 2.0.
node_frequency = df_product.sum()  #column sums of the 0/1 indicators = sessions containing each node
rare_nodes = node_frequency[node_frequency < 3].index
df_product.drop(rare_nodes, axis=1, inplace=True)

In [50]:
#Combine the node indicator columns with the engineered features, column-wise,
#then discard the identifier/raw columns that are redundant or unwanted for model construction.
non_feature_cols = ['product_list', 'session_id', 'start_time', 'end_time']
df_product = pd.concat([df_product, df], axis=1).drop(non_feature_cols, axis=1)

In [51]:
#Add the target variable "gender" from the labels frame.
#The assignment aligns on the row index, so a label file with a different number of rows would
#silently leave NaN targets; fail loudly instead.
assert len(df_test) == len(df_product), (
    "label/feature row count mismatch: %d labels vs %d sessions" % (len(df_test), len(df_product))
)
df_product['gender'] = df_test['label']

In [52]:
#Encode the target classes as integers: female -> 0, male -> 1.
#presumably the raw labels are the strings 'female'/'male' - verify against trainingLabels.csv
labelencoder_y = LabelEncoder()
labelencoder_y.fit(df_product['gender'])
df_product['gender'] = labelencoder_y.transform(df_product['gender'])

In [53]:
#Target variable - number of sessions per class.
#size() counts rows per group directly, so the check no longer depends on an arbitrary feature column
#(previously 'day_of_week_1') being present in the frame.
df_product.groupby('gender').size()

gender
0    11703
1     3297
Name: day_of_week_1, dtype: int64

In [54]:
#Create X (features) and y (target), selected by column name so the split does not rely on
#'gender' being the last column. Train/test split is 80:20 with a fixed seed.
X = df_product.drop('gender', axis=1).values
y = df_product['gender'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
#Oversampling using SMOTE - Synthetic Minority Oversampling Technique.
#Applied to the training portion only, so the test set keeps its original class balance.
#random_state makes the synthetic samples reproducible between runs.
smt = SMOTE(random_state=1)
#fit_resample replaces fit_sample, which was deprecated in imbalanced-learn 0.4 and later removed.
X_train, y_train = smt.fit_resample(X_train, y_train)
print(X_train.shape)
print(y_train.shape)

(18714, 2592)
(18714,)


In [55]:
#Logistic regression: fit on the oversampled training data, evaluate on the untouched test split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

#Unpack the 2x2 confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
recall_pos = tp / (tp + fn)
recall_neg = tn / (tn + fp)
score = (recall_pos + recall_neg) / 2

print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7983254729893553
General accuracy: 0.8553333333333333
confusion matrix:
 [[2110  236]
 [ 198  456]]
classification report:
              precision    recall  f1-score   support

          0       0.91      0.90      0.91      2346
          1       0.66      0.70      0.68       654

avg / total       0.86      0.86      0.86      3000



In [56]:
#Neural network: multi-layer perceptron with three hidden layers (50, 40, 30 units), seeded.
#NOTE(review): per the scikit-learn docs, learning_rate only takes effect with solver='sgd';
#the solver is left at its default here - confirm the intent.
MLP_Classifier = MLPClassifier(hidden_layer_sizes=(50, 40, 30), learning_rate='invscaling', random_state=4)
MLP_Classifier.fit(X_train, y_train)
y_pred = MLP_Classifier.predict(X_test)

#Unpack the 2x2 confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
recall_pos = tp / (tp + fn)
recall_neg = tn / (tn + fp)
score = (recall_pos + recall_neg) / 2

print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7684170596838655
General accuracy: 0.8266666666666667
confusion matrix:
 [[2045  301]
 [ 219  435]]
classification report:
              precision    recall  f1-score   support

          0       0.90      0.87      0.89      2346
          1       0.59      0.67      0.63       654

avg / total       0.84      0.83      0.83      3000



In [57]:
#Linear Support Vector classifier.
#random_state is fixed so the result is reproducible between runs (the estimator was previously unseeded).
Linear_SVC = LinearSVC(random_state=1)
Linear_SVC.fit(X_train,y_train)
y_pred = Linear_SVC.predict(X_test)

#Compute the confusion matrix once and reuse it for both the score and the printout.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7515407838444512
General accuracy: 0.8416666666666667
confusion matrix:
 [[2138  208]
 [ 267  387]]
classification report:
              precision    recall  f1-score   support

          0       0.89      0.91      0.90      2346
          1       0.65      0.59      0.62       654

avg / total       0.84      0.84      0.84      3000



In [58]:
#Gradient Boosting Classifier.
#random_state is fixed so the result is reproducible between runs (the estimator was previously unseeded).
#Class imbalance is handled upstream by SMOTE on the training data; the old commented-out
#class_weight setting has been removed because GradientBoostingClassifier does not accept that parameter.
GBM = GradientBoostingClassifier(random_state=1)
GBM.fit(X_train,y_train)
y_pred = GBM.predict(X_test)

#Compute the confusion matrix once and reuse it for both the score and the printout.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7765003089388927
General accuracy: 0.8746666666666667
confusion matrix:
 [[2230  116]
 [ 260  394]]
classification report:
              precision    recall  f1-score   support

          0       0.90      0.95      0.92      2346
          1       0.77      0.60      0.68       654

avg / total       0.87      0.87      0.87      3000



In [59]:
#Random Forest Classifier: 100 trees, seeded.
Randomforest = RandomForestClassifier(n_estimators=100, random_state=134)
Randomforest.fit(X_train, y_train)
y_pred = Randomforest.predict(X_test)

#Unpack the 2x2 confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
recall_pos = tp / (tp + fn)
recall_neg = tn / (tn + fp)
score = (recall_pos + recall_neg) / 2

print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7853500394972508
General accuracy: 0.8833333333333333
confusion matrix:
 [[2250   96]
 [ 254  400]]
classification report:
              precision    recall  f1-score   support

          0       0.90      0.96      0.93      2346
          1       0.81      0.61      0.70       654

avg / total       0.88      0.88      0.88      3000



In [60]:
#Decision Tree Classifier.
#random_state is fixed so the result is reproducible between runs (the estimator was previously unseeded).
Decisiontree = DecisionTreeClassifier(random_state=1)
Decisiontree.fit(X_train,y_train)
y_pred = Decisiontree.predict(X_test)

#Compute the confusion matrix once and reuse it for both the score and the printout.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7413849065753146
General accuracy: 0.8223333333333334
confusion matrix:
 [[2076  270]
 [ 263  391]]
classification report:
              precision    recall  f1-score   support

          0       0.89      0.88      0.89      2346
          1       0.59      0.60      0.59       654

avg / total       0.82      0.82      0.82      3000



In [62]:
#Ensemble: voting classifier over Gradient Boost, Decision tree, Random forest,
#linear support vector classifier, Neural network and Logistic regression.
#The voting scheme is left at the library default; fitting runs on all available cores (n_jobs=-1).
base_models = [
    ('GBM', GBM),
    ('DT', Decisiontree),
    ('RF', Randomforest),
    ('SVC', Linear_SVC),
    ('NN', MLP_Classifier),
    ('LR', lr),
]
ensemble = VotingClassifier(estimators=base_models, n_jobs=-1)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

#Unpack the 2x2 confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
#Score = mean of the per-class recalls: recall of class 1 and recall of class 0.
recall_pos = tp / (tp + fn)
recall_neg = tn / (tn + fp)
score = (recall_pos + recall_neg) / 2

print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", cm)
print("classification report:\n", classification_report(y_test, y_pred))

Score of the algorithm(computed as given in question): 0.7977818969630134
General accuracy: 0.8846666666666667
confusion matrix:
 [[2233  113]
 [ 233  421]]
classification report:
              precision    recall  f1-score   support

          0       0.91      0.95      0.93      2346
          1       0.79      0.64      0.71       654

avg / total       0.88      0.88      0.88      3000



  if diff:
