In [1]:
#import packages
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
#Load data from trainingData.csv and trainingLabels.csv
colnames_train = ['session_id', 'start_time', 'end_time', 'product_list']
colnames_test = ['label']
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')
df = pd.read_csv("./data/trainingData.csv", names=colnames_train, parse_dates=['start_time', 'end_time'], header=None)
df_test = pd.read_csv("./data/trainingLabels.csv", names = colnames_test)

In [3]:
#Feature_Engineering
df['session_duration'] = (df['end_time'] - df['start_time']).dt.total_seconds()
df['product_count'] = df['product_list'].str.count('A')
df['Average_duration_per_product'] = df['session_duration'] / df['product_count']
df['day_of_week'] = df['end_time'].dt.dayofweek
df['date'] = df['end_time'].dt.day
df['month'] = df['end_time'].dt.month
df['start_hour'] = df['start_time'].dt.hour
df['end_hour'] = df['end_time'].dt.hour

#Scaling the product_count variable
sc = StandardScaler()
prd_cnt = np.array(df['product_count'])
prd_cnt = prd_cnt.reshape(-1, 1)
df['product_count'] = sc.fit_transform(prd_cnt)
df_dummy = df.iloc[:,7:12]



In [4]:
#dummy encoding the categorical data such as day_of_week, date, month, start_hour, end_hour
dummy = pd.get_dummies(df_dummy, columns=['day_of_week', 'date', 'month', 'start_hour', 'end_hour'])
df = pd.concat([df, dummy], axis = 1)

In [5]:
#preparing the data for identifying the number of nodes at each level/hierarchy
df['product_list'] = df.product_list.apply(lambda x: x.split('/'))
df['product_list'] = df.product_list.apply(' '.join).str.replace('[^A-Za-z0-9,\s]+', '').str.split(expand=False)
df['new'] = df['product_list'].map(set)
df['new'] = df['new'].apply(lambda x: ','.join(map(str, x)))

In [6]:
#calculating number of nodes at each level
df['A_unique_count'] = df['new'].str.count('A')
df['B_unique_count'] = df['new'].str.count('B')
df['C_unique_count'] = df['new'].str.count('C')
df['D_unique_count'] = df['new'].str.count('D')

In [7]:
#dropping the varibles which have been dummy-encoded
df.drop(['day_of_week', 'date', 'month', 'start_hour', 'end_hour', 'new'], inplace=True, axis=1)

In [8]:
#index of columns
df.columns

Index(['session_id', 'start_time', 'end_time', 'product_list',
       'session_duration', 'product_count', 'Average_duration_per_product',
       'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'date_1', 'date_2',
       'date_3', 'date_5', 'date_6', 'date_7', 'date_8', 'date_9', 'date_12',
       'date_13', 'date_14', 'date_15', 'date_16', 'date_17', 'date_18',
       'date_19', 'date_20', 'date_21', 'date_22', 'date_23', 'date_25',
       'date_26', 'date_27', 'date_28', 'date_29', 'date_30', 'month_11',
       'month_12', 'start_hour_0', 'start_hour_1', 'start_hour_2',
       'start_hour_3', 'start_hour_4', 'start_hour_5', 'start_hour_6',
       'start_hour_7', 'start_hour_8', 'start_hour_9', 'start_hour_10',
       'start_hour_11', 'start_hour_12', 'start_hour_13', 'start_hour_14',
       'start_hour_15', 'start_hour_16', 'start_hour_17', 'start_hour_18',
       'start_hour_19', 'start_hour_20', 'start_ho

In [9]:
#Dummy encoding products/hierarchical nodes using multilabel Binarizer
mlb = MultiLabelBinarizer()
df_product = pd.DataFrame(mlb.fit_transform(df['product_list']),columns=mlb.classes_, index=df.index)

In [10]:
#Dropping the products/hierarchical nodes that appear less than 3 times
df_product.drop([col for col, val in df_product.sum().iteritems() if val < 3], axis=1, inplace=True)

In [11]:
#Joining the features extracted with the dummy encoded product/hierarchical data. Dropped redundant and unwanted data for model construction.
df_product = pd.concat([df_product, df], axis = 1)
df_product.drop(['product_list', 'session_id', 'start_time', 'end_time'], inplace=True, axis=1)

In [12]:
#adding the target variable "gender"
df_product['gender'] = df_test['label']

In [13]:
#Converting female and male class to 0 and 1 respectively
labelencoder_y = LabelEncoder()
df_product['gender'] = labelencoder_y.fit_transform(df_product['gender'])

In [14]:
#Target varible - class proportion
df_product.groupby(['gender'])['day_of_week_1'].count()

gender
0    11703
1     3297
Name: day_of_week_1, dtype: int64

In [15]:
#Under-Sampling
df_female = len(df_product[df_product['gender'] == 1])
female_indices = df_product[df_product['gender'] == 0].index
random_indices = np.random.choice(female_indices, df_female, replace=False)
male_indices = df_product[df_product['gender'] == 1].index
under_sample_indices = np.concatenate([male_indices, random_indices])
under_sample = df_product.loc[under_sample_indices]

In [16]:
#Creating X - feature and y - target
X_under = under_sample.loc[:,under_sample.columns != 'gender'].values
y_under = under_sample.loc[:,under_sample.columns == 'gender'].values

In [17]:
#Train-test split - 80:20
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(X_under,y_under,test_size = 0.2, random_state = 0)
print(X_under_train.shape)
print(y_under_train.shape)
print(X_under_test.shape)
print(y_under_test.shape)

(5275, 2592)
(5275, 1)
(1319, 2592)
(1319, 1)


In [22]:
#Logistic Regression model
lr_under = LogisticRegression()
lr_under.fit(X_under_train,y_under_train)
y_under_pred = lr_under.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

Score of the algorithm(computed as given in question): 0.7765067002792359
General accuracy: 0.7793783169067475
confusion matrix:
 [[617  56]
 [235 411]]
classification report:
              precision    recall  f1-score   support

          0       0.72      0.92      0.81       673
          1       0.88      0.64      0.74       646

avg / total       0.80      0.78      0.77      1319



In [23]:
#Neural Network model
MLP_Classifier = MLPClassifier(random_state=4)
MLP_Classifier.fit(X_under_train,y_under_train)
y_under_pred = MLP_Classifier.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

Score of the algorithm(computed as given in question): 0.7681790789358678
General accuracy: 0.7710386656557998
confusion matrix:
 [[611  62]
 [240 406]]
classification report:
              precision    recall  f1-score   support

          0       0.72      0.91      0.80       673
          1       0.87      0.63      0.73       646

avg / total       0.79      0.77      0.77      1319



In [24]:
#Linear support vector classifier
Linear_SVC_under = LinearSVC(random_state= 134)
Linear_SVC_under.fit(X_under_train,y_under_train)
y_under_pred = Linear_SVC_under.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

Score of the algorithm(computed as given in question): 0.7123583694837128
General accuracy: 0.7164518574677786
confusion matrix:
 [[614  59]
 [315 331]]
classification report:
              precision    recall  f1-score   support

          0       0.66      0.91      0.77       673
          1       0.85      0.51      0.64       646

avg / total       0.75      0.72      0.70      1319



In [25]:
#Gradient boost classifier
GBM = GradientBoostingClassifier()
GBM.fit(X_under_train,y_under_train)
y_under_pred = GBM.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

Score of the algorithm(computed as given in question): 0.7789195368457855
General accuracy: 0.7824109173616376
confusion matrix:
 [[639  34]
 [253 393]]
classification report:
              precision    recall  f1-score   support

          0       0.72      0.95      0.82       673
          1       0.92      0.61      0.73       646

avg / total       0.82      0.78      0.78      1319



In [26]:
#Random forest classifier
Randomforest = RandomForestClassifier(class_weight= {0:0.22, 1:0.78}, random_state= 134)
Randomforest.fit(X_under_train,y_under_train)
y_under_pred = Randomforest.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

  This is separate from the ipykernel package so we can avoid doing imports until


Score of the algorithm(computed as given in question): 0.776633207439541
General accuracy: 0.778620166793025
confusion matrix:
 [[588  85]
 [207 439]]
classification report:
              precision    recall  f1-score   support

          0       0.74      0.87      0.80       673
          1       0.84      0.68      0.75       646

avg / total       0.79      0.78      0.78      1319



In [27]:
#Decision tree classifier
Decisiontree = DecisionTreeClassifier(class_weight= {0:0.22, 1:0.78})
Decisiontree.fit(X_under_train,y_under_train)
y_under_pred = Decisiontree.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

Score of the algorithm(computed as given in question): 0.7222788309818335
General accuracy: 0.7217589082638363
confusion matrix:
 [[469 204]
 [163 483]]
classification report:
              precision    recall  f1-score   support

          0       0.74      0.70      0.72       673
          1       0.70      0.75      0.72       646

avg / total       0.72      0.72      0.72      1319



In [30]:
#Ensemble using Voting classifier of Decision tree, Random forest, linear support vector classifier, Neural network, Gradient Boost, Logistic regression
ensemble = VotingClassifier(estimators = [('DT',Decisiontree), ('RF', Randomforest), ('SVC', Linear_SVC_under), ('GBM', GBM), ('Logistic_regression', lr_under), ('NN', MLP_Classifier)])
ensemble.fit(X_under_train,y_under_train)
y_under_pred = ensemble.predict(X_under_test)
tn, fp, fn, tp = confusion_matrix(y_under_test,y_under_pred).ravel()
score = ((tp/ (tp+fn)) + (tn/ (tn+fp)))/2
print("Score of the algorithm(computed as given in question):", score)
print("General accuracy:", accuracy_score(y_under_test, y_under_pred))
print("confusion matrix:\n", confusion_matrix(y_under_test, y_under_pred))
print("classification report:\n", classification_report(y_under_test, y_under_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Score of the algorithm(computed as given in question): 0.7791990026635507
General accuracy: 0.7824109173616376
confusion matrix:
 [[630  43]
 [244 402]]
classification report:
              precision    recall  f1-score   support

          0       0.72      0.94      0.81       673
          1       0.90      0.62      0.74       646

avg / total       0.81      0.78      0.78      1319



  if diff:
