# Assignment for Data Science Intern

The goal here is to model an event which we deem to be an important stage in the customer experience/journey. It’s a rare event and thereby the modelling approaches are supposed to take that into account. 

You will share a modelling data set over which you are to train your model.
A testing data set, over which you are to show your performance output as shown below. 
Basically a precision-recall curve. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
df = pd.read_csv('Train_Set-Assignment.csv')
feats=df.columns[2:-1]
X = df[feats].values
y = df["Label"].values

In [3]:
df['Label'].value_counts()

0    113630
1      1076
Name: Label, dtype: int64

In [4]:
df.head(2)


Unnamed: 0,account_id,week_start_date,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature54,feature55,feature56,feature57,feature58,feature59,feature60,feature61,feature62,Label
0,100559.0,9/3/2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,101026.0,9/3/2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
del df['week_start_date']

In [6]:
y = df[['account_id', 'Label']] 

In [7]:
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.05)]
df.drop(to_drop, axis=1,inplace=True  )

In [8]:
df

Unnamed: 0,account_id,feature1,feature2,feature3,feature4,feature5,feature6,feature11,feature42
0,100559.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,101026.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,101026.0,0.0,0.0,86.666667,0.0,0.0,0.0,0.0,0.0
3,101026.0,0.0,0.0,-0.076046,0.0,0.0,0.0,0.0,0.0
4,101026.0,0.0,0.0,-0.987654,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
114701,237574.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114702,238407.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114703,238407.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114704,102119.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [9]:
df.isnull().sum()

account_id    0
feature1      0
feature2      0
feature3      0
feature4      0
feature5      0
feature6      0
feature11     0
feature42     0
dtype: int64

In [10]:
df.shape

(114706, 9)

In [11]:
new = df.account_id

In [12]:
new.drop_duplicates(inplace=True)

In [13]:
new

0         100559.0
1         101026.0
85        103037.0
168       101307.0
251       104266.0
            ...   
114700    187190.0
114701    237574.0
114702    238407.0
114704    102119.0
114705    101817.0
Name: account_id, Length: 2690, dtype: float64

In [14]:
from sklearn.model_selection import train_test_split

new_train, new_test = train_test_split(new, test_size=0.2)

In [15]:
new_train

65339     188729.0
103488    212352.0
82596     196209.0
110930    221932.0
4971      101126.0
            ...   
68237     101332.0
96810     170181.0
97784     170671.0
105616    214721.0
96042     205584.0
Name: account_id, Length: 2152, dtype: float64

In [16]:
new_train = new_train.to_frame()
new_test = new_test.to_frame()

In [17]:
new_train_list = new_train["account_id"].tolist()
new_test_list = new_test['account_id'].tolist()

In [18]:
df=df.reset_index()
del df["index"]

In [19]:
df_train = df[df['account_id'].isin(set(new_train_list))]
df_train

Unnamed: 0,account_id,feature1,feature2,feature3,feature4,feature5,feature6,feature11,feature42
0,100559.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,101026.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,101026.0,0.0,0.0,86.666667,0.0,0.0,0.0,0.0,0.0
3,101026.0,0.0,0.0,-0.076046,0.0,0.0,0.0,0.0,0.0
4,101026.0,0.0,0.0,-0.987654,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
114701,237574.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114702,238407.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114703,238407.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114704,102119.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [20]:
df_test = df[df['account_id'].isin(set(new_test_list))]
df_test

Unnamed: 0,account_id,feature1,feature2,feature3,feature4,feature5,feature6,feature11,feature42
251,104266.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
252,104266.0,0.0,0.0,73.666667,0.0,0.0,0.0,0.0,0.0
253,104266.0,0.0,0.0,-0.607143,0.0,0.0,0.0,0.0,0.0
254,104266.0,0.0,0.0,0.363636,0.0,0.0,0.0,0.0,0.0
255,104266.0,0.0,0.0,-0.108333,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
114678,185346.0,0.0,0.0,31.000000,0.0,0.0,0.0,0.0,0.0
114685,239689.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114690,100134.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
114691,100134.0,0.0,0.0,317.000000,0.0,0.0,0.0,0.0,0.0


In [21]:
y_test= y[y['account_id'].isin(set(new_test_list))]
y_train= y[y['account_id'].isin(set(new_train_list))]

In [22]:
del y_test["account_id"]
del y_train["account_id"]

In [23]:
len(df_test)

22905

In [31]:
len(y_test)

22905

In [25]:
#from sklearn.model_selection import train_test_split
#df_train, df_test, y_train, y_test = train_test_split(train.drop('Label',axis=1), train['Label'], test_size=0.20,random_state=101)

In [26]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(df_train.values, y_train.values)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
y_pred = logmodel.predict(df_test.values)

In [28]:
from sklearn import metrics

cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix

array([[22684,     0],
       [  221,     0]], dtype=int64)

In [29]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.9903514516481118
Precision: 0.0
Recall: 0.0


  'precision', 'predicted', average, warn_for)


In [30]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt

disp = plot_precision_recall_curve(classifier, df_test, y_pred)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

ImportError: cannot import name 'plot_precision_recall_curve' from 'sklearn.metrics' (C:\Users\Abhishek\Anaconda 3\conda--meta\lib\site-packages\sklearn\metrics\__init__.py)

##Not able to run last code.Tried updating scikit-learn but after after updating same 