# Classification-LogisticRegression

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, accuracy_score

#import itertools

In [3]:
df = pd.read_csv('ChurnData.csv')
df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.4,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.24,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.3,...,0.0,0.0,0.0,1.0,0.0,1.841,3.24,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.8,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.1,...,0.0,0.0,1.0,1.0,0.0,1.96,3.091,4.382,3.0,0.0


In [4]:
df.columns

Index(['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip',
       'callcard', 'wireless', 'longmon', 'tollmon', 'equipmon', 'cardmon',
       'wiremon', 'longten', 'tollten', 'cardten', 'voice', 'pager',
       'internet', 'callwait', 'confer', 'ebill', 'loglong', 'logtoll',
       'lninc', 'custcat', 'churn'],
      dtype='object')

In [5]:
churn_df = df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip',
       'callcard', 'wireless','churn']].copy()
churn_df['churn'] = churn_df['churn'].astype('int')
churn_df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,1
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,0


In [6]:
X = churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip',
       'callcard', 'wireless']].values
X[0:5]

array([[ 11.,  33.,   7., 136.,   5.,   5.,   0.,   1.,   1.],
       [ 33.,  33.,  12.,  33.,   2.,   0.,   0.,   0.,   0.],
       [ 23.,  30.,   9.,  30.,   1.,   2.,   0.,   0.,   0.],
       [ 38.,  35.,   5.,  76.,   2.,  10.,   1.,   1.,   1.],
       [  7.,  35.,  14.,  80.,   2.,  15.,   0.,   1.,   0.]])

In [7]:
y = churn_df['churn'].values

In [8]:
y[0:5]

array([1, 1, 0, 0, 0])

In [9]:
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-1.13518441, -0.62595491, -0.4588971 ,  0.4751423 ,  1.6961288 ,
        -0.58477841, -0.85972695,  0.64686916,  1.56469673],
       [-0.11604313, -0.62595491,  0.03454064, -0.32886061, -0.6433592 ,
        -1.14437497, -0.85972695, -1.54590766, -0.63910148],
       [-0.57928917, -0.85594447, -0.261522  , -0.35227817, -1.42318853,
        -0.92053635, -0.85972695, -1.54590766, -0.63910148],
       [ 0.11557989, -0.47262854, -0.65627219,  0.00679109, -0.6433592 ,
        -0.02518185,  1.16316   ,  0.64686916,  1.56469673],
       [-1.32048283, -0.47262854,  0.23191574,  0.03801451, -0.6433592 ,
         0.53441472, -0.85972695,  0.64686916, -0.63910148]])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [11]:
print('Training size:',X_train.shape, y_train.shape)
print('Testing size:',X_test.shape, y_test.shape)

Training size: (140, 9) (140,)
Testing size: (60, 9) (60,)


In [12]:
LR = LogisticRegression(C=0.05, solver='saga').fit(X_train,y_train)

In [13]:
LR

In [14]:
yhat = LR.predict(X_test)
yhat[0:5]

array([0, 1, 0, 0, 0])

In [15]:
yhat_prob = LR.predict_proba(X_test)
yhat_prob[0:5]

array([[0.57002938, 0.42997062],
       [0.2655132 , 0.7344868 ],
       [0.8079451 , 0.1920549 ],
       [0.78619444, 0.21380556],
       [0.70006817, 0.29993183]])

In [16]:
jaccard_score(y_test,yhat,pos_label=0)

0.7358490566037735

In [17]:
accuracy_score(y_test,yhat)

0.7666666666666667

In [18]:
print(confusion_matrix(y_test,yhat, labels=[1,0]))

[[ 7  8]
 [ 6 39]]


In [19]:
report = classification_report(y_test,yhat)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85        45
           1       0.54      0.47      0.50        15

    accuracy                           0.77        60
   macro avg       0.68      0.67      0.67        60
weighted avg       0.76      0.77      0.76        60



In [40]:
coefficients = LR.coef_[0]

In [42]:
feature_importance = pd.DataFrame({'Feature':churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip',
       'callcard', 'wireless']].columns,'Coefficients':coefficients})

In [44]:
feature_importance = feature_importance.sort_values('Coefficients',ascending=False)

In [46]:
print(feature_importance)

    Feature  Coefficients
4        ed      0.309667
6     equip      0.211616
8  wireless      0.169261
3    income     -0.022688
2   address     -0.102694
1       age     -0.250618
7  callcard     -0.277887
5    employ     -0.284218
0    tenure     -0.287230
