# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Loading the data

In [2]:
df = pd.read_csv('ChurnData.csv')
df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.4,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.24,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.3,...,0.0,0.0,0.0,1.0,0.0,1.841,3.24,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.8,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.1,...,0.0,0.0,1.0,1.0,0.0,1.96,3.091,4.382,3.0,0.0


## Data selection and preprocessing

In [3]:
X = df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']]
X.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0


In [4]:
y = df['churn'].astype('int')
y.head()

0    1
1    1
2    0
3    0
4    0
Name: churn, dtype: int64

#### Normalizing the data

In [5]:
from sklearn import preprocessing

In [6]:
ss_X = preprocessing.StandardScaler()
ss_X.fit(X)
X = ss_X.transform(X)
X[0:5]

array([[-1.13518441, -0.62595491, -0.4588971 ,  0.4751423 ,  1.6961288 ,
        -0.58477841, -0.85972695],
       [-0.11604313, -0.62595491,  0.03454064, -0.32886061, -0.6433592 ,
        -1.14437497, -0.85972695],
       [-0.57928917, -0.85594447, -0.261522  , -0.35227817, -1.42318853,
        -0.92053635, -0.85972695],
       [ 0.11557989, -0.47262854, -0.65627219,  0.00679109, -0.6433592 ,
        -0.02518185,  1.16316   ],
       [-1.32048283, -0.47262854,  0.23191574,  0.03801451, -0.6433592 ,
         0.53441472, -0.85972695]])

### Train-test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Training the model

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
model = LogisticRegression()
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Lets build our model using __LogisticRegression__ from Scikit-learn package. This function implements logistic regression and can use different numerical optimizers to find parameters, including ‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’ solvers. You can find extensive information about the pros and cons of these optimizers if you search it in internet.

The version of Logistic Regression in Scikit-learn, support regularization. Regularization is a technique used to solve the overfitting problem in machine learning models.
__C__ parameter indicates __inverse of regularization strength__ which must be a positive float. Smaller values specify stronger regularization. 
Now lets fit our model with train set:

In [11]:
model.C = 0.1      # Changing the value of C to make it more regularized
model.solver = 'liblinear'
model

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
model.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Prediction

In [13]:
y_hat = model.predict(X_test)
y_hat[0:5]

array([1, 0, 1, 0, 0])

**predict_proba**  returns estimates for all classes, ordered by the label of classes. So, the first column is the probability of class 1, P(Y=1|X), and second column is probability of class 0, P(Y=0|X):

In [14]:
y_hat_proba = model.predict_proba(X_test)
y_hat_proba[0:5]

array([[0.39296215, 0.60703785],
       [0.7934059 , 0.2065941 ],
       [0.26493845, 0.73506155],
       [0.89675048, 0.10324952],
       [0.91738017, 0.08261983]])

## Evaluation

In [15]:
from sklearn import metrics

In [16]:
print('Accuracy of the Logistic Model using accuracy_score is %.9f' % metrics.accuracy_score(y_hat, y_test))

Accuracy of the Logistic Model using accuracy_score is 0.700000000


### Jaccard-Index Score

In [17]:
from sklearn import metrics

In [18]:
print('Accuracy of the Logistic Model using jaccard_similarity_score is %.9f' % metrics.jaccard_similarity_score(y_hat, y_test))

Accuracy of the Logistic Model using jaccard_similarity_score is 0.700000000


### Confusion Matrix

Confusion Matrix is way representing the result in a 2d-matrix. <br>
<table>
    <tr>
        <td></td>
        <td>Predicted No</td>
        <td>Predicted Yes</td>
    </tr>
    <tr>
        <td>Actual No</td>
        <td>a</td>
        <td>b</td>
    </tr>
    <tr>
        <td>Actual Yes</td>
        <td>c</td>
        <td>d</td>
    </tr>
</table>

In [19]:
from sklearn import metrics

In [20]:
print(metrics.confusion_matrix(y_test, y_hat))

[[22  7]
 [ 5  6]]


In [21]:
print(metrics.confusion_matrix(y_test, y_hat, labels=[1, 0]))  # Tweaking the positions of rows and columns

[[ 6  5]
 [ 7 22]]


## Log Loss

In [22]:
from sklearn import metrics

In [23]:
print('Accuracy of the Logistic Model using log_loss is %.9f' % metrics.log_loss(y_test, y_hat_proba))

Accuracy of the Logistic Model using log_loss is 0.490226788
