#### Load the package we need

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm

#### Load the data

In [2]:
df = pd.read_csv('Logistic_regression.csv')

In [3]:
df.shape

(41188, 21)

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,Basic,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,Basic,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


#### Create dummy for category variable

In [5]:
df_dummy = pd.get_dummies(df)

In [6]:
df_dummy.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
       'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_Basic', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_no',
       'default_unknown', 'default_yes', 'housing_no', 'housing_unknown',
       'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes',
       'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',
      

In [7]:
X = df_dummy.drop('y',axis = 1)
y = df_dummy['y']

#### Seperate the train set and test set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

#### Drop one of dummy variables to avoid Collinear in logestic regression

In [9]:
X_train.drop(['default_unknown', 'education_unknown','marital_unknown',  'housing_unknown', 'job_unknown',
             'loan_unknown','contact_telephone','month_sep','day_of_week_wed','poutcome_nonexistent'],axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [10]:
len(X_train.columns)

51

#### Using P-value to select variale

In [11]:
logit_model = sm.Logit(y_train,X_train)
result = logit_model.fit()
p_value = result.pvalues

         Current function value: 0.208361
         Iterations: 35




In [12]:
X_train.drop(p_value[p_value>0.05].index.tolist(),axis = 1, inplace = True)

In [13]:
X_train.columns

Index(['duration', 'campaign', 'pdays', 'emp_var_rate', 'cons_price_idx',
       'euribor3m', 'nr_employed', 'default_no', 'contact_cellular',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_nov', 'day_of_week_mon', 'poutcome_failure'],
      dtype='object')

In [14]:
logit_model2 = sm.Logit(y_train,X_train)
result2 = logit_model2.fit()
p_value2 = result2.pvalues

Optimization terminated successfully.
         Current function value: 0.209455
         Iterations 8


In [15]:
p_value2[p_value2>0.05]

Series([], dtype: float64)

In [16]:
result2.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,28831.0
Model:,Logit,Df Residuals:,28813.0
Method:,MLE,Df Model:,17.0
Date:,"Thu, 10 Oct 2019",Pseudo R-squ.:,0.4037
Time:,16:03:07,Log-Likelihood:,-6038.8
converged:,True,LL-Null:,-10127.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
duration,0.0047,8.84e-05,52.785,0.000,0.004,0.005
campaign,-0.0476,0.014,-3.427,0.001,-0.075,-0.020
pdays,-0.0013,8.54e-05,-15.419,0.000,-0.001,-0.001
emp_var_rate,-1.1409,0.070,-16.195,0.000,-1.279,-1.003
cons_price_idx,0.7170,0.036,19.979,0.000,0.647,0.787
euribor3m,0.8774,0.086,10.157,0.000,0.708,1.047
nr_employed,-0.0143,0.001,-20.337,0.000,-0.016,-0.013
default_no,0.3689,0.078,4.748,0.000,0.217,0.521
contact_cellular,0.5937,0.079,7.526,0.000,0.439,0.748


#### Use SMOTE to deal with the imbalance

In [17]:
os = SMOTE(random_state=123)
os_data_X,os_data_y = os.fit_sample(X_train, y_train)

#### Model logestic regression

In [18]:
def sigmoid(scores):
    temp = np.exp(-scores)
    return 1.0 / (1.0 + temp)

In [19]:
def log_likelihood(features,target,weights):
    scores = np.dot(features,weights)
    res = np.sum(target * scores - np.log(1 + np.exp(scores)))
    return res

In [71]:
def logistic_regression(features, target, num_steps, learning_rate, add_intercept=False):
    if add_intercept: 
        intercept = np.ones((features.shape[0], 1))  
        features = np.hstack((intercept, features)) 
        
    weights = np.zeros(features.shape[1])
  
    for step in range(num_steps):  
        scores = np.dot(features, weights) 
        prediction = sigmoid(scores)
        error = target - prediction      
        gradient = np.dot(error,features)    
        weights += learning_rate * gradient  
        if step % 10000 == 0:           
            print(log_likelihood(features, target, weights))
    
    return weights

In [42]:
weight = logistic_regression(os_data_X,os_data_y,500000,0.00000000001,add_intercept = True)

-35255.73777829268
-28341.51279582199
-28275.641513657098
-28209.968301366564
-28144.49339354043
-28079.21701403969
-28014.139376209052
-27949.260683085253
-27884.58112760291
-27820.1008927964
-27755.82015199855
-27691.739069036455
-27627.857798423567
-27564.17648554909
-27500.695266863586
-27437.41427006209
-27374.33361426363
-27311.45341018818
-27248.773760330274
-27186.294759129833
-27124.016493139607
-27061.93904119027
-27000.062474552786
-26938.386857096753
-26876.91224544762
-26815.63868913948
-26754.56623076629
-26693.694906130164
-26633.02474438631
-26572.55576818597
-26512.287993816713
-26452.221431339774
-26392.35608472499
-26332.69195198339
-26273.22902529706
-26213.967291146546
-26154.906730435916
-26096.047318615387
-26037.389025801524
-25978.931816895205
-25920.67565169705
-25862.620485020885
-25804.766266804796
-25747.11294221975
-25689.66045177664
-25632.408731430358
-25575.35771268278
-25518.50732268228
-25461.85748432324
-25405.40811634092


In [44]:
weight

array([ 0.00065011,  0.00586317, -0.02049908, -0.0028753 , -0.06967079,
        0.04551461, -0.07167836, -0.00050997,  0.00669725,  0.00951992,
        0.00372119,  0.00017835,  0.00057321, -0.00256605,  0.00021986,
        0.00324966, -0.00061895, -0.00020907,  0.00248656])

#### Run the model on test set

In [50]:
X_test = X_test[X_train.columns]

In [51]:
intercept = np.ones((X_test.shape[0], 1))  
X_test = np.hstack((intercept, X_test)) 

In [52]:
X_test

array([[  1., 194.,   1., ...,   0.,   0.,   0.],
       [  1.,  81.,   1., ...,   0.,   1.,   0.],
       [  1., 123.,   2., ...,   0.,   1.,   0.],
       ...,
       [  1., 261.,   3., ...,   0.,   0.,   0.],
       [  1., 295.,   1., ...,   0.,   0.,   0.],
       [  1.,  92.,   1., ...,   0.,   0.,   0.]])

In [53]:
h = np.dot(X_test,weight)

In [54]:
p = sigmoid(h)

In [57]:
for i in range(len(p)):
    if p[i] >= 0.5:
        p[i] = 1
    else:
        p[i] = 0

In [60]:
wrong = 0
for i in range(len(p)):
    if p[i] != y[i]:
        wrong += 1

In [61]:
wrong / len(p)

0.3823743627093955