In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

pd.options.display.max_columns = None
pd.options.display.max_rows = None
number_of_data = 5000

In [2]:
data_preprocessed = pd.read_csv('sermil2020-preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,RELIGIAO,PAIS_NASCIMENTO,ESTADO_CIVIL,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,OUT_OF_YEAR,CONVOCADO,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
0,87.0,181.0,59.0,43.0,83.0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0
1,72.0,182.0,57.0,42.0,86.0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0
2,70.0,185.0,57.0,42.0,78.0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
3,85.0,182.0,60.0,41.0,100.0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
4,102.0,172.0,56.0,42.0,83.0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0


In [4]:
data_preprocessed['CONVOCADO'].value_counts()

0    1169044
1     134359
Name: CONVOCADO, dtype: int64

In [5]:
targets_0 = data_preprocessed.copy()
targets_0 = targets_0.loc[targets_0['CONVOCADO'] == 0].sample(frac=1)
targets_0 = targets_0[:number_of_data]
targets_0['CONVOCADO'].value_counts()

0    5000
Name: CONVOCADO, dtype: int64

In [6]:
targets_1 = data_preprocessed.copy()
targets_1 = targets_1.loc[targets_1['CONVOCADO'] == 1].sample(frac=1)
targets_1 = targets_1[:number_of_data]
targets_1['CONVOCADO'].value_counts()

1    5000
Name: CONVOCADO, dtype: int64

In [7]:
df = pd.concat([targets_0, targets_1])
df.sample(frac=1)
df.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,RELIGIAO,PAIS_NASCIMENTO,ESTADO_CIVIL,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,OUT_OF_YEAR,CONVOCADO,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
478356,63.72,164.6,53.87,39.05,75.49,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0
509724,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
454256,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0
198635,66.0,170.0,57.0,44.0,88.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
845444,63.72,164.6,53.87,39.05,75.49,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0


In [8]:
targets = df['CONVOCADO']
targets.shape[0]

10000

In [9]:
df = df.drop(['CONVOCADO'],axis=1)
df.shape[0]

10000

In [10]:
unscaled_inputs = df.copy()

In [11]:
columns_to_omit = [ 
    'OUT_OF_YEAR', 
    'RELI_0', 
    'RELI_1', 
    'RELI_2', 
    'RELI_3',
    'RELI_4', 
    'EDUC_0', 
    'EDUC_1', 
    'EDUC_2', 
    'EDUC_3',
    'JSM_UF_1', 
    'JSM_UF_2', 
    'JSM_UF_3', 
    'JSM_UF_4',
    'JSM_UF_5', 
    'JSM_UF_6'
]

In [12]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [13]:
scaler = StandardScaler()

In [14]:
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
unscaled_inputs.shape

(10000, 21)

In [16]:
unscaled_inputs.columns

Index(['PESO', 'ALTURA', 'CABECA', 'CALCADO', 'CINTURA', 'RELIGIAO',
       'PAIS_NASCIMENTO', 'ESTADO_CIVIL', 'ZONA_RESIDENCIAL',
       'PAIS_RESIDENCIA', 'OUT_OF_YEAR', 'EDUC_0', 'EDUC_1', 'EDUC_2',
       'EDUC_3', 'JSM_UF_1', 'JSM_UF_2', 'JSM_UF_3', 'JSM_UF_4', 'JSM_UF_5',
       'JSM_UF_6'],
      dtype='object')

In [17]:
scaled_inputs = scaler.transform(unscaled_inputs)
scaled_inputs.shape[0]

10000

In [18]:
train_test_split(scaled_inputs, targets)

[array([[-0.47452393,  1.06118994, -3.99657566, ...,  2.24144962,
         -0.32849731, -0.09583095],
        [-0.31620955, -0.10275091,  0.31397815, ..., -0.44613985,
         -0.32849731, -0.09583095],
        [-0.18005918, -0.31763229, -0.29812049, ..., -0.44613985,
          3.04416498, -0.09583095],
        ...,
        [-2.84923964, -4.31084472, -3.99657566, ..., -0.44613985,
         -0.32849731, -0.09583095],
        [-0.18005918, -0.31763229, -0.29812049, ..., -0.44613985,
          3.04416498, -0.09583095],
        [-0.31620955, -0.10275091,  0.31397815, ..., -0.44613985,
         -0.32849731, -0.09583095]]),
 array([[-0.18005918, -0.31763229, -0.29812049, ..., -0.44613985,
         -0.32849731, -0.09583095],
        [ 0.9503055 ,  0.7925882 ,  0.0266079 , ...,  2.24144962,
         -0.32849731, -0.09583095],
        [-0.18005918, -0.31763229, -0.29812049, ..., -0.44613985,
         -0.32849731, -0.09583095],
        ...,
        [-0.7119955 ,  0.16585083,  0.88871866, ..., -

In [19]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [20]:
print(x_train.shape, y_train.shape)

(8000, 21) (8000,)


In [21]:
print(x_test.shape, y_test.shape)

(2000, 21) (2000,)


In [22]:
reg = LogisticRegression()

In [23]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
reg.score(x_train, y_train)

0.77225

In [25]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, ..., 1, 1, 0])

In [26]:
model_outputs == y_train

258388      True
281871      True
1152665     True
85797       True
1298673    False
486645      True
124587      True
53265       True
45567       True
528135      True
785131      True
71824       True
144439     False
650547      True
133925      True
10992       True
249033     False
258954      True
627291      True
118686      True
284602      True
832415      True
210036      True
412928      True
218750      True
45955      False
1247539    False
329144      True
9120       False
70097      False
651917     False
453031      True
393076      True
163365      True
34005      False
92571       True
494701      True
624075     False
643551      True
113185      True
1156837     True
457590     False
1065710     True
966251      True
746660      True
214373      True
1192791     True
287528     False
51835       True
899794      True
1227118     True
1278287     True
398056     False
857073      True
358197      True
570421      True
268093      True
145048      True
148070     Fal

In [27]:
np.sum((model_outputs==y_train))

6178

In [28]:
model_outputs.shape[0]

8000

In [29]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.77225

In [30]:
reg.intercept_

array([-0.14423743])

In [31]:
reg.coef_

array([[-0.19680886,  1.19546945,  0.53316821,  0.39460767, -0.05134435,
        -0.02158924,  0.0186168 ,  0.00169407, -0.26808888, -0.20257391,
         0.06686331, -0.10569882, -0.11962748,  0.10168884,  0.01388916,
         0.11050786, -0.02134649, -0.23878939,  0.14721025,  0.07406776,
         0.16731041]])

In [32]:
unscaled_inputs.columns.values

array(['PESO', 'ALTURA', 'CABECA', 'CALCADO', 'CINTURA', 'RELIGIAO',
       'PAIS_NASCIMENTO', 'ESTADO_CIVIL', 'ZONA_RESIDENCIAL',
       'PAIS_RESIDENCIA', 'OUT_OF_YEAR', 'EDUC_0', 'EDUC_1', 'EDUC_2',
       'EDUC_3', 'JSM_UF_1', 'JSM_UF_2', 'JSM_UF_3', 'JSM_UF_4',
       'JSM_UF_5', 'JSM_UF_6'], dtype=object)

In [33]:
feature_name = unscaled_inputs.columns.values

In [34]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,PESO,-0.196809
1,ALTURA,1.195469
2,CABECA,0.533168
3,CALCADO,0.394608
4,CINTURA,-0.051344
5,RELIGIAO,-0.021589
6,PAIS_NASCIMENTO,0.018617
7,ESTADO_CIVIL,0.001694
8,ZONA_RESIDENCIAL,-0.268089
9,PAIS_RESIDENCIA,-0.202574


In [35]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.144237
1,PESO,-0.196809
2,ALTURA,1.195469
3,CABECA,0.533168
4,CALCADO,0.394608
5,CINTURA,-0.051344
6,RELIGIAO,-0.021589
7,PAIS_NASCIMENTO,0.018617
8,ESTADO_CIVIL,0.001694
9,ZONA_RESIDENCIAL,-0.268089


In [36]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [37]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.144237,0.865682
1,PESO,-0.196809,0.821348
2,ALTURA,1.195469,3.305109
3,CABECA,0.533168,1.704323
4,CALCADO,0.394608,1.483802
5,CINTURA,-0.051344,0.949952
6,RELIGIAO,-0.021589,0.978642
7,PAIS_NASCIMENTO,0.018617,1.018791
8,ESTADO_CIVIL,0.001694,1.001696
9,ZONA_RESIDENCIAL,-0.268089,0.76484


In [38]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
2,ALTURA,1.195469,3.305109
3,CABECA,0.533168,1.704323
4,CALCADO,0.394608,1.483802
21,JSM_UF_6,0.16731,1.182121
19,JSM_UF_4,0.14721,1.158598
16,JSM_UF_1,0.110508,1.116845
14,EDUC_2,0.101689,1.107039
20,JSM_UF_5,0.074068,1.07688
11,OUT_OF_YEAR,0.066863,1.069149
7,PAIS_NASCIMENTO,0.018617,1.018791


In [39]:
reg.score(x_test, y_test)

0.7715

In [40]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.21910007, 0.78089993],
       [0.64352669, 0.35647331],
       [0.29328943, 0.70671057],
       ...,
       [0.06863295, 0.93136705],
       [0.65851687, 0.34148313],
       [0.83969297, 0.16030703]])

In [41]:
predicted_proba.shape

(2000, 2)

In [42]:
predicted_proba[:,1]

array([0.78089993, 0.35647331, 0.70671057, ..., 0.93136705, 0.34148313,
       0.16030703])

## Save the model

In [43]:
import pickle

In [44]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [45]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)