In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

pd.options.display.max_columns = None
pd.options.display.max_rows = None
number_of_data = 5000

In [2]:
data_preprocessed = pd.read_csv('sermil2020-preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,RELIGIAO,PAIS_NASCIMENTO,ESTADO_CIVIL,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,OUT_OF_YEAR,CONVOCADO,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
0,87.0,181.0,59.0,43.0,83.0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0
1,72.0,182.0,57.0,42.0,86.0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0
2,70.0,185.0,57.0,42.0,78.0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
3,85.0,182.0,60.0,41.0,100.0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
4,102.0,172.0,56.0,42.0,83.0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0


In [4]:
data_preprocessed['CONVOCADO'].value_counts()

0    1169044
1     134359
Name: CONVOCADO, dtype: int64

In [5]:
targets_0 = data_preprocessed.copy()
targets_0 = targets_0.loc[targets_0['CONVOCADO'] == 0].sample(frac=1)
targets_0 = targets_0[:number_of_data]
targets_0['CONVOCADO'].value_counts()

0    5000
Name: CONVOCADO, dtype: int64

In [6]:
targets_1 = data_preprocessed.copy()
targets_1 = targets_1.loc[targets_1['CONVOCADO'] == 1].sample(frac=1)
targets_1 = targets_1[:number_of_data]
targets_1['CONVOCADO'].value_counts()

1    5000
Name: CONVOCADO, dtype: int64

In [7]:
df = pd.concat([targets_0, targets_1])
df.sample(frac=1)
df.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,RELIGIAO,PAIS_NASCIMENTO,ESTADO_CIVIL,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,OUT_OF_YEAR,CONVOCADO,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
68216,30.0,120.0,41.0,30.0,50.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
428367,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
92906,68.0,170.0,58.0,40.0,78.0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
223144,87.0,185.0,56.0,43.0,78.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
45199,63.0,185.0,57.0,41.0,71.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [8]:
targets = df['CONVOCADO']
targets.shape[0]

10000

In [9]:
df = df.drop(['CONVOCADO'],axis=1)
df.to_csv(r'sermil2020_balanced.csv', index = True)

In [10]:
unscaled_inputs = df.copy()

In [11]:
columns_to_omit = [ 
    'OUT_OF_YEAR', 
    'RELI_0', 
    'RELI_1', 
    'RELI_2', 
    'RELI_3',
    'RELI_4', 
    'EDUC_0', 
    'EDUC_1', 
    'EDUC_2', 
    'EDUC_3',
    'JSM_UF_1', 
    'JSM_UF_2', 
    'JSM_UF_3', 
    'JSM_UF_4',
    'JSM_UF_5', 
    'JSM_UF_6'
]

In [12]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [13]:
scaler = StandardScaler()

In [14]:
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
unscaled_inputs.shape

(10000, 21)

In [16]:
unscaled_inputs.columns

Index(['PESO', 'ALTURA', 'CABECA', 'CALCADO', 'CINTURA', 'RELIGIAO',
       'PAIS_NASCIMENTO', 'ESTADO_CIVIL', 'ZONA_RESIDENCIAL',
       'PAIS_RESIDENCIA', 'OUT_OF_YEAR', 'EDUC_0', 'EDUC_1', 'EDUC_2',
       'EDUC_3', 'JSM_UF_1', 'JSM_UF_2', 'JSM_UF_3', 'JSM_UF_4', 'JSM_UF_5',
       'JSM_UF_6'],
      dtype='object')

In [17]:
scaled_inputs = scaler.transform(unscaled_inputs)
scaled_inputs.shape[0]

10000

In [18]:
train_test_split(scaled_inputs, targets)

[array([[ 1.01112556,  0.34992847, -3.17143493, ..., -0.46200105,
         -0.33518212, -0.09583095],
        [-2.98367602, -4.34549807, -4.04818118, ..., -0.46200105,
         -0.33518212, -0.09583095],
        [-0.9030502 ,  1.07230178,  0.04330133, ..., -0.46200105,
         -0.33518212, -0.09583095],
        ...,
        [ 1.01112556,  0.5305218 ,  0.62779883, ..., -0.46200105,
         -0.33518212, -0.09583095],
        [ 0.3453253 , -0.46274151,  1.21229633, ..., -0.46200105,
         -0.33518212, -0.09583095],
        [ 1.67692582,  1.07230178,  1.21229633, ..., -0.46200105,
         -0.33518212, -0.09583095]]),
 array([[-0.48692503,  1.07230178,  0.04330133, ..., -0.46200105,
         -0.33518212, -0.09583095],
        [-0.17732791, -0.31826685, -0.28693976, ..., -0.46200105,
          2.98345271, -0.09583095],
        [ 0.92790053,  1.97526843,  1.50454508, ...,  2.16449724,
         -0.33518212, -0.09583095],
        ...,
        [ 1.51047575,  1.34319178, -0.24894742, ...,  

In [19]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [20]:
print(x_train.shape, y_train.shape)

(8000, 21) (8000,)


In [21]:
print(x_test.shape, y_test.shape)

(2000, 21) (2000,)


In [22]:
reg = LogisticRegression()

In [23]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
reg.score(x_train, y_train)

0.77375

In [25]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, ..., 0, 0, 1])

In [26]:
model_outputs == y_train

1273459     True
355767      True
103830      True
190316     False
212038      True
1002859     True
1285431    False
302871      True
245057      True
591215      True
1298579     True
278259      True
957247      True
1150104     True
192197      True
327726      True
75418       True
866260     False
14843      False
373886      True
133154      True
1283207     True
212395      True
346310      True
185257      True
12211       True
284240      True
215646      True
235287     False
96755       True
1019798     True
1234871     True
491140     False
129336      True
569699      True
394916      True
1125093     True
161252      True
457380      True
1028751     True
903820      True
236058      True
953832      True
790576      True
215271     False
21198       True
955236     False
328097      True
126167      True
547092      True
474868      True
691889      True
738658      True
705709      True
199610      True
186240      True
174855      True
212856      True
868675      Tr

In [27]:
np.sum((model_outputs==y_train))

6190

In [28]:
model_outputs.shape[0]

8000

In [29]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.77375

In [30]:
reg.intercept_

array([-0.13234436])

In [31]:
reg.coef_

array([[-0.37793576,  1.09272542,  0.43173223,  0.54870686, -0.00856332,
        -0.04070565, -0.06540917, -0.00126739, -0.2672524 , -0.18899682,
         0.06744635, -0.29347135, -0.16095935,  0.12559741,  0.06917763,
         0.10856245, -0.05910861, -0.21917251,  0.12467867,  0.13783429,
         0.11671761]])

In [32]:
unscaled_inputs.columns.values

array(['PESO', 'ALTURA', 'CABECA', 'CALCADO', 'CINTURA', 'RELIGIAO',
       'PAIS_NASCIMENTO', 'ESTADO_CIVIL', 'ZONA_RESIDENCIAL',
       'PAIS_RESIDENCIA', 'OUT_OF_YEAR', 'EDUC_0', 'EDUC_1', 'EDUC_2',
       'EDUC_3', 'JSM_UF_1', 'JSM_UF_2', 'JSM_UF_3', 'JSM_UF_4',
       'JSM_UF_5', 'JSM_UF_6'], dtype=object)

In [33]:
feature_name = unscaled_inputs.columns.values

In [34]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,PESO,-0.377936
1,ALTURA,1.092725
2,CABECA,0.431732
3,CALCADO,0.548707
4,CINTURA,-0.008563
5,RELIGIAO,-0.040706
6,PAIS_NASCIMENTO,-0.065409
7,ESTADO_CIVIL,-0.001267
8,ZONA_RESIDENCIAL,-0.267252
9,PAIS_RESIDENCIA,-0.188997


In [35]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.132344
1,PESO,-0.377936
2,ALTURA,1.092725
3,CABECA,0.431732
4,CALCADO,0.548707
5,CINTURA,-0.008563
6,RELIGIAO,-0.040706
7,PAIS_NASCIMENTO,-0.065409
8,ESTADO_CIVIL,-0.001267
9,ZONA_RESIDENCIAL,-0.267252


In [36]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [37]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.132344,0.876039
1,PESO,-0.377936,0.685275
2,ALTURA,1.092725,2.982391
3,CABECA,0.431732,1.539923
4,CALCADO,0.548707,1.731013
5,CINTURA,-0.008563,0.991473
6,RELIGIAO,-0.040706,0.960112
7,PAIS_NASCIMENTO,-0.065409,0.936684
8,ESTADO_CIVIL,-0.001267,0.998733
9,ZONA_RESIDENCIAL,-0.267252,0.76548


In [38]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
2,ALTURA,1.092725,2.982391
4,CALCADO,0.548707,1.731013
3,CABECA,0.431732,1.539923
20,JSM_UF_5,0.137834,1.147785
14,EDUC_2,0.125597,1.133826
19,JSM_UF_4,0.124679,1.132784
21,JSM_UF_6,0.116718,1.123802
16,JSM_UF_1,0.108562,1.114675
15,EDUC_3,0.069178,1.071627
11,OUT_OF_YEAR,0.067446,1.069773


In [39]:
reg.score(x_test, y_test)

0.7725

In [40]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.48732417, 0.51267583],
       [0.06283144, 0.93716856],
       [0.5350624 , 0.4649376 ],
       ...,
       [0.42602981, 0.57397019],
       [0.71452198, 0.28547802],
       [0.5350624 , 0.4649376 ]])

In [41]:
predicted_proba.shape

(2000, 2)

In [42]:
predicted_proba[:,1]

array([0.51267583, 0.93716856, 0.4649376 , ..., 0.57397019, 0.28547802,
       0.4649376 ])

## Save the model

In [43]:
import pickle

In [44]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [45]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)