In [7]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

pd.options.display.max_columns = None
pd.options.display.max_rows = None
number_of_data = 5000

In [8]:
data_preprocessed = pd.read_csv('sermil2020-preprocessed.csv')

In [9]:
data_preprocessed.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,RELIGIAO,PAIS_NASCIMENTO,ESTADO_CIVIL,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,OUT_OF_YEAR,CONVOCADO,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
0,87.0,181.0,59.0,43.0,83.0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0
1,72.0,182.0,57.0,42.0,86.0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0
2,70.0,185.0,57.0,42.0,78.0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
3,85.0,182.0,60.0,41.0,100.0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
4,102.0,172.0,56.0,42.0,83.0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0


In [10]:
data_preprocessed['CONVOCADO'].value_counts()

0    1169044
1     134359
Name: CONVOCADO, dtype: int64

In [12]:
targets_0 = data_preprocessed.copy()
targets_0 = targets_0.loc[targets_0['CONVOCADO'] == 0].sample(frac=1)
targets_0 = targets_0[:number_of_data]
targets_0['CONVOCADO'].value_counts()

0    5000
Name: CONVOCADO, dtype: int64

In [13]:
targets_1 = data_preprocessed.copy()
targets_1 = targets_1.loc[targets_1['CONVOCADO'] == 1].sample(frac=1)
targets_1 = targets_1[:number_of_data]
targets_1['CONVOCADO'].value_counts()

1    5000
Name: CONVOCADO, dtype: int64

In [14]:
df = pd.concat([targets_0, targets_1])
df.sample(frac=1)
df.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,RELIGIAO,PAIS_NASCIMENTO,ESTADO_CIVIL,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,OUT_OF_YEAR,CONVOCADO,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
587118,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
256492,62.0,170.0,56.0,40.0,80.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
1164156,63.72,164.6,53.87,39.05,75.49,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0
608388,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
1270147,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0


In [15]:
targets = df['CONVOCADO']
targets.shape[0]

10000

In [16]:
df = df.drop(['CONVOCADO'],axis=1)
df.shape[0]

10000

In [17]:
unscaled_inputs = df.copy()

In [20]:
columns_to_omit = [ 
    'OUT_OF_YEAR', 
    'RELI_0', 
    'RELI_1', 
    'RELI_2', 
    'RELI_3',
    'RELI_4', 
    'EDUC_0', 
    'EDUC_1', 
    'EDUC_2', 
    'EDUC_3',
    'JSM_UF_1', 
    'JSM_UF_2', 
    'JSM_UF_3', 
    'JSM_UF_4',
    'JSM_UF_5', 
    'JSM_UF_6'
]

In [21]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [23]:
scaler = StandardScaler()

In [24]:
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [25]:
scaled_inputs = scaler.transform(unscaled_inputs)
scaled_inputs.shape[0]

10000

In [26]:
train_test_split(scaled_inputs, targets)

[array([[-0.1777872 , -0.32802784, -0.2975449 , ...,  2.24225947,
         -0.34233552, -0.09476245],
        [-0.1777872 , -0.32802784, -0.2975449 , ..., -0.44597872,
         -0.34233552, -0.09476245],
        [-0.1777872 , -0.32802784, -0.2975449 , ..., -0.44597872,
         -0.34233552, -0.09476245],
        ...,
        [ 0.75460074,  0.80071082,  0.89756095, ..., -0.44597872,
          2.92111083, -0.09476245],
        [-0.15464282,  0.89173813, -0.25992655, ..., -0.44597872,
         -0.34233552, -0.09476245],
        [-0.1777872 , -0.32802784, -0.2975449 , ..., -0.44597872,
         -0.34233552, -0.09476245]]),
 array([[-0.81591087, -0.29161691,  0.02944532, ...,  2.24225947,
         -0.34233552, -0.09476245],
        [ 0.01067419,  0.16351964,  0.60818908, ..., -0.44597872,
         -0.34233552, -0.09476245],
        [-0.15464282,  0.16351964,  0.89756095, ..., -0.44597872,
         -0.34233552, -0.09476245],
        ...,
        [-0.1777872 , -0.32802784, -0.2975449 , ..., -

In [27]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [28]:
print(x_train.shape, y_train.shape)

(8000, 21) (8000,)


In [29]:
print(x_test.shape, y_test.shape)

(2000, 21) (2000,)


In [30]:
reg = LogisticRegression()

In [31]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
reg.score(x_train, y_train)

0.771875

In [33]:
model_outputs = reg.predict(x_train)
model_outputs

array([1, 1, 0, ..., 0, 1, 0])

In [34]:
model_outputs == y_train

258582     False
182619      True
20243       True
153416     False
84261       True
1022889     True
324232      True
183307      True
155455      True
624258      True
876284      True
175120      True
871818     False
1029463     True
35906       True
144712      True
277630      True
1173864    False
887694      True
167228      True
410761      True
115626      True
1090878    False
7805        True
215126     False
144277      True
61963       True
267203      True
645988      True
232514      True
832028      True
171036      True
42462       True
377877      True
81196      False
297491      True
618540      True
857826     False
1052909     True
1078192     True
1233248     True
300524      True
1196876     True
237773      True
1137942     True
315760      True
217335     False
309060     False
159352      True
930395      True
871184      True
924642      True
1254975     True
487245      True
259042     False
1045407     True
111227      True
259323      True
360680      Tr

In [35]:
np.sum((model_outputs==y_train))

6175

In [36]:
model_outputs.shape[0]

8000

In [37]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.771875

In [38]:
reg.intercept_

array([-0.14364115])

In [39]:
reg.coef_

array([[-0.35661483,  1.24915794,  0.51581726,  0.23839626,  0.03750879,
        -0.01397395, -0.01206239,  0.02216836, -0.2403619 , -0.15577595,
         0.07251869, -0.36565563, -0.20127906,  0.13729724,  0.12252829,
         0.09233727, -0.06339424, -0.21400021,  0.17730064,  0.08583067,
         0.11534009]])

In [40]:
unscaled_inputs.columns.values

array(['PESO', 'ALTURA', 'CABECA', 'CALCADO', 'CINTURA', 'RELIGIAO',
       'PAIS_NASCIMENTO', 'ESTADO_CIVIL', 'ZONA_RESIDENCIAL',
       'PAIS_RESIDENCIA', 'OUT_OF_YEAR', 'EDUC_0', 'EDUC_1', 'EDUC_2',
       'EDUC_3', 'JSM_UF_1', 'JSM_UF_2', 'JSM_UF_3', 'JSM_UF_4',
       'JSM_UF_5', 'JSM_UF_6'], dtype=object)

In [41]:
feature_name = unscaled_inputs.columns.values

In [42]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,PESO,-0.356615
1,ALTURA,1.249158
2,CABECA,0.515817
3,CALCADO,0.238396
4,CINTURA,0.037509
5,RELIGIAO,-0.013974
6,PAIS_NASCIMENTO,-0.012062
7,ESTADO_CIVIL,0.022168
8,ZONA_RESIDENCIAL,-0.240362
9,PAIS_RESIDENCIA,-0.155776


In [43]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.143641
1,PESO,-0.356615
2,ALTURA,1.249158
3,CABECA,0.515817
4,CALCADO,0.238396
5,CINTURA,0.037509
6,RELIGIAO,-0.013974
7,PAIS_NASCIMENTO,-0.012062
8,ESTADO_CIVIL,0.022168
9,ZONA_RESIDENCIAL,-0.240362


In [44]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [45]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.143641,0.866199
1,PESO,-0.356615,0.700042
2,ALTURA,1.249158,3.487405
3,CABECA,0.515817,1.675007
4,CALCADO,0.238396,1.269212
5,CINTURA,0.037509,1.038221
6,RELIGIAO,-0.013974,0.986123
7,PAIS_NASCIMENTO,-0.012062,0.98801
8,ESTADO_CIVIL,0.022168,1.022416
9,ZONA_RESIDENCIAL,-0.240362,0.786343


In [46]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
2,ALTURA,1.249158,3.487405
3,CABECA,0.515817,1.675007
4,CALCADO,0.238396,1.269212
19,JSM_UF_4,0.177301,1.19399
14,EDUC_2,0.137297,1.147169
15,EDUC_3,0.122528,1.130351
21,JSM_UF_6,0.11534,1.122255
16,JSM_UF_1,0.092337,1.096735
20,JSM_UF_5,0.085831,1.089622
11,OUT_OF_YEAR,0.072519,1.075213


In [47]:
reg.score(x_test, y_test)

0.769

In [48]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.1717722 , 0.8282278 ],
       [0.71838877, 0.28161123],
       [0.62047192, 0.37952808],
       ...,
       [0.22877349, 0.77122651],
       [0.65691822, 0.34308178],
       [0.26202065, 0.73797935]])

In [49]:
predicted_proba.shape

(2000, 2)