In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Display settings: do not truncate columns or rows when rendering frames.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Number of rows taken from each TARGETS class when building the sample below.
number_of_data = 5000

In [2]:
# Load the preprocessed dataset from a CSV file (path is relative to the working directory).
data_preprocessed = pd.read_csv('sermil2020-preprocessed.csv')

In [3]:
# Preview the first rows of the loaded frame.
data_preprocessed.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,PAIS_NASCIMENTO,ESTADO_CIVIL,TARGETS,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,DIFF_MUN_JSM,DIFF_UF_JSM,OUT_OF_YEAR,RELI_0,RELI_1,RELI_2,RELI_3,RELI_4,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
0,87.0,181.0,59.0,43.0,83.0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
1,72.0,182.0,57.0,42.0,86.0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,70.0,185.0,57.0,42.0,78.0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3,85.0,182.0,60.0,41.0,100.0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,102.0,172.0,56.0,42.0,83.0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [4]:
# Count rows per TARGETS value to see how the two classes are distributed.
data_preprocessed['TARGETS'].value_counts()

0    1169044
1     134359
Name: TARGETS, dtype: int64

In [5]:
# Random sample of up to `number_of_data` rows with TARGETS == 0.
# Filtering first avoids copying the entire frame, and the fixed seed makes the
# sample repeatable under Restart & Run All.
targets_0 = (
    data_preprocessed
    .loc[data_preprocessed['TARGETS'] == 0]
    .sample(frac=1, random_state=20)
    .iloc[:number_of_data]
)
# Confirm that the sample holds a single class and the expected number of rows.
targets_0['TARGETS'].value_counts()

0    5000
Name: TARGETS, dtype: int64

In [6]:
# Random sample of up to `number_of_data` rows with TARGETS == 1.
# Filtering first avoids copying the entire frame, and the fixed seed makes the
# sample repeatable under Restart & Run All.
targets_1 = (
    data_preprocessed
    .loc[data_preprocessed['TARGETS'] == 1]
    .sample(frac=1, random_state=20)
    .iloc[:number_of_data]
)
# Confirm that the sample holds a single class and the expected number of rows.
targets_1['TARGETS'].value_counts()

1    5000
Name: TARGETS, dtype: int64

In [7]:
# Stack the two per-class samples and shuffle the rows.
# DataFrame.sample returns a new frame, so its result must be assigned;
# calling it as a bare statement leaves `df` in class-sorted order.
df = pd.concat([targets_0, targets_1]).sample(frac=1, random_state=20)
df.head()

Unnamed: 0,PESO,ALTURA,CABECA,CALCADO,CINTURA,PAIS_NASCIMENTO,ESTADO_CIVIL,TARGETS,ZONA_RESIDENCIAL,PAIS_RESIDENCIA,DIFF_MUN_JSM,DIFF_UF_JSM,OUT_OF_YEAR,RELI_0,RELI_1,RELI_2,RELI_3,RELI_4,EDUC_0,EDUC_1,EDUC_2,EDUC_3,JSM_UF_1,JSM_UF_2,JSM_UF_3,JSM_UF_4,JSM_UF_5,JSM_UF_6
1303091,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
348112,30.0,120.0,41.0,30.0,50.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
804271,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0
71565,30.0,120.0,41.0,30.0,50.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0
597495,63.72,164.6,53.87,39.05,75.49,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0


In [8]:
# Keep the label column as a separate Series before it is dropped from `df`.
targets = df['TARGETS']
# Row count of the label Series.
targets.shape[0]

10000

In [9]:
# Remove the label column and the six JSM_UF_* columns from the feature frame.
df = df.drop(
    columns=[
        'TARGETS',
        'JSM_UF_1',
        'JSM_UF_2',
        'JSM_UF_3',
        'JSM_UF_4',
        'JSM_UF_5',
        'JSM_UF_6',
    ]
)
# Row count after dropping the columns.
len(df)

10000

In [10]:
# Independent copy of the feature frame; this is what gets scaled below.
unscaled_inputs = df.copy()

In [11]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardise a chosen subset of DataFrame columns and pass the rest through.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardise.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler; it is only ever applied to ``columns``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (``ddof=0``) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params (used by repr and clone) can read them back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must be passed by name rather than by position.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # DataFrame reductions keep one value per column; np.mean / np.var on a
        # DataFrame collapse to a single scalar on pandas >= 2.0.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardised.

        The result keeps the row index and column order of ``X``. ``y`` and
        ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh 0..n-1 index
        # and the column-wise concat below misaligns rows (producing NaNs)
        # whenever X does not have a default RangeIndex.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [12]:
# Columns excluded from `columns_to_scale` (built in the next cell), grouped by
# name family. The JSM_UF_* names are dropped from the feature frame earlier,
# so listing them here has no effect on that frame.
columns_to_omit = (
    [
        'PAIS_NASCIMENTO',
        'ESTADO_CIVIL',
        'ZONA_RESIDENCIAL',
        'PAIS_RESIDENCIA',
        'DIFF_MUN_JSM',
        'DIFF_UF_JSM',
        'OUT_OF_YEAR',
    ]
    + ['RELI_{}'.format(i) for i in range(5)]
    + ['EDUC_{}'.format(i) for i in range(4)]
    + ['JSM_UF_{}'.format(i) for i in range(1, 7)]
)

In [13]:
# Every input column that is not listed in `columns_to_omit`, in original order.
columns_to_scale = unscaled_inputs.columns[
    ~unscaled_inputs.columns.isin(columns_to_omit)
].tolist()

In [14]:
# Disabled alternative that would standardise only `columns_to_scale`:
# scaler = CustomScaler(columns_to_scale)
# Active choice: a plain StandardScaler, which is fit on every input column below.
scaler = StandardScaler()

In [15]:
# Learn the scaling statistics from the unscaled inputs.
# NOTE(review): this runs on all rows, before the train/test split made in a
# later cell, so the test rows also contribute to the scaling statistics.
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# Apply the fitted scaling to the inputs.
scaled_inputs = scaler.transform(unscaled_inputs)
# Row count of the scaled inputs.
scaled_inputs.shape[0]

10000

In [17]:
# Exploratory call: the returned splits are only displayed, not stored.
# The split that is actually used is created in the next cell.
train_test_split(scaled_inputs, targets)

[array([[-0.17506685, -0.30770197, -0.27000332, ...,  2.65791575,
         -2.00125086, -0.28207066],
        [-0.17506685, -0.30770197, -0.27000332, ..., -0.37623465,
          0.49968748, -0.28207066],
        [-0.17506685, -0.30770197, -0.27000332, ...,  2.65791575,
         -2.00125086, -0.28207066],
        ...,
        [-0.17506685, -0.30770197, -0.27000332, ..., -0.37623465,
          0.49968748, -0.28207066],
        [-0.17506685, -0.30770197, -0.27000332, ..., -0.37623465,
         -2.00125086,  3.545211  ],
        [-0.17506685, -0.30770197, -0.27000332, ..., -0.37623465,
          0.49968748, -0.28207066]]),
 array([[-0.72272122,  0.43430198, -0.2335414 , ...,  2.65791575,
         -2.00125086, -0.28207066],
        [ 0.90720251,  1.93597663,  0.0469349 , ..., -0.37623465,
          0.49968748, -0.28207066],
        [-0.17506685, -0.30770197, -0.27000332, ..., -0.37623465,
          0.49968748, -0.28207066],
        ...,
        [ 0.49972157,  1.49430761,  0.60788751, ..., -

In [18]:
# 80/20 train/test split; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [19]:
# Shapes of the training inputs and training labels.
print(x_train.shape, y_train.shape)

(8000, 21) (8000,)


In [20]:
# Shapes of the test inputs and test labels.
print(x_test.shape, y_test.shape)

(2000, 21) (2000,)


In [21]:
# Logistic-regression classifier created with its default settings.
reg = LogisticRegression()

In [22]:
# Fit the classifier on the training split.
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Score on the training split; the manual accuracy computed a few cells below gives the same value.
reg.score(x_train, y_train)

0.776125

In [24]:
# Predicted class labels for the training rows.
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, ..., 1, 1, 0])

In [25]:
# Element-wise comparison of predictions with the true training labels (True = correct).
# NOTE(review): display.max_rows is set to None above, so this renders one line per training row.
model_outputs == y_train

172466      True
383202      True
1143693     True
464705     False
351321      True
1135821     True
400694      True
344852      True
40875       True
834795      True
1073005     True
221244      True
956828      True
648811      True
167507     False
336251      True
233895      True
216757      True
425094      True
32704      False
228835      True
326942     False
11221       True
360889      True
377008      True
271552      True
233125      True
628776     False
1107800     True
370423      True
177384     False
988137      True
203683      True
59226       True
868458      True
241588      True
1181924     True
1105035    False
738913      True
833029      True
889709      True
4395        True
207134     False
230996     False
640730      True
1381        True
154465     False
257973      True
414595      True
675568      True
97520      False
940790      True
486399      True
907264      True
168326      True
450952      True
374825      True
270061      True
513052      Tr

In [26]:
# Number of correct training predictions.
np.sum((model_outputs==y_train))

6209

In [27]:
# Total number of training predictions.
model_outputs.shape[0]

8000

In [28]:
# Training accuracy computed by hand: correct predictions divided by total predictions.
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.776125

In [29]:
# Fitted intercept of the model.
reg.intercept_

array([-0.16573265])

In [30]:
# Fitted coefficients, one per input column.
reg.coef_

array([[-0.2443086 ,  1.13700635,  0.5555997 ,  0.31313256, -0.00510594,
        -0.0700508 ,  0.03055763, -0.23414603, -0.37654286, -0.03158872,
         0.02720728,  0.04398153,  0.01735128, -0.01809601,  0.00154416,
         0.        ,  0.        , -0.28840912, -0.18489842,  0.12029598,
         0.09966798]])

In [31]:
# Input column names, in the column order the model was trained on.
unscaled_inputs.columns.values

array(['PESO', 'ALTURA', 'CABECA', 'CALCADO', 'CINTURA',
       'PAIS_NASCIMENTO', 'ESTADO_CIVIL', 'ZONA_RESIDENCIAL',
       'PAIS_RESIDENCIA', 'DIFF_MUN_JSM', 'DIFF_UF_JSM', 'OUT_OF_YEAR',
       'RELI_0', 'RELI_1', 'RELI_2', 'RELI_3', 'RELI_4', 'EDUC_0',
       'EDUC_1', 'EDUC_2', 'EDUC_3'], dtype=object)

In [32]:
# Column names used to label the coefficients in the summary table.
feature_name = unscaled_inputs.columns.values

In [33]:
# Pair each input column name with its fitted coefficient.
summary_table = pd.DataFrame({'Feature name': feature_name})
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,PESO,-0.244309
1,ALTURA,1.137006
2,CABECA,0.5556
3,CALCADO,0.313133
4,CINTURA,-0.005106
5,PAIS_NASCIMENTO,-0.070051
6,ESTADO_CIVIL,0.030558
7,ZONA_RESIDENCIAL,-0.234146
8,PAIS_RESIDENCIA,-0.376543
9,DIFF_MUN_JSM,-0.031589


In [34]:
# Put the intercept in the first row of the table:
# shift the existing rows down by one to free index 0, ...
summary_table.index = summary_table.index + 1
# ... insert the intercept at index 0, ...
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# ... and sort by index so the new row comes first.
# NOTE(review): this cell mutates `summary_table`; running it a second time
# shifts the index again and adds another 'Intercept' row.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.165733
1,PESO,-0.244309
2,ALTURA,1.137006
3,CABECA,0.5556
4,CALCADO,0.313133
5,CINTURA,-0.005106
6,PAIS_NASCIMENTO,-0.070051
7,ESTADO_CIVIL,0.030558
8,ZONA_RESIDENCIAL,-0.234146
9,PAIS_RESIDENCIA,-0.376543


In [35]:
# Exponentiate each coefficient to obtain the odds ratio.
# The inputs were standardised before fitting, so each ratio corresponds to a
# one-standard-deviation change in that column.
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [36]:
# Show the table with the new Odds_ratio column.
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.165733,0.847273
1,PESO,-0.244309,0.783246
2,ALTURA,1.137006,3.117422
3,CABECA,0.5556,1.742986
4,CALCADO,0.313133,1.367703
5,CINTURA,-0.005106,0.994907
6,PAIS_NASCIMENTO,-0.070051,0.932346
7,ESTADO_CIVIL,0.030558,1.031029
8,ZONA_RESIDENCIAL,-0.234146,0.791246
9,PAIS_RESIDENCIA,-0.376543,0.68623


In [37]:
# Rank rows by odds ratio, largest first; the sorted copy is only displayed and `summary_table` is unchanged.
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
2,ALTURA,1.137006,3.117422
3,CABECA,0.5556,1.742986
4,CALCADO,0.313133,1.367703
20,EDUC_2,0.120296,1.127831
21,EDUC_3,0.099668,1.104804
12,OUT_OF_YEAR,0.043982,1.044963
7,ESTADO_CIVIL,0.030558,1.031029
11,DIFF_UF_JSM,0.027207,1.027581
13,RELI_0,0.017351,1.017503
15,RELI_2,0.001544,1.001545


In [38]:
# Score on the held-out test split.
reg.score(x_test, y_test)

0.773

In [39]:
# Predicted probabilities for the test rows, one column per class.
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.1944526 , 0.8055474 ],
       [0.78272071, 0.21727929],
       [0.76782907, 0.23217093],
       ...,
       [0.60341862, 0.39658138],
       [0.26910494, 0.73089506],
       [0.60341862, 0.39658138]])

In [40]:
# Shape check: one row per test sample, one column per class.
predicted_proba.shape

(2000, 2)