In [35]:
## Importando bibliotecas essenciais de DS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random

##### O problema do negócio era simples.
Temos um dataset de caminhões nos quais se classificam caminhões em 'neg' (não deram defeito) e 'pos' (deram defeito).
A missão, então, é simples, prever, a partir das 170 covariadas do dataset quais caminhões vão dar defeito ou não.

O problema é que 170 colunas é perfeito para, de cara termos um problema de overfitting.
Pior que isto, no entanto, é que as colunas estão todas codificadas, ou seja, não consigo inferir sua importância através de alguma análise teórica ou intuitiva da situação.

Isso fez desta modelagem especialmente desafiadora, porque tive que aplicar técnicas específicas para reduzir a dimensionalidade do dataset, remover colunas com muita colinearidade, ou então pouco influentes, usando, inclusive, de técnica como PCA, das quais nunca tinha feito uso.

Enfim, abaixo, a cada passo, estarei comentando com mais profundidade cada etapa essencial

In [36]:
# Estabelecendo uma seed para garantir reprodutibilidade
random.seed(1603)

In [37]:
## lendo o dataset 'previous year', que deve servir como base de nossa projeção
df_previous = pd.read_csv('C:/Users/jpzam/Desktop/Case_study_selection/air_system_previous_years.csv')

df_previous.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [38]:
df_previous['class'].unique()

array(['neg', 'pos'], dtype=object)

In [39]:
## Sumarização breve
print(df_previous.columns)

print(df_previous.dtypes)

Index(['class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000', 'af_000',
       'ag_000', 'ag_001', 'ag_002',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=171)
class     object
aa_000     int64
ab_000    object
ac_000    object
ad_000    object
           ...  
ee_007    object
ee_008    object
ee_009    object
ef_000    object
eg_000    object
Length: 171, dtype: object


In [40]:
# Aferindo a presença de NAs na coluna 'class', que estão como 'na', em string.
df_previous[df_previous['class'] == 'na'].value_counts()

Series([], dtype: int64)

In [41]:
## Substituindos os 'na' por valores NaN

df_previous = df_previous.replace('na', np.nan)

df_previous.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,,2130706438,280.0,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,,0,,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,,228,100.0,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0.0,70,66.0,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,,1368,458.0,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [42]:
## Conferência para ver a quantidade de NaN's no dataset.
df_previous.isnull().sum().sort_values(ascending = False)

br_000    49264
bq_000    48722
bp_000    47740
bo_000    46333
ab_000    46329
          ...  
cj_000      338
ci_000      338
bt_000      167
aa_000        0
class         0
Length: 171, dtype: int64

In [43]:
## Tendo em vista que as colunas são, por completo, numéricas, resolvi substituir os NaNs pela moda de cada coluna

# Loop para substituir os NAs pela moda.
for col in df_previous.columns:
    if df_previous[col].isna().sum() > 0:
        moda = df_previous[col].mode()[0]
        df_previous[col] = df_previous[col].fillna(moda)

print(df_previous.isna().sum()) # E conferência final

class     0
aa_000    0
ab_000    0
ac_000    0
ad_000    0
         ..
ee_007    0
ee_008    0
ee_009    0
ef_000    0
eg_000    0
Length: 171, dtype: int64


In [44]:
## Mudando as variáveis numéricas para 'float64'.

df_previous.iloc[:, 1:] = df_previous.iloc[:, 1:].astype(float)

df_previous.dtypes


  df_previous.iloc[:, 1:] = df_previous.iloc[:, 1:].astype(float)


class      object
aa_000    float64
ab_000    float64
ac_000    float64
ad_000    float64
           ...   
ee_007    float64
ee_008    float64
ee_009    float64
ef_000    float64
eg_000    float64
Length: 171, dtype: object

In [45]:
df_previous.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698.0,0.0,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,neg,33058.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,neg,41040.0,0.0,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,neg,12.0,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,neg,60874.0,0.0,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [46]:
## Transformando 'classe' em uma coluna booleana

df_previous['class'] = df_previous['class'] != 'neg' ## neg = 0;False ;; pos = 1; True
df_previous

df_previous.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,False,76698.0,0.0,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,False,33058.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,False,41040.0,0.0,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,False,12.0,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,False,60874.0,0.0,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


Feita a limpeza e tratamento dos dados nas linhas acima, aqui, abaixo, importo os pacotes essenciais para a construção do modelo.

Tendo em vista a pergunta a ser respondida, optei por um modelo de regressão logística, para o qual usarei a biblioteca sickit-learn.

Antes de tudo, porém, precisava reduzir a dimensionalidade desse dataset e e selecionar as melhores variáveis para a predição.

In [47]:
from scipy.stats import pointbiserialr
from sklearn import linear_model

In [48]:
## Calculando as correlações com 'classe' para cada coluna
    ## Neste momento, pensei que isto talvez bastaria para que eu pudesse encontrar alguma luz que me apontasse quais variáveis usar
correlacoes = []

for col in df_previous.columns:
    if col != 'class':
        correlacao = pointbiserialr(df_previous['class'], df_previous[col])
        correlacoes.append((col, correlacao[0], correlacao[1])) # Coluna, correlação, p-valor

print(correlacoes)

[('aa_000', 0.5369783925131109, 0.0), ('ab_000', 0.015084381201274314, 0.00021985482974012596), ('ac_000', -0.05099634473091605, 7.524064561691334e-36), ('ad_000', -0.0005297612434709985, 0.8967549032172947), ('ae_000', 0.005541072015891219, 0.17469796167472249), ('af_000', 0.016249922517591543, 6.874682388440666e-05), ('ag_000', 0.01235259091904953, 0.002479706370481349), ('ag_001', 0.19108385741369696, 0.0), ('ag_002', 0.3399556782143777, 0.0), ('ag_003', 0.43305030605979766, 0.0), ('ag_004', 0.3716224837308859, 0.0), ('ag_005', 0.44817851080983284, 0.0), ('ag_006', 0.36834794713121316, 0.0), ('ag_007', 0.24191004312131892, 0.0), ('ag_008', 0.11300610195605039, 1.0235682569954864e-169), ('ag_009', 0.07208137875681554, 6.098446719681328e-70), ('ah_000', 0.511630935180927, 0.0), ('ai_000', 0.11827308754982259, 8.029574893207054e-186), ('aj_000', 0.023871558015076646, 4.974435179784846e-09), ('ak_000', 0.011752894095135972, 0.003990663732019046), ('al_000', 0.36545199897228575, 0.0), ('



In [49]:
## Colocando correlacoes em um dataframe; e ordenando para seleção
corr_df = pd.DataFrame(correlacoes, columns = ['coluna_df', 'correlacao', 'p_valor'])

# Correlações positivas
corr_df = corr_df.sort_values(by = ['p_valor', 'correlacao'], ascending = [True, False]).reset_index(drop = True)
corr_df.head(10)

## Com dá para ver, não foi muito efetivo, não dá para tirar muita coisa daqui. 
    ## Mas parece haver indícios de que muitas colunas, afinal, implicam coisas parecidas para 'class'

Unnamed: 0,coluna_df,correlacao,p_valor
0,ci_000,0.550049,0.0
1,aa_000,0.536978,0.0
2,bt_000,0.533964,0.0
3,bb_000,0.529501,0.0
4,bv_000,0.528056,0.0
5,bu_000,0.528056,0.0
6,cq_000,0.528056,0.0
7,aq_000,0.518841,0.0
8,bj_000,0.513465,0.0
9,cc_000,0.511886,0.0


In [50]:
# Correlações negativas.
corr_df = corr_df.sort_values(by = ['p_valor', 'correlacao'], ascending = [True, True]).reset_index(drop = True)
corr_df.head(5)

Unnamed: 0,coluna_df,correlacao,p_valor
0,br_000,-0.27425,0.0
1,bq_000,-0.263856,0.0
2,bp_000,-0.245132,0.0
3,bo_000,-0.22264,0.0
4,bn_000,-0.192862,0.0


In [51]:
## Aqui tentei fazer um teste de variança, para escolher quais colunas eu poderia droppar.
    ## Ao fim do modelo, percebi que este teste não contribuía tanto para a seleção das covariadas e optei por removê-lo por completo.
#from sklearn.feature_selection import VarianceThreshold

# Setting the threshold
#select = VarianceThreshold(threshold = 10) # <- Variance threshold; 1 reduced a fair ammount

#select.fit(df_previous/df_previous.mean()) ## Nomalizing data
#mask = select.get_support()

df_previous_reduzido = df_previous#.loc[:, mask] ## Aqui removi a máscara, deixando-a comentada. 
    ## Para aquele que vê o modelo, caso tenha sugestões de como fazer um melhor uso deste teste, sinta-se a vontade para dar ideias.

print(df_previous.shape)
print(df_previous_reduzido.shape)

(60000, 171)
(60000, 171)


In [52]:
df_test = df_previous_reduzido.iloc[:, 1:]

In [53]:
## Testando colinearidade
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()

vif["features"] = df_test.columns

vif['fator_vif'] = [variance_inflation_factor(df_test.values, i) for i in range(df_test.shape[1])]

vif

Unnamed: 0,features,fator_vif
0,aa_000,4.012605e+02
1,ab_000,1.068214e+00
2,ac_000,1.281894e+00
3,ad_000,1.266802e+09
4,ae_000,3.173399e+00
...,...,...
165,ee_007,1.826089e+09
166,ee_008,1.236384e+08
167,ee_009,1.377875e+06
168,ef_000,1.128800e+00


In [54]:
## selecionando aqui os fatores cujos vifs sejam inferiores a 5. A literatura vigente parece indicar que este é um bom threshold.

vif = vif[vif['fator_vif'] <= 5]

list(vif['features'])

['ab_000',
 'ac_000',
 'ae_000',
 'af_000',
 'ag_000',
 'ai_000',
 'aj_000',
 'ak_000',
 'ar_000',
 'as_000',
 'at_000',
 'au_000',
 'av_000',
 'ax_000',
 'bc_000',
 'bd_000',
 'be_000',
 'bf_000',
 'bl_000',
 'bm_000',
 'bs_000',
 'bz_000',
 'ca_000',
 'cg_000',
 'ch_000',
 'cj_000',
 'cl_000',
 'cm_000',
 'cp_000',
 'cr_000',
 'cs_001',
 'cy_000',
 'da_000',
 'db_000',
 'dd_000',
 'de_000',
 'df_000',
 'dg_000',
 'dh_000',
 'di_000',
 'dj_000',
 'dk_000',
 'dq_000',
 'dr_000',
 'dx_000',
 'dy_000',
 'dz_000',
 'ea_000',
 'eb_000',
 'ef_000',
 'eg_000']

In [55]:
## Filtrando aqueles que são inferiores a cinco

df_previous_reduzido.columns.isin(list(vif['features']))

vif_features = vif['features'].tolist()

df_previous_filtered = df_previous_reduzido[['class'] + vif_features]

df_previous_filtered.head()

Unnamed: 0,class,ab_000,ac_000,ae_000,af_000,ag_000,ai_000,aj_000,ak_000,ar_000,...,dk_000,dq_000,dr_000,dx_000,dy_000,dz_000,ea_000,eb_000,ef_000,eg_000
0,False,0.0,2130706000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2801180.0,0.0,0.0
1,False,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3477820.0,0.0,0.0
2,False,0.0,228.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1040120.0,0.0,0.0
3,False,0.0,70.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2014.0,370.0,20174.0,44.0,0.0,0.0,0.0,4.0,32.0
4,False,0.0,1368.0,0.0,0.0,0.0,0.0,226.0,0.0,0.0,...,0.0,0.0,0.0,98334.0,27588.0,0.0,0.0,21173050.0,0.0,0.0


In [56]:
## Fazendo agora PCA para reduzir ainda mais a dimensionalidade.
    ## Como interpretabilidade não era, exatamente, o objetivo do modelo, fazer combinações assim não trazia muitos prejuízos.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_previous_std = scaler.fit_transform(df_previous_filtered) ## Aplicando o standard scaler
pca = PCA(n_components = 30)

pc = pca.fit_transform(df_previous_std)

print(pca.explained_variance_ratio_)

# Testing with df_previous; unfiltered.

#df_previous_std = scaler.fit_transform(df_previous)

#pca = PCA()

#pca.fit(df_previous_std)

#print(pca.explained_variance_ratio_)

print(pca.explained_variance_ratio_.cumsum())

[0.11784524 0.04351647 0.03770767 0.03700105 0.03326159 0.03285731
 0.03168028 0.03011506 0.02941475 0.02687504 0.02475095 0.02371373
 0.02258081 0.02244718 0.02187659 0.02146106 0.020574   0.01988748
 0.01932648 0.01910922 0.01893226 0.01877558 0.01856365 0.01789939
 0.01726799 0.01707957 0.01624242 0.01588227 0.01508117 0.01446777]
[0.11784524 0.16136171 0.19906938 0.23607043 0.26933202 0.30218933
 0.33386961 0.36398467 0.39339942 0.42027445 0.4450254  0.46873913
 0.49131995 0.51376712 0.53564371 0.55710477 0.57767877 0.59756625
 0.61689273 0.63600195 0.65493421 0.67370979 0.69227343 0.71017282
 0.72744082 0.74452039 0.76076281 0.77664508 0.79172625 0.80619401]


In [57]:
print(pc[:, 0]) ## All Rows and First columns

print(pc[:, 29])

[ 0.4949083   0.01050001 -0.57086209 ... -1.14849055  0.4292142
 -0.15610979]
[ 0.20455878  0.00997734 -0.135083   ... -0.04882534 -0.08586258
 -0.20033107]


In [58]:
## Mirando para 80% de explicação do modelo, um valor ok para uma base com tantas covariadas
    ## O objetivo é reduzir para, ao menos, 30.    

df_final = pd.DataFrame(pc, columns = [f'PC{n}' for n in range (0,30)])

df_final = pd.concat([df_previous_filtered['class'], df_final], axis = 1)

df_final

Unnamed: 0,class,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29
0,False,0.494908,-1.058713,1.062927,-0.535794,0.510481,0.604555,-0.702982,-0.008774,0.821035,...,0.920742,-1.428758,0.029972,-0.266439,-0.243947,-0.401581,-0.376973,-0.280870,0.064578,0.204559
1,False,0.010500,-0.872519,0.876062,-0.678250,0.258127,0.324567,-0.537897,-0.167283,0.304379,...,-0.026899,0.410065,0.033737,-0.067756,-0.148712,-0.104260,0.363692,-0.174469,0.233674,0.009977
2,False,-0.570862,0.097870,-0.089227,-0.063715,0.041489,-0.082424,-0.039439,-0.048224,-0.026101,...,-0.263546,0.364392,0.007473,0.028369,-0.037823,-0.065289,0.012435,0.139802,0.065525,-0.135083
3,False,-0.992459,1.406717,0.032776,1.047784,-0.442538,0.022621,1.235937,-0.709827,-0.287785,...,-0.459385,0.196470,-0.740086,0.370472,-0.097457,0.109891,-0.435068,0.121590,0.162297,0.706590
4,False,0.594243,-1.012876,0.983027,-0.718450,0.616945,0.527859,-0.453126,0.074075,0.598856,...,-0.195069,0.494240,0.107050,-0.136780,0.010250,0.213592,-0.087259,-0.054933,0.089679,-0.088942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,False,1.701621,-1.638527,1.486969,-0.993349,0.640425,0.964128,-0.978215,0.028341,1.034443,...,-0.035879,0.452871,0.080027,-0.417709,0.238699,0.117699,-0.765825,-0.351102,-0.280973,0.034132
59996,False,-1.094579,0.733647,-0.587794,0.262117,-0.046261,-0.125334,0.307470,0.104657,-0.013010,...,0.839941,-1.677252,-0.302407,-0.007619,-0.168573,-0.335317,-0.285048,0.021410,0.139518,-0.016935
59997,False,-1.148491,0.810050,-0.696557,0.346352,-0.137856,-0.185000,0.356634,0.079853,-0.132581,...,0.852279,-1.687609,-0.268111,-0.021020,-0.165107,-0.345165,-0.352539,-0.011206,0.212633,-0.048825
59998,False,0.429214,-0.545662,0.182461,-0.128540,0.384587,0.233903,0.112744,0.067053,-0.071903,...,0.830080,-1.604454,-0.044039,-0.295301,-0.102621,-0.325611,-0.458560,0.291015,-0.032701,-0.085863


In [59]:
# Aqui usei uma técnica que aprendi do DataCamp
    # Forward stepwise selection
    ## Esta técnica vai nos permitir selecionar, a partir do dataset de teste, as variáveis que levar ao melhor AUC possível
    ## Não espero que as 30 variáveis entrem nesta seleção.
from sklearn.metrics import roc_auc_score

# Function to calculate AUC
def auc(vars, explained, df):
    X = df[vars]
    y = df[explained]

    logreg = linear_model.LogisticRegression()
    logreg.fit(X, y)

    preds = logreg.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, preds)
    return(auc)

# Função para determinar melhor variável
def next_best(current_vars, possible_vars, explained, df):
    best_auc = -1
    melhor_var = None

    for v in possible_vars:
        auc_v = auc(current_vars + [v], explained, df)
        if auc_v >= best_auc:
            best_auc = auc_v
            melhor_var = v
    return melhor_var

In [60]:
## Fazer o split de treino e teste.
from sklearn.model_selection import train_test_split

explicativa = df_final.drop('class', axis = 1)
explicada = df_final['class']

explicativas_train, explicativas_test, explicada_train, explicada_test = train_test_split(explicativa, explicada, test_size = 0.3)

train = pd.concat([explicada_train, explicativas_train], axis=1)
test = pd.concat([explicada_test, explicativas_test], axis=1)

In [61]:
vars_atuais = ['PC0'] ## Estabelecendo variável inicial, primeira coluna, mais explicativa.
auc_train = []
auc_test = []

possiveis = train.drop('class', axis = 1).columns.tolist()
explicada = 'class'

for n in range(10): ## Range arbitrário que, caso não suficiente poderia ser expandido.
        ## A função, basicamente, itera diversos fits de regressão logística e vai testando para ver se o AUC em teste aumentou ou caiu.
    best_var = next_best(vars_atuais, possiveis, explicada, train)
    vars_atuais.append(best_var)
    possiveis.remove(best_var)
    
    best_auc_train = auc(vars_atuais, explicada, train)
    best_auc_test = auc(vars_atuais, explicada, test)
    auc_train.append(best_auc_train)
    auc_test.append(best_auc_test)

    if auc_test[n] < auc_test[n-1]:
        print(vars_atuais)
        print(auc_train)
        print(auc_test)
        break
    else:
        pass

## PArece que, com essas vars. conseguimos um AUC bem positivo.

['PC0', 'PC3', 'PC1', 'PC16', 'PC10']
[0.9898722523010522, 0.994065936936929, 0.9943979816751853, 0.9955712853757667]
[0.9919889382478148, 0.9923168997447644, 0.9958423931924828, 0.9898246703014186]


In [62]:
df_final.head()

Unnamed: 0,class,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29
0,False,0.494908,-1.058713,1.062927,-0.535794,0.510481,0.604555,-0.702982,-0.008774,0.821035,...,0.920742,-1.428758,0.029972,-0.266439,-0.243947,-0.401581,-0.376973,-0.28087,0.064578,0.204559
1,False,0.0105,-0.872519,0.876062,-0.67825,0.258127,0.324567,-0.537897,-0.167283,0.304379,...,-0.026899,0.410065,0.033737,-0.067756,-0.148712,-0.10426,0.363692,-0.174469,0.233674,0.009977
2,False,-0.570862,0.09787,-0.089227,-0.063715,0.041489,-0.082424,-0.039439,-0.048224,-0.026101,...,-0.263546,0.364392,0.007473,0.028369,-0.037823,-0.065289,0.012435,0.139802,0.065525,-0.135083
3,False,-0.992459,1.406717,0.032776,1.047784,-0.442538,0.022621,1.235937,-0.709827,-0.287785,...,-0.459385,0.19647,-0.740086,0.370472,-0.097457,0.109891,-0.435068,0.12159,0.162297,0.70659
4,False,0.594243,-1.012876,0.983027,-0.71845,0.616945,0.527859,-0.453126,0.074075,0.598856,...,-0.195069,0.49424,0.10705,-0.13678,0.01025,0.213592,-0.087259,-0.054933,0.089679,-0.088942


In [63]:
## Agora sabemos que devemos usar PC0, PC3, PC1 e PC13 para fazer nossas predições
    # Treinando o modelo de Logistic Regression
vars_selecionadas = vars_atuais[:-1]

X = df_final[vars_selecionadas]
y = df_final[['class']]

logreg = linear_model.LogisticRegression()
logreg.fit(X, y)

  y = column_or_1d(y, warn=True)


Com o modelo treinado, tudo que faltava era aplicar o treinamento no dataset 'air_system_present_year.csv'

In [64]:
# Importing the current .csv data
df_present = pd.read_csv('C:/Users/jpzam/Desktop/Case_study_selection/air_system_present_year.csv')

print(df_present.shape)
df_present.head()

(16000, 171)


Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60,0,20,12,0,0,0,0,0,...,1098,138,412,654,78,88,0,0,0,0
1,neg,82,0,68,40,0,0,0,0,0,...,1068,276,1620,116,86,462,0,0,0,0
2,neg,66002,2,212,112,0,0,0,0,0,...,495076,380368,440134,269556,1315022,153680,516,0,0,0
3,neg,59816,na,1010,936,0,0,0,0,0,...,540820,243270,483302,485332,431376,210074,281662,3232,0,0
4,neg,1814,na,156,140,0,0,0,0,0,...,7646,4144,18466,49782,3176,482,76,0,0,0


In [65]:
# Limpando o dataset 'present', que estava bem organizado, para falar a verdade.
df_present = df_present.replace('na', np.nan)

for col in df_present.columns:
    if df_present[col].isna().sum() > 0:
        moda = df_present[col].mode()[0]
        df_present[col] = df_present[col].fillna(moda)

df_present.iloc[:, 1:] = df_present.iloc[:, 1:].astype(float)

df_present['class'] = df_present['class'] != 'neg' 

df_present_filtered = df_present[['class'] + vif_features]

  df_present.iloc[:, 1:] = df_present.iloc[:, 1:].astype(float)


In [66]:
df_present_filtered.head()

Unnamed: 0,class,ab_000,ac_000,ae_000,af_000,ag_000,ai_000,aj_000,ak_000,ar_000,...,dk_000,dq_000,dr_000,dx_000,dy_000,dz_000,ea_000,eb_000,ef_000,eg_000
0,False,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1100.0,574.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1,False,0.0,68.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,...,0.0,3996.0,584.0,6368.0,36.0,0.0,0.0,0.0,0.0,0.0
2,False,2.0,212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10262714.0,1278664.0,4434614.0,70900.0,0.0,0.0,26002880.0,0.0,0.0
3,False,0.0,1010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1179900.0,0.0,0.0
4,False,0.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,13664.0,110.0,0.0,0.0,813740.0,0.0,0.0


In [67]:
## Aplicando o mesmo scaler
df_present_std = scaler.transform(df_present_filtered)

## E a mesma transformação de PCA
pc_present = pca.transform(df_present_std)

present_final = pd.DataFrame(pc_present, columns = [f'PC{n}' for n in range (0,30)])

present_final = pd.concat([df_present_filtered['class'], present_final], axis = 1)

present_final

Unnamed: 0,class,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29
0,False,-0.954508,0.467346,-0.469194,0.300346,-0.219652,-0.199603,0.199290,0.008421,-0.204502,...,-0.215061,0.247686,-0.033237,0.117028,0.001036,0.054405,-0.070574,-0.141358,-0.000743,0.048716
1,False,-1.092516,0.768610,-0.789442,0.416744,-0.333754,-0.359753,0.381619,-0.015171,-0.454167,...,-0.223753,0.179798,-0.032699,0.083816,0.032143,0.038092,-0.072100,-0.046976,-0.027786,0.003391
2,False,2.030255,-0.360056,0.962974,-1.052065,0.555174,0.406762,-0.744603,-0.209987,1.111338,...,0.112560,0.194617,0.640534,-0.509688,0.118945,1.107100,0.298830,-0.107172,0.159043,-0.569695
3,False,0.189389,-0.883493,0.824461,-0.553297,0.490756,0.372355,-0.421667,0.016978,0.416409,...,-0.265556,0.593430,0.210760,-0.015551,0.100163,0.056395,0.334080,0.278755,-0.066867,0.222966
4,False,-1.074066,0.788252,-0.784616,0.390083,-0.314011,-0.353138,0.356519,-0.014104,-0.413033,...,-0.227525,0.179586,-0.051144,0.090346,0.021145,0.014071,-0.041207,-0.015018,-0.020593,-0.012374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,False,1.590289,-0.429079,0.814795,-0.844527,0.704673,0.535452,-0.618645,0.159849,0.916472,...,0.504385,-1.508558,-0.192655,-0.293580,0.548395,-0.356105,0.370784,-0.167367,0.033327,-0.706193
15996,False,-1.100062,0.880210,-0.786932,0.458971,-0.345971,-0.420949,0.390405,0.041623,-0.547715,...,-0.221072,0.171097,-0.032454,0.077871,0.025829,0.025999,-0.079419,-0.014993,-0.022749,0.002983
15997,False,0.774154,-1.129644,1.012757,-0.889723,1.036823,0.695283,0.011291,0.437240,0.440432,...,-0.070052,0.668624,0.139266,-0.111642,0.278175,0.086364,0.453726,-0.052014,-0.129882,0.148906
15998,False,-1.022022,0.638419,-0.651967,0.363701,-0.282125,-0.293186,0.305228,-0.005765,-0.347761,...,-0.223747,0.203226,-0.036151,0.099324,0.012537,0.048801,-0.069624,-0.085765,-0.022872,0.011837


In [68]:
## Por fim, fazendo as predições.
present_X = present_final[vars_selecionadas]

predictions = logreg.predict_proba(present_X)

print(predictions[:,1])

[0.00335565 0.00282445 0.00637293 ... 0.00751587 0.00304737 0.00270292]


Com isso, o modelo está encerrado. 

É claro que, para confirmação, poderíamos calcular a quantidade de falsos-positivos, falsos-negativos etc. e, disso, derivar uma métrica de acurácia do modelo.

Com o modelo feito, a 'empresa' da questão poderia, agora, predizer quais dos caminhões devem vir a dar defeito e enviá-los para manutenção antecipadamente, evitando custos extras de resgate, por exemplo