# Feature engineering des données

Le feature engineering des données suivera le plan suivant :

   1. Données manquantes
       - Variables qualitatives
       - Variables quantitatives
   2. Variables asymétriques
   3. Variables qualitatives : mapping/one-hot encoding
   4. Entrainement d'un premier modèle

# 0. Importer les librairies / Lecture des données

In [54]:
import pandas as pd
import numpy as np

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

pd.pandas.set_option("display.max_columns", None)

In [55]:
data = pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["HeartDisease"], axis=1),
    data["HeartDisease"], 
    test_size=0.1,
    random_state=123
)
X_train.shape, X_test.shape

((648, 11), (73, 11))

# 1. Données manquantes

In [56]:
# Rappel

# Identifier toutes les variables dont le type est "object" avec le dtype
CAT_VAR = [nom_col for nom_col in data.columns if data[nom_col].dtype == "object" 
           if nom_col not in  ["Sex", "ExerciseAngina"]]

# Identifier les variables binaires
BIN_VAR = ["Sex", "ExerciseAngina"]

CAT_VAR.extend(BIN_VAR)
print("CAT_VAR : ", CAT_VAR)

# Le restant des variables est automatiquement de type quantitiaves 
QUANT_VAR = [nom_col for nom_col in data.columns
             if nom_col not in set(BIN_VAR).union(set(CAT_VAR))
             and  nom_col != "HeartDisease"]
print("QUANT_VAR : ", QUANT_VAR)

CAT_VAR :  ['ChestPainType', 'RestingECG', 'ST_Slope', 'Sex', 'ExerciseAngina']
QUANT_VAR :  ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']


## 1.1 Variables qualitatives

In [57]:
cat_var_with_na = [var for var in CAT_VAR  
                  if data[var].isnull().sum() > 0]

# Affichier la moyenne des données manquantes par colonne
mean_per_col = data[cat_var_with_na].isnull().mean().sort_values(ascending=False)
print("la moyenne des données manquantes par colonne:")
print("CAT_VAR_WITH_NA = ", cat_var_with_na)
print(mean_per_col)



la moyenne des données manquantes par colonne:
CAT_VAR_WITH_NA =  ['ChestPainType', 'ST_Slope']
ST_Slope         0.266297
ChestPainType    0.097087
dtype: float64


In [58]:
# on choisi le seuil 15 % pour traiter les valeurs manquantes 
cat_nan_with_val_miss = [var for var in cat_var_with_na 
                        if data[var].isnull().mean() > 0.15]

cat_nan_frequent_val = [var for var in cat_var_with_na 
                        if data[var].isnull().mean() <= 0.15]

#--------------------------------------------------------------

## if > 0.15 == ajouter "Mussing"
X_train[cat_nan_with_val_miss] = X_train[cat_nan_with_val_miss].fillna("Missing")
X_test[cat_nan_with_val_miss] = X_test[cat_nan_with_val_miss].fillna("Missing")

## if <= 0.15 == ajouter val plus fréquente
for var in cat_nan_frequent_val:
    mode = X_train[var].mode()[0]
    print(var, mode)
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)
print("--")
#--------------------------------------------------------------

# Check miss values
print("CAT_NAN_WITH_VAL_MISS = ", cat_nan_with_val_miss)
print("CAT_NAN_FREQUENT_VAL = ", cat_nan_frequent_val)
print()
print("sum of nan for cat_na_frequent_val", X_train[cat_nan_frequent_val].isnull().sum())
print("sum of nan for cat_nan_with_val_miss", X_train[cat_nan_with_val_miss].isnull().sum())

ChestPainType ASY
--
CAT_NAN_WITH_VAL_MISS =  ['ST_Slope']
CAT_NAN_FREQUENT_VAL =  ['ChestPainType']

sum of nan for cat_na_frequent_val ChestPainType    0
dtype: int64
sum of nan for cat_nan_with_val_miss ST_Slope    0
dtype: int64


## 1.2 Variables quantitatives

In [59]:
quant_var_with_na = [var for var in QUANT_VAR 
                    if data[var].isnull().sum() > 0]

# Affichier la moyenne des données manquantes par colonne
mean_per_col = data[quant_var_with_na].isnull().mean().sort_values(ascending=False)
print("la moyenne des données manquantes par colonne : ")
print("QUANT_VAR_WITH_NA : ", quant_var_with_na)
print(mean_per_col)


la moyenne des données manquantes par colonne : 
QUANT_VAR_WITH_NA :  ['Cholesterol', 'MaxHR']
Cholesterol    0.375867
MaxHR          0.006935
dtype: float64


In [60]:
for var in quant_var_with_na:
    mean_val = data[var].mean()
    
    print("mean ({}) est : {}".format(var, mean_val))
    
    # On ajoute un indicateur des valeurs manquantes 
    # Dans certains cas les valeurs manquantes peuvent avoir un impact sur la prédiction 
    X_train[var+"_indc_na"] = np.where(X_train[var].isnull(), 1, 0) 
    X_test[var+"_indc_na"] = np.where(X_test[var].isnull(), 1, 0) 
    
    
    # remplacer les valeurs manquantes par la moyenne
    X_train[var].fillna(mean_val, inplace=True)
    X_test[var].fillna(mean_val, inplace=True)
    
# check miss values
print()
print("Pourcentage des nan valeurs par colonnes")
print(X_train[quant_var_with_na].isnull().sum())
    


mean (Cholesterol) est : 199.01555555555555
mean (MaxHR) est : 137.19553072625698

Pourcentage des nan valeurs par colonnes
Cholesterol    0
MaxHR          0
dtype: int64


# 2. Variables asymétriques

# 2.1 Transformation Yeo-Johnson 

In [65]:
# Entrainement du transformeur sur X_train
X_train["RestingBP"], param = stats.yeojohnson(X_train["RestingBP"])

# inférence des paramètres du transformeur 
X_test["RestingBP"] = stats.yeojohnson(X_test["RestingBP"], lmbda=param)
print("VAR_YEO-JOHNSO = ", ["RestingBP"])
print("param : ", param)

VAR_YEO-JOHNSO =  ['RestingBP']
param :  1.043994005071975


# 3. Variables qualitatives : mapping/one-hot encoding

## 3.1 mapping

In [62]:
# RestingECG  mapping : 
map_RestingECG = {"Missing": 0, "NA": 0, "Normal":1, "ST": 2, "LVH":3}
X_train["RestingECG"] = X_train["RestingECG"].map(map_RestingECG)
X_test["RestingECG"] = X_test["RestingECG"].map(map_RestingECG)

# Sex mapping
map_Sex = {"Missing": 0, "NA": 0, "M":1, "F":2}
X_train["Sex"] = X_train["Sex"].map(map_Sex)
X_test["Sex"] = X_test["Sex"].map(map_Sex)

# ExerciseAngina
map_ExerciseAngina = {"Missing": 0, "NA": 0, "N":1, "Y":2}
X_train["ExerciseAngina"] = X_train["ExerciseAngina"].map(map_ExerciseAngina)
X_test["ExerciseAngina"] = X_test["ExerciseAngina"].map(map_ExerciseAngina)

# 3.2 one-hot encoding

In [63]:
CAT_VAR_NOM = ['ChestPainType', 'ST_Slope']

for var in CAT_VAR_NOM:
    X_train= pd.get_dummies(X_train, prefix=var, columns=[var])
    X_test= pd.get_dummies(X_test, prefix=var, columns=[var])

# 4. Entrainement d'un premier modèle

In [64]:
# Régression logistique 
model = LogisticRegression(random_state=123)

# train the model
clf = model.fit(X_train, y_train)

# predict test data
y_pred = clf.predict(X_test)

# score model
print("accracy : ", accuracy_score(y_test, y_pred))

accracy :  0.8493150684931506


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
