# 0. Importer les librairies / Lecture des données

In [226]:
import pandas as pd
import numpy as np

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

pd.pandas.set_option("display.max_columns", None)

data = pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["HeartDisease"], axis=1),
    data["HeartDisease"], 
    test_size=0.1,
    random_state=123
)
X_train.shape, X_test.shape

((648, 11), (73, 11))

# 1. Configuration

In [227]:
# Configuration données :manquantes 
CAT_NAN_WITH_VAL_MISS =  ['ST_Slope']
CAT_NAN_FREQUENT_VAL =  ['ChestPainType']
QUANT_VAR_WITH_NA = ['Cholesterol', 'MaxHR']
                         
                         
# Congfiguration des transformations de distributions
VAR_YEO_JOHNSON = ["RestingBP"]
lmbda=-12.55283001172003
                         
# configuration des mapping de variables qualitatives
map_variables = {
    "RestingECG": {"Missing": 0, "NA": 0, "Normal":1, "ST": 2, "LVH":3}, 
    "Sex":{"Missing": 0, "NA": 0, "M":1, "F":2}, 
    "ExerciseAngina" : {"Missing": 0, "NA": 0, "N":1, "Y":2}  
    }       

# Cofiguration du one-hot encoding
CAT_VAR_ONE_HOT = ['ChestPainType', 'ST_Slope']
              
# Configuration de random_state
RANDOM_STATE = 123

# Configuration du paramètre de la régression logistique 
C=1.0

# 2. Pipeline : entrainement

## 2.1 Nétoyage de données en pipeline

In [228]:
# Step 1: 
def CategoricalImputer_missing(df, variables):
    df[variables] = df[variables].fillna("Missing")

# Step 2: 
def CategoricalImputer_frequent(df, variables):
    for var in variables:
        mode = df[var].mode()[0]
        df[var].fillna(mode, inplace=True)
        
# Step 3: 
def addMissingIndicator(df, variables):
    for var in variables:
        df[var+"_indc_na"] = np.where(df[var].isnull(), 1, 0) 
        
# Step4 :
def addMeadianImputer(df, variables):
    for var in variables:
        mean = df[var].mean()
        df[var].fillna(mean, inplace=True)

# Step 5:
def YeoJohnsonTransformation(df, variables, param):
    for var in variables:
        df[var] = stats.yeojohnson(df[var], lmbda=param)
        
# Step 6:
def mapping_var(df, dict_mapping):
    for var in dict_mapping.keys():
        mapping = dict_mapping[var]
        df[var] = df[var].map(mapping)

# Step 7 
def oneHotEncoding(df, variables):
    for var in variables:
        df[var] = pd.get_dummies(df, prefix=var, columns=[var])

In [229]:
# Let's apply pipeline of cleaning data  
def pipeline_cleaning_data(X_train):
    # Step 1: 
    CategoricalImputer_missing(X_train, CAT_NAN_WITH_VAL_MISS)
    # Step 2 : 
    CategoricalImputer_frequent(X_train, CAT_NAN_FREQUENT_VAL)
    # Step 3 : 
    addMissingIndicator(X_train, QUANT_VAR_WITH_NA)
    # Step 4 : 
    addMeadianImputer(X_train, QUANT_VAR_WITH_NA)
    # Step 5
    YeoJohnsonTransformation(X_train, VAR_YEO_JOHNSON, lmbda)
    # Step 6
    mapping_var(X_train, map_variables)
    # Step 7
    oneHotEncoding(X_train, CAT_VAR_ONE_HOT)
    
pipeline_cleaning_data(X_train)
pipeline_cleaning_data(X_test)

## 2.2 Entrainement et évaluation du modèle 

In [230]:
def train_model(X_train, y_train, X_test, y_test, C, random_state):
    clf = LogisticRegression(C=C, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    print("accuracy training : ", accuracy_score(y_train, y_pred))
    y_pred = clf.predict(X_test)
    print("accuracy testing : ", accuracy_score(y_test, y_pred))
    return clf

def save_model(clf):
    joblib.dump(clf, "logistic_regression_model.joblib")
    
clf = train_model(X_train, y_train, X_test, y_test, C, RANDOM_STATE)
save_model(clf)

accuracy training :  0.8070987654320988
accuracy testing :  0.8493150684931506


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# 3. Pipeline : inférence

In [231]:
# Lecture des données de test d'inférence 
data_test = pd.read_csv("test.csv")
col_x_test = [col for col in data_test.columns if col != "HeartDisease"]
X_test = data[col_x_test]
y_test = data["HeartDisease"]
X_test.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289.0,0,Normal,172.0,N,0.0,Up
1,49,F,NAP,160,,0,Normal,156.0,N,1.0,Flat
2,37,M,ATA,130,283.0,0,ST,98.0,N,0.0,
3,48,F,ASY,138,,0,Normal,108.0,Y,1.5,Flat
4,54,M,NAP,150,195.0,0,Normal,122.0,N,0.0,Up


In [232]:
# Transformation des donées
pipeline_cleaning_data(X_test)
clf = joblib.load("logistic_regression_model.joblib")
y_pred = clf.predict(X_test)
print("accuracy of inference data : ", accuracy_score(y_test, y_pred))

accuracy of inference data :  0.811373092926491


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
