# I - Code d'origine

 source du code d'origine : https://www.kaggle.com/code/setyoab/heart-attack-analysis-using-logisticregression

In [None]:
# On importe les librairies

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd



In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# On importe le dataset 

df = pd.read_csv("heart.csv")
print(f"data shape : {df.shape}")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# heatmap de corrélation

plt.figure(figsize = (15, 8))
sns.heatmap(df.corr(), annot = True, fmt = ".2f")


## EDA

In [None]:
fig,axes = plt.subplots(2,3,figsize=(20,12))
col = ['sex', 'thall', 'exng']
sns.countplot(ax = axes[0, 0], data = df, x = 'sex', hue ='output')
sns.countplot(ax = axes[0, 1],data = df, x = 'thall', hue = 'output')
sns.countplot(ax = axes[0, 2],data = df, x = 'exng', hue = 'output')
sns.countplot(ax = axes[1, 0],data = df, x = 'cp', hue = 'output')
sns.countplot(ax = axes[1, 1],data = df, x = 'slp', hue = 'output')
sns.countplot(ax = axes[1, 2],data = df, x = 'restecg', hue = 'output')


# adding title to each subplots
axes[0,0].set_title('sex based on output')
axes[0,1].set_title('thall based on output')
axes[0,2].set_title('exng based on output')
axes[1,0].set_title('chest pain based on output')
axes[1,1].set_title('slp based on output')
axes[1,2].set_title('rest ecg based on output')


In [None]:
plt.figure(figsize=(25,15))
sns.countplot(data = df, y = df['age'])
plt.title('Age countplot')

In [None]:
plt.figure(figsize=(10,8))
sns.displot(data = df, x = 'thalachh', kde = True, hue = 'output')
plt.title('Thalach distribution based on output')
plt.show()

In [None]:
sns.displot(data = df, x = 'chol', kde = True, hue = 'output')
plt.title('cholesterol based on output')

## Création du Modèle

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
X = df.drop("output",  axis = 1)
y = df["output"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
log_model = LogisticRegression(solver = "lbfgs",
                               max_iter = 1000)

log_model.fit(X_train, y_train)

In [None]:
pred = log_model.predict(X_test)

In [None]:
print(classification_report(y_test, pred))

In [None]:
print(f"accuracy on test set : {log_model.score(X_test, y_test)}")

In [None]:
evaluation(log_model)

In [None]:
# Création courbes ROC
# Initialiser les modèles
models = [log_model]
model_names = ['Logistic Regression non optimisée']

# Boucle sur les modèles pour les entraîner et prédire les probabilités
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)[:,1]
    
    # Calculer la courbe ROC et AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)
    
    # Afficher la courbe ROC
    plt.figure(figsize=(10,8))
    plt.plot(fpr, tpr, label=name + ', AUC = %0.3f' % auc)
    
# Ajouter des annotations et légendes

plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbe ROC pour Logistic Regression non optimisée')
plt.legend()

# Afficher la figure
plt.show()

# II - Phase d'optimisation

## Analyse de la colonne target
On va analyser la répartition des deux valeures target, afin ,de voir si notre dataset est équilibré ou non

In [None]:
# On vérifie la distribution de notre target

df["output"].value_counts(normalize=True)

On voit que notre dataset est plutot équilibré, nous n'avons donc pas besoin de faire un travail de traitement des données dans ce sens

In [None]:
# Recherche de corrélations par rapport à notre target

cor_matrix = df.corr()
cor_matrix["output"].sort_values(ascending = False)

On voit que 3 variables ont une corrélation positive sur notre target :
 - cp : 0.434 
 - thalachh : 0.422 
 - slp : 0.346

3 variables ont aussi une corrélation négative sur notre target : 
- exng : -0.437
- oldpeak : -0.431
- caa : -0.392

Nous n'avons aucune corralation linéaire vis à vis de notre target.

## Preprocessing

In [None]:
# Création d'un pipeline

from sklearn.pipeline import make_pipeline

# Utilisation de RobusScaller 
from sklearn.preprocessing import RobustScaler

# Import de nos modèles de machine learning

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix, classification_report

from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV

import numpy as np

import pickle

In [None]:
# Création de notre dataset d'entrainement et de validation

X = df.drop("output",  axis = 1)
y = df["output"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Creation de nos différents modèles

model_log_Reg = make_pipeline(RobustScaler(), 
                      LogisticRegression(max_iter = 10000))

model_dec_tree_class = make_pipeline(RobustScaler(), 
                      DecisionTreeClassifier())

model_SVC = make_pipeline(RobustScaler(), 
                      SVC(probability = True))

model_grad_boost = make_pipeline(RobustScaler(), 
                     GradientBoostingClassifier())


### Test des modèles sans optimisation des hyper paramètres

In [None]:
# Création courbes ROC
# Initialiser les modèles
models = [model_log_Reg, model_dec_tree_class, model_SVC, model_grad_boost]
model_names = ['LogReg', 'DecisionTree', 'SVC', "GradientBoostingClassifier"]

# Boucle sur les modèles pour les entraîner et prédire les probabilités
for model, name in zip(models, model_names):


    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)[:,1]
    
    # Calculer la courbe ROC et AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)
    
    # Afficher la courbe ROC
    
    plt.plot(fpr, tpr, label=name + ', AUC = %0.3f' % auc)
    
# Ajouter des annotations et légendes
    
    
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbe ROC pour différents modèles de machine learning')
plt.legend()

# Afficher la figure
plt.show()

In [None]:
# Courbe d'apprentissage

fig, axes = plt.subplots(1, len(models), figsize=(15, 5))
for i, model,  in enumerate(models, ):
    train_sizes, train_scores, val_scores = learning_curve(model, X_train, y_train, train_sizes=[0.1, 0.3, 0.5, 0.7, 0.9], cv=5)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)
    val_scores_std = np.std(val_scores, axis=1)
    axes[i].set_title(type(model).__name__)
    axes[i].set_xlabel("Training Examples")
    axes[i].set_ylabel("Score")
    axes[i].grid()
    axes[i].fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    axes[i].fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color="g")
    axes[i].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Score")
    axes[i].plot(train_sizes, val_scores_mean, 'o-', color="g", label="validation Score")
plt.legend(loc="best")

### Optimisation de nos modèles avec GridSearchCv

In [None]:
def evaluation(model):
    
    #model.fit(X_train,y_train)
    ypred=model.predict(X_test)
    
    print(f"Matrix de confusion {confusion_matrix(y_test,ypred)}")
    print(f"Report Classification {classification_report(y_test,ypred)}")
    
    N,train_score,val_score=learning_curve(model,X_train,y_train,cv=4,train_sizes=np.linspace(0.1,1,10),
                                            scoring="precision")
    
    plt.figure(figsize=(10,5))
    plt.plot(N,train_score.mean(axis=1),label="Train score")
    plt.plot(N,val_score.mean(axis=1),label="Validation score")
    plt.legend()

In [None]:
# Création de nos paramètres

param_grid_SVC = {'svc__C': (0.001, 0.01, 0.1, 1, 10, 100), 
                  'svc__kernel': ['linear', 'rbf', "sigmoid", "poly"], 
                  'svc__gamma': ["scale", "auto"],
                  "svc__degree": range(1,5)}

param_grid_log_reg = {"logisticregression__penalty": ["l1", "l2", "elasticnet"],
                      "logisticregression__C": (0.001, 0.01, 0.1, 1, 10, 100),
                      "logisticregression__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga", "newton-cholesky"]
                      
                      }

param_grid_dec_tree = {"decisiontreeclassifier__max_depth": range(1, 100, 10),
                       "decisiontreeclassifier__min_samples_split": range(1, 100, 10),
                       "decisiontreeclassifier__min_samples_leaf": range(1, 100, 10),
                       "decisiontreeclassifier__max_features": range(1, 100, 10),
                       "decisiontreeclassifier__criterion": ["gini", "entropy"]


}

param_grid_grad_boost = {"gradientboostingclassifier__loss": ["log_loss", "deviance", "exponential"],
                         "gradientboostingclassifier__learning_rate": (0.0001,0.001, 0.01, 0.1, 1, 1.001, 1.01, 1.1),
                         "gradientboostingclassifier__criterion": ["friedman_mse", "squarred_error"],
                         "gradientboostingclassifier__max_depth": range(1, 100),
                         "gradientboostingclassifier__max_features": ["auto", "sqrt", "log2"],
                         "gradientboostingclassifier__ccp_alpha": (0.0001, 0.001, 0.01, 0.1, 1)


}


In [None]:

def optimisation(model, param_grid):

    grid = GridSearchCV(model, param_grid= param_grid, cv = 5, scoring = "precision", verbose = 2, n_jobs = -1)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_score_)
    print(grid.best_estimator_)
    return (grid.best_estimator_)

### Logistic Regression

In [None]:
best_model_log_reg = optimisation(model_log_Reg, param_grid_log_reg)


In [None]:
evaluation(best_model_log_reg)

In [None]:
#Sauvegarde du modèle

with open('modeles/LogisticRegression.pkl', 'wb') as file:
    pickle.dump(best_model_log_reg, file)

### Decision Tree

In [None]:
best_model_decision_tree = optimisation(model_dec_tree_class, param_grid_dec_tree)

In [None]:
evaluation(best_model_decision_tree)

In [None]:
# Sauvegarde du modèle

with open('modeles/DecisionTree.pkl', 'wb') as file:
    pickle.dump(best_model_decision_tree, file)



# GradientBoostingClassifier

In [None]:
best_model_grad_boost = optimisation(model_grad_boost, param_grid_grad_boost)

In [None]:
evaluation(best_model_grad_boost)

In [None]:
# Sauvegarde du modèle

with open('modeles/GradientBoost.pkl', 'wb') as file:
    pickle.dump(best_model_grad_boost, file)

### SVC

In [None]:
best_model_SVC = optimisation(model_SVC, param_grid_SVC)

In [None]:
evaluation(best_model_SVC)

In [None]:
# Sauvegarde du modèle

with open('modeles/SVC.pkl', 'wb') as file:
    pickle.dump(best_model_SVC, file)

### Courbes ROC de l'ensemble de nos modèles

In [None]:
# On charge nos modèles

with open('modeles/LogisticRegression.pkl', 'rb') as file:
    model_log_reg = pickle.load(file)

with open('modeles/SVC.pkl', 'rb') as file:
    model_SVC = pickle.load(file)

with open('modeles/DecisionTree.pkl', 'rb') as file:
    model_decision_tree = pickle.load(file)

with open('modeles/GradientBoost.pkl', 'rb') as file:
    model_grad_boost = pickle.load(file)


In [None]:
# Création courbes ROC
# Initialiser les modèles
models = [model_decision_tree, model_log_reg, model_SVC, model_grad_boost]
model_names = ['Decision Tree', 'Logistic Regression', 'SVC', "Grad Boost"]

# Boucle sur les modèles pour les entraîner et prédire les probabilités
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)[:,1]
    
    # Calculer la courbe ROC et AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)
    
    # Afficher la courbe ROC
    plt.plot(fpr, tpr, label=name + ', AUC = %0.3f' % auc)
    
# Ajouter des annotations et légendes
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbe ROC pour différents modèles de machine learning')
plt.legend()

# Afficher la figure
plt.show()

# Création de l'API

In [1]:
from flask import Flask, request, jsonify
import pickle
import numpy as np

In [2]:
# On charge le modèle SVC

model = pickle.load(open('modeles/GradientBoost.pkl', 'rb'))




In [3]:
# Initialisation de flask

app = Flask(__name__)

In [4]:
# Création des routes

@app.route("/", methods = ["GET"])
def index():
    return "API de prédiction des risques d'arrets cardiaque "

@app.route("/prediction_simple", methods=["POST"])
def prediction_simple():
    
    # recevoir les données en tableau numpy
    
    data = request.get_json(force = True)    
    
   # Faire la prédiction

    predict_request = model.predict(data["data"])

    # Renvoyer la réponse au format JSON

    output = {"predict_request": int(predict_request)}
    return jsonify(output)

@app.route("/prediction_multiple", methods=["POST"])
def prediction_multiple():

    # recevoir les données en tableau numpy

    data_multi = request.get_json(force = True)
    input_data = np.array(data_multi["data"].reshape(-1, 1))

    # Faire prédiction

    #multi_predict_request = model.predict(data_multi["data"].reshape(1, -1))
    predictions = model.predict(input_data)

    # Renvoyer la réponse au format JSON

    multi_output = {"predictions": predictions.tolist()}
    return jsonify(multi_output)


In [5]:
if __name__ == "__main__":
    app.run(debug = True,host = "0.0.0.0", use_reloader = True) 

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.159:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
