In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

# Dataset

In [5]:
# Read the Excel workbook; `data` keeps the table exactly as loaded.
data = pd.read_excel('./data/Coeur.xlsx')

# Work on an independent (deep) copy so the preprocessing below never touches `data`.
df = data.copy(deep=True)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,AGE,SEXE,TDT,PAR,CHOLESTEROL,GAJ,ECG,FCMAX,ANGINE,DEPRESSION,PENTE,CŒUR
0,40,homme,AA,140,289,0,Normal,172,Non,0.0,Ascendant,0
1,49,femme,DNA,160,180,0,Normal,156,Non,1.0,Plat,1
2,37,homme,AA,130,283,0,ST,98,Non,0.0,Ascendant,0
3,48,femme,ASY,138,214,0,Normal,108,Oui,1.5,Plat,1
4,54,homme,DNA,150,195,0,Normal,122,Non,0.0,Ascendant,0


# Normalisation des variables continues

In [6]:
# Z-score every numeric feature column in place; the target column 'CŒUR' is
# excluded so its labels stay untouched.
# NOTE(review): the mean and standard deviation are computed over all rows,
# i.e. before the data is divided into train and test sets.
feature_frame = df.drop(columns=['CŒUR'])
numeric_cols = feature_frame.select_dtypes(include=np.number).columns
for name in numeric_cols:
    centred = df[name] - df[name].mean()
    # The column has not been reassigned yet, so std() is still taken on the raw values.
    df[name] = centred / df[name].std()

# Recodage des variables discrètes

In [7]:
# Replace each text (object-dtype) column with the integer codes of its
# categories, so that every column of `df` is numeric afterwards.
categorical_cols = df.select_dtypes(include='object').columns
for name in categorical_cols:
    as_category = df[name].astype('category')
    df[name] = as_category.cat.codes

# Sélection des meilleures variables

In [84]:
# Keep the two best-scoring feature columns according to SelectKBest's default
# scoring function. Despite its name, `selector` holds the reduced feature
# matrix returned by fit_transform, not the fitted SelectKBest object.
features = df.drop(columns=['CŒUR'])
target = df['CŒUR']
kbest = SelectKBest(k=2)
selector = kbest.fit_transform(features, target)

# Division du dataset

In [85]:
# Split the complete feature table first, so that the held-out rows take no
# part in choosing which columns the models see. Selecting features on every
# row before splitting lets the test labels influence the model (data leakage).
features = df.drop(columns=["CŒUR"])
target = df["CŒUR"]
features_train, features_test, y_train, y_test = train_test_split(
    features, target, random_state=0, test_size=0.2
)

# Keep as many columns as the selection step above kept, but score the columns
# on the training rows only; the fitted choice is then applied unchanged to the
# test rows. `selector` (fitted on every row) is used here only for its width.
n_selected = selector.shape[1]
train_only_selection = SelectKBest(k=n_selected).fit(features_train, y_train)
X_train = train_only_selection.transform(features_train)
X_test = train_only_selection.transform(features_test)

# Régression logistique

In [86]:
# Build a logistic-regression classifier with default settings and train it on
# the training split; fit() returns the estimator, so both steps are chained.
lr = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the training split (shown as the cell output).
lr.score(X_train, y_train)

0.8242506811989101

# Arbre de décision

In [87]:
# Build a decision-tree classifier (seeded for repeatable results) and train it
# on the training split; fit() returns the estimator, so both steps are chained.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Mean accuracy on the training split (shown as the cell output).
tree.score(X_train, y_train)

0.8283378746594006

# Performance des deux modèles

In [88]:
# Mean accuracy on the test split, one line per model:
# logistic regression first, then the decision tree.
for fitted_model in (lr, tree):
    print(fitted_model.score(X_test, y_test))

0.7717391304347826
0.7880434782608695


# Prédiction

In [89]:
# Predicted labels for the test split: logistic regression, then decision tree.
lr_pred, tree_pred = (clf.predict(X_test) for clf in (lr, tree))

# Matrice de confusion

In [90]:
# Confusion matrices on the test split (rows: true class, columns: predicted class).
lr_matrix = confusion_matrix(y_test, lr_pred)
tree_matrix = confusion_matrix(y_test, tree_pred)

# Logistic regression first, a separator line, then the decision tree.
print(lr_matrix)
print('------------')
print(tree_matrix)

[[58 19]
 [23 84]]
------------
[[52 25]
 [14 93]]


# Sensibilité

In [91]:
# Recall (sensitivity) on the test split, one line per model:
# logistic regression first, then the decision tree.
print(
    recall_score(y_test, lr_pred),
    recall_score(y_test, tree_pred),
    sep='\n',
)

0.7850467289719626
0.8691588785046729


# Précision

In [92]:
# Precision on the test split, one line per model:
# logistic regression first, then the decision tree.
for predictions in (lr_pred, tree_pred):
    print(precision_score(y_test, predictions))

0.8155339805825242
0.788135593220339
