In [1]:
import pandas as pd
import seaborn as sns

In [4]:
data = pd.read_csv("BMI_IOS_SCD_Asthma.csv")
data

Unnamed: 0,Group,Subject ID,Observation_number,Hydroxyurea,Asthma,ICS,LABA,Gender,Age (months),Height (cm),Weight (Kg),BMI,R5Hz_PP,R20Hz_PP,X5Hz_PP,Fres_PP
0,C-SCD,1,1,Yes,Yes,Yes,No,Male,239,164.1,61.5,22.84,145,133,-456.0,
1,C-SCD,1,2,Yes,Yes,Yes,No,Male,193,162.7,62.3,23.53,103,98,111.0,169.0
2,C-SCD,1,3,Yes,Yes,Yes,Yes,Male,212,163.5,63.1,23.60,107,98,174.0,159.0
3,C-SCD,1,4,Yes,Yes,Yes,Yes,Male,224,163.8,63.7,23.74,87,87,-303.0,
4,C-SCD,2,1,No,No,No,No,Female,204,154.5,66.4,27.82,124,121,98.0,135.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,C-Asthma,87,3,No,Yes,Yes,Yes,male,104,124.0,25.7,16.71,60,72,81.0,94.0
215,C-Asthma,88,1,No,Yes,No,No,Female,138,160.0,50.6,19.77,84,86,96.0,136.0
216,C-Asthma,89,1,No,Yes,Yes,Yes,male,93,133.0,30.8,17.41,67,67,73.0,99.0
217,C-Asthma,89,2,No,Yes,Yes,No,male,90,135.0,31.3,17.17,104,83,156.0,120.0


In [27]:
data.columns

Index(['Subject ID', 'Hydroxyurea', 'Asthma', 'ICS', 'LABA', 'Gender',
       'Age (months)', 'Height (cm)', 'Weight (Kg)', 'BMI', 'R5Hz_PP',
       'R20Hz_PP', 'X5Hz_PP', 'Fres_PP'],
      dtype='object')

In [3]:
# Suppression des colonens inutiles
data.drop(columns=['Observation_number'], inplace=True)
data.drop(columns=['Group'], inplace=True)

In [4]:
# Remplacements des valeurs manquantes de la colonne Fres_PP par la moyenne
if 'Fres_PP' in data.columns and data['Fres_PP'].isnull().any():
    mean_fres_pp = data['Fres_PP'].mean()
    data['Fres_PP'] = data['Fres_PP'].fillna(round(mean_fres_pp, 1))

In [5]:
# Check des doublons de valeurs
data.duplicated().value_counts()

False    219
Name: count, dtype: int64

In [6]:
# Suppression des doublons de valeurs 
data.drop_duplicates(inplace=True)

In [7]:
data.dtypes

Subject ID        int64
Hydroxyurea      object
Asthma           object
ICS              object
LABA             object
Gender           object
Age (months)      int64
Height (cm)     float64
Weight (Kg)     float64
BMI             float64
R5Hz_PP           int64
R20Hz_PP          int64
X5Hz_PP         float64
Fres_PP         float64
dtype: object

In [8]:
data.isna().sum()

Subject ID      0
Hydroxyurea     0
Asthma          0
ICS             0
LABA            0
Gender          0
Age (months)    0
Height (cm)     0
Weight (Kg)     0
BMI             0
R5Hz_PP         0
R20Hz_PP        0
X5Hz_PP         0
Fres_PP         0
dtype: int64

In [9]:
data.Gender.unique()

array(['Male', 'Female', 'male'], dtype=object)

In [10]:
# Remplacement des valeurs "male" par "Male" dans la colonne "Gender"
data['Gender'] = data['Gender'].replace('male', 'Male')

In [11]:
data.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [12]:
# Conversion des colonnes catégorielles en numériques
data['Hydroxyurea'] = data['Hydroxyurea'].map({'Yes': 1, 'No': 0})
data['Asthma'] = data['Asthma'].map({'Yes': 1, 'No': 0})
data['ICS'] = data['ICS'].map({'Yes': 1, 'No': 0})
data['LABA'] = data['LABA'].map({'Yes': 1, 'No': 0})

In [13]:
data.value_counts()

Subject ID  Hydroxyurea  Asthma  ICS  LABA  Gender  Age (months)  Height (cm)  Weight (Kg)  BMI    R5Hz_PP  R20Hz_PP  X5Hz_PP  Fres_PP
1           1            1       1    0     Male    193           162.7        62.3         23.53  103      98        111.0    169.0      1
51          0            1       1    0     Male    154           154.3        53.7         22.55  142      153       148.0    165.0      1
48          1            0       0    0     Female  128           155.4        44.5         18.43  103      95        145.0    142.0      1
                                                    151           165.5        53.4         19.50  97       95        138.0    134.0      1
49          1            1       0    0     Male    63            107.5        17.5         15.14  123      85        13.0     97.0       1
                                                                                                                                         ..
27          1            

In [None]:
# Sauvegarde du dataset nettoyé
data.to_csv("BMI_IOS_SCD_Asthma_Clean.csv", index=False)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle

In [16]:
df = pd.read_csv("BMI_IOS_SCD_Asthma_Clean.csv")
df

Unnamed: 0,Subject ID,Hydroxyurea,Asthma,ICS,LABA,Gender,Age (months),Height (cm),Weight (Kg),BMI,R5Hz_PP,R20Hz_PP,X5Hz_PP,Fres_PP
0,1,1,1,1,0,Male,239,164.1,61.5,22.84,145,133,-456.0,132.3
1,1,1,1,1,0,Male,193,162.7,62.3,23.53,103,98,111.0,169.0
2,1,1,1,1,1,Male,212,163.5,63.1,23.60,107,98,174.0,159.0
3,1,1,1,1,1,Male,224,163.8,63.7,23.74,87,87,-303.0,132.3
4,2,0,0,0,0,Female,204,154.5,66.4,27.82,124,121,98.0,135.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,87,0,1,1,1,Male,104,124.0,25.7,16.71,60,72,81.0,94.0
215,88,0,1,0,0,Female,138,160.0,50.6,19.77,84,86,96.0,136.0
216,89,0,1,1,1,Male,93,133.0,30.8,17.41,67,67,73.0,99.0
217,89,0,1,1,0,Male,90,135.0,31.3,17.17,104,83,156.0,120.0


In [17]:
# Séparation des variables secondaires et de la variable cible
X = df.drop(columns=['Asthma'])
y = df['Asthma']

In [18]:
# Identification des colonnes numériques et catégorielles
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorial_features = ["Gender"]

In [19]:
# Création du preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorial_features)
    ]
)

In [20]:
# Création du modèle
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [21]:
# Division du jeu de données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Training
model.fit(X_train, y_train)

In [23]:
# Testing
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [24]:
y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1],
      dtype=int64)

In [25]:
accuracy

0.8636363636363636

In [26]:
pickle.dump(model, open('model_asthme.pkl', 'wb'))