In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two source files and stack them into a single frame
path = "../Data/"
df_source1 = pd.read_csv(path + "Sleep_health_and_lifestyle_dataset.csv")
df_source2 = pd.read_csv(path + "Sleep_health_and_lifestyle_dataset_part_2.csv")

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# each file keeps its own 0-based labels, so a label-based drop is ambiguous
# and could remove one row from each file.
df = pd.concat([df_source1, df_source2], ignore_index=True)

# Person ID 374 is a duplicate: drop the row labelled 373, then renumber
df = df.drop(index=373).reset_index(drop=True)

In [3]:
# Column renames: remove the spaces from the raw headers
mapper = dict([
    ("Person ID", "PersonID"),
    ("Sleep Duration", "SleepDuration"),
    ("Quality of Sleep", "QualitySleep"),
    ("Physical Activity Level", "PhysicalActivityLevel"),
    ("Stress Level", "StressLevel"),
    ("BMI Category", "BMI_Category"),
    ("Blood Pressure", "BloodPressure"),
    ("Heart Rate", "HeartRate"),
    ("Daily Steps", "DailySteps"),
    ("Sleep Disorder", "SleepDisorder"),
])

# Text columns are read in as object by default; cast them to the string dtype
new_types = dict.fromkeys(
    ["Gender", "Occupation", "BMI_Category", "BloodPressure", "SleepDisorder"],
    "string",
)

df = df.rename(columns=mapper).astype(new_types)

In [None]:
# SleepDisorder has no genuinely missing values: per the original note, the
# source file writes "no disorder" as the text "None". Depending on the pandas
# version's default NA markers, read_csv either parses that text as NA or keeps
# it as the literal string "None". Both forms are turned into an explicit
# "Healthy" category so the later label mapping never meets an unmapped value.
df["SleepDisorder"] = df["SleepDisorder"].fillna("Healthy").replace("None", "Healthy")

# Check that no other column has missing data
df.isna().sum()

PersonID                 0
Gender                   0
Age                      0
Occupation               0
SleepDuration            0
QualitySleep             0
PhysicalActivityLevel    0
StressLevel              0
BMI_Category             0
BloodPressure            0
HeartRate                0
DailySteps               0
SleepDisorder            0
dtype: int64

In [5]:
# Split the "systolic/diastolic" blood-pressure text into two integer columns.
# Vectorised: no per-row .loc writes, no dependence on a 0..n-1 index, and the
# result is int64 rather than the float columns a cell-by-cell fill produces.
pressure_parts = df["BloodPressure"].str.split("/", expand=True)
df["BloodPressureSystolic"] = pressure_parts[0].astype("int64")
df["BloodPressureDiastolic"] = pressure_parts[1].astype("int64")
df = df.drop(columns="BloodPressure")

# Encodage des variables catégorielles
def OneHot(df, columns):
    '''
    One-hot encode categorical columns of ``df``.

    For every distinct non-missing value of each listed column, an int64 0/1
    indicator column is added. The indicator is named after the value with
    its spaces removed (no column-name prefix), so a value whose name matches
    an existing column, or another encoded value, overwrites that column.
    Rows where the source value is missing get 0 in all of that column's
    indicators. The source columns are dropped once encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    columns : str or list of str
        Name(s) of the categorical column(s) to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns appended and the
        source columns removed.
    '''
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    for column in columns:
        # Missing values are skipped: NaN / pd.NA have no .replace(), so
        # iterating over them would raise AttributeError.
        for value in df[column].dropna().unique():
            # str() lets non-string categories (e.g. integers) be encoded too.
            name = str(value).replace(" ", "")
            # isin() yields a plain boolean mask even for nullable dtypes.
            df[name] = df[column].isin([value]).astype("int64")

    df.drop(columns=columns, inplace=True)

    return df

df = OneHot(df, ["Gender", "Occupation", "BMI_Category"])

# Encode the target variable SleepDisorder in two ways
# 3 labels: healthy, insomnia, sleep apnea
encoding = {"Healthy": 0, "Insomnia": 1, "Sleep Apnea": 2}
df["SleepDisorderEncoded"] = df["SleepDisorder"].map(encoding)

# 2 labels: with vs. without a sleep disorder
encoding = {"Healthy": 0, "Insomnia": 1, "Sleep Apnea": 1}
df["HasSleepDisorder"] = df["SleepDisorder"].map(encoding)

# The raw text label is no longer needed once both encodings exist
df = df.drop(columns="SleepDisorder")

In [34]:
# Train/test split, stratified on the 3-class label.
# Features: every column except the identifier and the two target encodings.
# Targets: both encodings are kept so each model can pick the one it needs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["PersonID", "SleepDisorderEncoded", "HasSleepDisorder"]),
    df.loc[:, ["SleepDisorderEncoded", "HasSleepDisorder"]],
    stratify=df["SleepDisorderEncoded"],
    random_state=42,
)

In [71]:
# Linear SVM on the binary target (HasSleepDisorder)
SVM_HAS = SVC(kernel="linear", max_iter=100_000_000, random_state=42)
SVM_HAS.fit(X_train, y_train["HasSleepDisorder"])
pred_train, pred_test = SVM_HAS.predict(X_train), SVM_HAS.predict(X_test)
# Test accuracy: share of test rows whose prediction equals the true label
int((y_test["HasSleepDisorder"] == pred_test).sum()) / len(y_test)

0.9214285714285714

In [73]:
# Linear SVM on the 3-class target (SleepDisorderEncoded)
SVM_MULTI = SVC(kernel="linear", max_iter=100_000_000, random_state=42)
SVM_MULTI.fit(X_train, y_train["SleepDisorderEncoded"])
pred_train, pred_test = SVM_MULTI.predict(X_train), SVM_MULTI.predict(X_test)
# Test accuracy: share of test rows whose prediction equals the true label
int((y_test["SleepDisorderEncoded"] == pred_test).sum()) / len(y_test)

0.9142857142857143

In [78]:
# Decision tree with default settings on the 3-class target
TREE_MULTI = DecisionTreeClassifier(random_state=42)
TREE_MULTI.fit(X_train, y_train["SleepDisorderEncoded"])
pred_train, pred_test = TREE_MULTI.predict(X_train), TREE_MULTI.predict(X_test)
# Test accuracy: share of test rows whose prediction equals the true label
int((y_test["SleepDisorderEncoded"] == pred_test).sum()) / len(y_test)

0.8928571428571429

In [89]:
# Decision tree with cost-complexity pruning (ccp_alpha=0.01) on the 3-class
# target. NOTE: this rebinds TREE_MULTI, replacing the tree fitted with
# default settings.
TREE_MULTI = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)
TREE_MULTI.fit(X_train, y_train["SleepDisorderEncoded"])
pred_train, pred_test = TREE_MULTI.predict(X_train), TREE_MULTI.predict(X_test)
# Test accuracy: share of test rows whose prediction equals the true label
int((y_test["SleepDisorderEncoded"] == pred_test).sum()) / len(y_test)

0.9285714285714286

In [91]:
# Random forest with default settings on the 3-class target
FOREST_MULTI = RandomForestClassifier(random_state=42)
FOREST_MULTI.fit(X_train, y_train["SleepDisorderEncoded"])
pred_train, pred_test = FOREST_MULTI.predict(X_train), FOREST_MULTI.predict(X_test)
# Test accuracy: share of test rows whose prediction equals the true label
int((y_test["SleepDisorderEncoded"] == pred_test).sum()) / len(y_test)

0.9214285714285714