<a href="https://colab.research.google.com/github/Willian2003/previsao-de-disturbios-do-sono/blob/main/ps_ligia_quality_sleep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Read the dataset from a path relative to the current working directory
DATA_PATH = "Sleep_health_and_lifestyle_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [2]:
# DataFrame summary: column names, non-null counts and dtypes
df.info()
# Descriptive statistics for the numeric columns
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


In [3]:
def preprocessing(df):
    """Return a cleaned copy of the raw sleep-health frame.

    The input frame is NOT modified. The previous implementation used
    ``inplace=True`` on the caller's frame, so calling the function a second
    time on the same object raised ``KeyError`` (the columns to drop were
    already gone) and the raw data could no longer be inspected.

    Steps performed:
      * drop the 'Person ID' and 'Sleep Disorder' columns;
      * rename 'BMI Category' -> 'BMI', 'Quality of Sleep' -> 'Quality',
        'Physical Activity Level' -> 'Activity', 'Daily Steps' -> 'Steps';
      * replace the BMI label 'Normal Weight' with 'Normal';
      * split 'Blood Pressure' on '/' into the numeric columns
        'Pressure P' (first part) and 'Pressure D' (second part), then drop
        the original 'Blood Pressure' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain 'Person ID', 'Sleep Disorder',
        'BMI Category' and 'Blood Pressure'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Raises
    ------
    KeyError
        If 'Person ID', 'Sleep Disorder', 'BMI Category' or 'Blood Pressure'
        is missing from ``df``.
    """
    # drop() and rename() both return new objects, so `df` is left untouched
    cleaned = (
        df
        .drop(columns=['Person ID', 'Sleep Disorder'])
        .rename(
            columns={
                'BMI Category': 'BMI',
                'Quality of Sleep': 'Quality',
                'Physical Activity Level': 'Activity',
                'Daily Steps': 'Steps',
            }
        )
    )

    # Collapse the two spellings of the same BMI class into one label
    cleaned['BMI'] = cleaned['BMI'].replace({'Normal Weight': 'Normal'})

    # Split the "a/b" blood-pressure strings into two numeric columns
    bp = cleaned['Blood Pressure'].str.split('/', expand=True)
    cleaned['Pressure P'] = pd.to_numeric(bp[0])
    cleaned['Pressure D'] = pd.to_numeric(bp[1])

    return cleaned.drop(columns=['Blood Pressure'])

In [5]:
# Clean the raw frame, then separate the target column from the predictors
data = preprocessing(df)
target_col = 'Quality'
y = data[target_col]
X = data.drop(columns=[target_col])

In [6]:
# One-hot encode the categorical predictors.
# The predictors are rebuilt from `data` rather than re-encoding `X` in place, so this
# cell can be re-run safely: `X = pd.get_dummies(X, ...)` raised a KeyError on a second
# run because 'Occupation', 'Gender' and 'BMI' had already been replaced by dummies.
X = pd.get_dummies(
    data.drop(columns=['Quality']),
    columns=['Occupation', 'Gender', 'BMI'],
    drop_first=True
)

In [7]:
# Inspect the predictor dtypes after encoding
X.dtypes

Unnamed: 0,0
Age,int64
Sleep Duration,float64
Activity,int64
Stress Level,int64
Heart Rate,int64
Steps,int64
Pressure P,int64
Pressure D,int64
Occupation_Doctor,bool
Occupation_Engineer,bool


In [None]:
# The single split + scaling below is only valid when training once; since we train 20 times, the split and the scaling have to happen inside the loop instead
# from sklearn.model_selection import train_test_split

#  X_train, X_test, y_train, y_test = train_test_split(
#     X, y,
#     test_size=0.2,
#     random_state=42
# )
# from sklearn.preprocessing import StandardScaler
#
# num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# scaler = StandardScaler()

# X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
# X_test[num_cols] = scaler.transform(X_test[num_cols])

### **Regressão Linear 20x**


In [8]:
# Per-seed metrics measured on the held-out TEST split
rmse_linear, mae_linear, r2_linear = [], [], []

# Per-seed metrics measured on the TRAIN split (train/test gap = overfitting check)
rmse_linear_train, r2_linear_train = [], []

for seed in range(20):
    # New 80/20 split for every seed
    X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    # Standardise only the int64/float64 columns; the scaler is fitted on the
    # training rows and then applied unchanged to the test rows
    num_cols_linear = X_train_linear.select_dtypes(include=['int64', 'float64']).columns
    scaler_linear = StandardScaler()
    X_train_linear[num_cols_linear] = scaler_linear.fit_transform(X_train_linear[num_cols_linear])
    X_test_linear[num_cols_linear] = scaler_linear.transform(X_test_linear[num_cols_linear])

    # Fit on the training rows, then predict both splits
    linear_model = LinearRegression().fit(X_train_linear, y_train_linear)
    y_pred_train_linear = linear_model.predict(X_train_linear)
    y_pred_test_linear = linear_model.predict(X_test_linear)

    # TEST metrics (RMSE is the square root of the MSE)
    mse_test_linear = mean_squared_error(y_test_linear, y_pred_test_linear)
    rmse_linear.append(mse_test_linear ** 0.5)
    mae_linear.append(mean_absolute_error(y_test_linear, y_pred_test_linear))
    r2_linear.append(r2_score(y_test_linear, y_pred_test_linear))

    # TRAIN metrics
    mse_train_linear = mean_squared_error(y_train_linear, y_pred_train_linear)
    rmse_linear_train.append(mse_train_linear ** 0.5)
    r2_linear_train.append(r2_score(y_train_linear, y_pred_train_linear))

In [9]:
# Collect the per-seed scores of the linear model into one table (one row per seed)
linear_metric_names = ['RMSE_teste', 'MAE_teste', 'R2_teste', 'RMSE_treino', 'R2_treino']
linear_metric_values = [rmse_linear, mae_linear, r2_linear, rmse_linear_train, r2_linear_train]
linear_results = pd.DataFrame(dict(zip(linear_metric_names, linear_metric_values)))

# Summary statistics across the repetitions
linear_results.describe()

Unnamed: 0,RMSE_teste,MAE_teste,R2_teste,RMSE_treino,R2_treino
count,20.0,20.0,20.0,20.0,20.0
mean,0.283377,0.162568,0.942721,0.217927,0.966362
std,0.051447,0.022017,0.020941,0.009843,0.003023
min,0.200808,0.117211,0.88839,0.19511,0.960905
25%,0.253663,0.151811,0.931089,0.211574,0.964154
50%,0.275583,0.161812,0.946085,0.217805,0.965807
75%,0.304374,0.174243,0.958788,0.225389,0.968552
max,0.389958,0.206722,0.975118,0.232328,0.972267


### **Árvore de decisão**

In [10]:
# Per-seed metrics measured on the held-out TEST split
rmse_tree, mae_tree, r2_tree = [], [], []

# Per-seed metrics measured on the TRAIN split (train/test gap = overfitting check)
rmse_tree_train, r2_tree_train = [], []

# Hyper-parameter candidates explored by the grid search in every repetition
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [5, 10, 20],
}

for seed in range(20):
    # New 80/20 split for every seed
    X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    # 5-fold grid search on the training split only, scored by (negated) RMSE
    base_tree = DecisionTreeRegressor(random_state=42)
    grid = GridSearchCV(
        estimator=base_tree,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    grid.fit(X_train_tree, y_train_tree)

    # Best estimator selected by the search
    best_tree = grid.best_estimator_

    # Predict both splits
    y_pred_train_tree = best_tree.predict(X_train_tree)
    y_pred_test_tree = best_tree.predict(X_test_tree)

    # TEST metrics (RMSE is the square root of the MSE)
    mse_test_tree = mean_squared_error(y_test_tree, y_pred_test_tree)
    rmse_tree.append(mse_test_tree ** 0.5)
    mae_tree.append(mean_absolute_error(y_test_tree, y_pred_test_tree))
    r2_tree.append(r2_score(y_test_tree, y_pred_test_tree))

    # TRAIN metrics
    mse_train_tree = mean_squared_error(y_train_tree, y_pred_train_tree)
    rmse_tree_train.append(mse_train_tree ** 0.5)
    r2_tree_train.append(r2_score(y_train_tree, y_pred_train_tree))

In [11]:
# Collect the per-seed scores of the decision tree into one table (one row per seed)
tree_metric_names = ['RMSE_teste', 'MAE_teste', 'R2_teste', 'RMSE_treino', 'R2_treino']
tree_metric_values = [rmse_tree, mae_tree, r2_tree, rmse_tree_train, r2_tree_train]
tree_results = pd.DataFrame(dict(zip(tree_metric_names, tree_metric_values)))

# Summary statistics across the repetitions
tree_results.describe()

Unnamed: 0,RMSE_teste,MAE_teste,R2_teste,RMSE_treino,R2_treino
count,20.0,20.0,20.0,20.0,20.0
mean,0.262665,0.080136,0.950377,0.160174,0.981578
std,0.064315,0.023604,0.021074,0.020317,0.004678
min,0.156225,0.035758,0.907253,0.123133,0.973561
25%,0.220627,0.067338,0.936908,0.145048,0.977774
50%,0.259982,0.080397,0.95183,0.157148,0.982667
75%,0.296422,0.090632,0.964879,0.175098,0.984843
max,0.375444,0.130725,0.982458,0.189593,0.989376


### **Random Forest**




In [12]:
# Per-seed metrics measured on the held-out TEST split
rmse_rf, mae_rf, r2_rf = [], [], []

# Per-seed metrics measured on the TRAIN split (train/test gap = overfitting check)
rmse_rf_train, r2_rf_train = [], []

# Hyper-parameter candidates explored by the grid search in every repetition
param_grid = {
    'n_estimators': [100],
    'max_depth': [6, 10],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt'],
}

for seed in range(20):
    # New 80/20 split for every seed
    X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    # 5-fold grid search on the training split only, scored by (negated) RMSE
    base_rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    grid = GridSearchCV(
        estimator=base_rf,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train_rf, y_train_rf)

    # Best estimator selected by the search
    best_rf = grid.best_estimator_

    # Predict both splits
    y_pred_train_rf = best_rf.predict(X_train_rf)
    y_pred_test_rf = best_rf.predict(X_test_rf)

    # TEST metrics (RMSE is the square root of the MSE)
    mse_test_rf = mean_squared_error(y_test_rf, y_pred_test_rf)
    rmse_rf.append(mse_test_rf ** 0.5)
    mae_rf.append(mean_absolute_error(y_test_rf, y_pred_test_rf))
    r2_rf.append(r2_score(y_test_rf, y_pred_test_rf))

    # TRAIN metrics
    mse_train_rf = mean_squared_error(y_train_rf, y_pred_train_rf)
    rmse_rf_train.append(mse_train_rf ** 0.5)
    r2_rf_train.append(r2_score(y_train_rf, y_pred_train_rf))

In [13]:
# Collect the per-seed scores of the random forest into one table (one row per seed)
rf_metric_names = ['RMSE_teste', 'MAE_teste', 'R2_teste', 'RMSE_treino', 'R2_treino']
rf_metric_values = [rmse_rf, mae_rf, r2_rf, rmse_rf_train, r2_rf_train]
rf_results = pd.DataFrame(dict(zip(rf_metric_names, rf_metric_values)))

# Summary statistics across the repetitions
rf_results.describe()

Unnamed: 0,RMSE_teste,MAE_teste,R2_teste,RMSE_treino,R2_treino
count,20.0,20.0,20.0,20.0,20.0
mean,0.272386,0.122561,0.947256,0.223739,0.964491
std,0.070976,0.027935,0.02128,0.010914,0.003867
min,0.135804,0.075346,0.915846,0.206865,0.957065
25%,0.230114,0.099228,0.929006,0.2139,0.962132
50%,0.257876,0.118288,0.951258,0.223806,0.963543
75%,0.325266,0.146256,0.961395,0.230255,0.968203
max,0.390066,0.165817,0.986744,0.243155,0.970533


# Comparação

In [14]:
# Per-model result tables, in the order they should appear in the comparison
test_results_by_model = {
    'Regressão Linear': linear_results,
    'Árvore de Decisão': tree_results,
    'Random Forest': rf_results,
}

# Mean and standard deviation of the TEST metrics across the repetitions, one row per model
comparison_test = pd.DataFrame({
    'Modelo': list(test_results_by_model),
    'RMSE_teste_medio': [res['RMSE_teste'].mean() for res in test_results_by_model.values()],
    'MAE_teste_medio': [res['MAE_teste'].mean() for res in test_results_by_model.values()],
    'R2_teste_medio': [res['R2_teste'].mean() for res in test_results_by_model.values()],
    'RMSE_teste_std': [res['RMSE_teste'].std() for res in test_results_by_model.values()],
    'R2_teste_std': [res['R2_teste'].std() for res in test_results_by_model.values()],
})

comparison_test

Unnamed: 0,Modelo,RMSE_teste_medio,MAE_teste_medio,R2_teste_medio,RMSE_teste_std,R2_teste_std
0,Regressão Linear,0.283377,0.162568,0.942721,0.051447,0.020941
1,Árvore de Decisão,0.262665,0.080136,0.950377,0.064315,0.021074
2,Random Forest,0.272386,0.122561,0.947256,0.070976,0.02128


In [15]:
# Per-model result tables, in the order they should appear in the comparison
overfit_results_by_model = {
    'Regressão Linear': linear_results,
    'Árvore de Decisão': tree_results,
    'Random Forest': rf_results,
}

# Mean TRAIN vs TEST scores per model; a large gap between the two suggests overfitting
comparison_overfit = pd.DataFrame({
    'Modelo': list(overfit_results_by_model),
    'RMSE_treino_medio': [res['RMSE_treino'].mean() for res in overfit_results_by_model.values()],
    'RMSE_teste_medio': [res['RMSE_teste'].mean() for res in overfit_results_by_model.values()],
    'R2_treino_medio': [res['R2_treino'].mean() for res in overfit_results_by_model.values()],
    'R2_teste_medio': [res['R2_teste'].mean() for res in overfit_results_by_model.values()],
})

comparison_overfit

Unnamed: 0,Modelo,RMSE_treino_medio,RMSE_teste_medio,R2_treino_medio,R2_teste_medio
0,Regressão Linear,0.217927,0.283377,0.966362,0.942721
1,Árvore de Decisão,0.160174,0.262665,0.981578,0.950377
2,Random Forest,0.223739,0.272386,0.964491,0.947256
