In [9]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [13]:
# One-hot encode the categorical columns again after loading: the encoding
# is not being persisted in the CSV, so it has to be redone here.
categorical_cols = ["Occupation", "Gender", "BMI"]
data = pd.get_dummies(
    pd.read_csv("dataframe.csv"),
    columns=categorical_cols,
    drop_first=True,
)

# "Quality" is the regression target; every other column is a feature.
X = data.drop(columns=["Quality"])
y = data["Quality"]

### **Regressão Linear**

In [14]:
# Linear-regression baseline scored on 20 different 80/20 train/test splits
# (split seed = 0..19). Test metrics (RMSE, MAE, R2) and train metrics
# (RMSE, R2) are collected per split and summarised with describe().
rmse_linear = []
mae_linear = []
r2_linear = []

rmse_linear_train = []
r2_linear_train = []

for seed in range(20):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    # train_test_split returns slices of X. Take explicit copies before
    # assigning scaled columns so the writes go to independent frames and
    # pandas does not raise SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Only int64/float64 columns are standardised; columns of any other
    # dtype are passed to the model unchanged.
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

    # The scaler is fitted on the training split only and then applied to
    # the test split, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Held-out metrics for this split.
    rmse_linear.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    mae_linear.append(mean_absolute_error(y_test, y_pred_test))
    r2_linear.append(r2_score(y_test, y_pred_test))

    # Training metrics, kept to compare against the test metrics (overfitting check).
    rmse_linear_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_linear_train.append(r2_score(y_train, y_pred_train))

# One row per split; column names are reused by the comparison cells.
linear_results = pd.DataFrame({
    "RMSE_teste": rmse_linear,
    "MAE_teste": mae_linear,
    "R2_teste": r2_linear,
    "RMSE_treino": rmse_linear_train,
    "R2_treino": r2_linear_train
})

linear_results.describe()

Unnamed: 0,RMSE_teste,MAE_teste,R2_teste,RMSE_treino,R2_treino
count,20.0,20.0,20.0,20.0,20.0
mean,0.283377,0.162568,0.942721,0.217927,0.966362
std,0.051447,0.022017,0.020941,0.009843,0.003023
min,0.200808,0.117211,0.88839,0.19511,0.960905
25%,0.253663,0.151811,0.931089,0.211574,0.964154
50%,0.275583,0.161812,0.946085,0.217805,0.965807
75%,0.304374,0.174243,0.958788,0.225389,0.968552
max,0.389958,0.206722,0.975118,0.232328,0.972267


### **Árvores de Decisão**

In [15]:
# Decision-tree regressor scored on 20 different 80/20 splits (seed 0..19).
# For every split the hyper-parameters below are tuned with a 5-fold grid
# search on the training part, and the best estimator is then scored on
# both the training and the test part.
param_grid_tree = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [5, 10, 20]
}

# Per-split metric accumulators (test first, then train).
rmse_tree, mae_tree, r2_tree = [], [], []
rmse_tree_train, r2_tree_train = [], []

for seed in range(20):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )

    # The tree's own random_state is fixed; only the data split varies.
    base_tree = DecisionTreeRegressor(random_state=42)
    grid = GridSearchCV(
        estimator=base_tree,
        param_grid=param_grid_tree,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    grid.fit(X_train, y_train)
    best_tree = grid.best_estimator_

    y_pred_train = best_tree.predict(X_train)
    y_pred_test = best_tree.predict(X_test)

    mse_test = mean_squared_error(y_test, y_pred_test)
    mse_train = mean_squared_error(y_train, y_pred_train)

    # Held-out metrics.
    rmse_tree.append(np.sqrt(mse_test))
    mae_tree.append(mean_absolute_error(y_test, y_pred_test))
    r2_tree.append(r2_score(y_test, y_pred_test))

    # Training metrics, for the overfitting comparison.
    rmse_tree_train.append(np.sqrt(mse_train))
    r2_tree_train.append(r2_score(y_train, y_pred_train))

# One row per split.
tree_results = pd.DataFrame({
    'RMSE_teste': rmse_tree,
    'MAE_teste': mae_tree,
    'R2_teste': r2_tree,
    'RMSE_treino': rmse_tree_train,
    'R2_treino': r2_tree_train
})

tree_results.describe()

Unnamed: 0,RMSE_teste,MAE_teste,R2_teste,RMSE_treino,R2_treino
count,20.0,20.0,20.0,20.0,20.0
mean,0.262665,0.080136,0.950377,0.160174,0.981578
std,0.064315,0.023604,0.021074,0.020317,0.004678
min,0.156225,0.035758,0.907253,0.123133,0.973561
25%,0.220627,0.067338,0.936908,0.145048,0.977774
50%,0.259982,0.080397,0.95183,0.157148,0.982667
75%,0.296422,0.090632,0.964879,0.175098,0.984843
max,0.375444,0.130725,0.982458,0.189593,0.989376


### **Random Forest**

In [16]:
# Random-forest regressor scored on 20 different 80/20 splits (seed 0..19).
# Each split runs a 5-fold grid search over the small grid below on the
# training part; the best estimator is scored on train and test.
param_grid_rf = {
    'n_estimators': [100],
    'max_depth': [6, 10],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt']
}

# Per-split metric accumulators.
rmse_rf, mae_rf, r2_rf = [], [], []
rmse_rf_train, r2_rf_train = [], []

for seed in range(20):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )

    # The forest's random_state is fixed; only the data split changes per seed.
    base_rf = RandomForestRegressor(n_jobs=-1, random_state=42)
    grid = GridSearchCV(
        estimator=base_rf,
        param_grid=param_grid_rf,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # After the loop, best_rf holds the estimator from the last split only.
    best_rf = grid.best_estimator_

    y_pred_train = best_rf.predict(X_train)
    y_pred_test = best_rf.predict(X_test)

    test_mse = mean_squared_error(y_test, y_pred_test)
    train_mse = mean_squared_error(y_train, y_pred_train)

    # Held-out metrics.
    rmse_rf.append(np.sqrt(test_mse))
    mae_rf.append(mean_absolute_error(y_test, y_pred_test))
    r2_rf.append(r2_score(y_test, y_pred_test))

    # Training metrics, for the overfitting comparison.
    rmse_rf_train.append(np.sqrt(train_mse))
    r2_rf_train.append(r2_score(y_train, y_pred_train))

# One row per split.
rf_results = pd.DataFrame({
    'RMSE_teste': rmse_rf,
    'MAE_teste': mae_rf,
    'R2_teste': r2_rf,
    'RMSE_treino': rmse_rf_train,
    'R2_treino': r2_rf_train
})

rf_results.describe()

Unnamed: 0,RMSE_teste,MAE_teste,R2_teste,RMSE_treino,R2_treino
count,20.0,20.0,20.0,20.0,20.0
mean,0.272386,0.122561,0.947256,0.223739,0.964491
std,0.070976,0.027935,0.02128,0.010914,0.003867
min,0.135804,0.075346,0.915846,0.206865,0.957065
25%,0.230114,0.099228,0.929006,0.2139,0.962132
50%,0.257876,0.118288,0.951258,0.223806,0.963543
75%,0.325266,0.146256,0.961395,0.230255,0.968203
max,0.390066,0.165817,0.986744,0.243155,0.970533


### **Comparação**

In [17]:
# Mean test-set metrics per model, averaged over the 20 splits.
# The mapping keeps the display name next to its per-split results frame,
# so each metric column is derived the same way for every model.
results_by_model = {
    'Regressão Linear': linear_results,
    'Árvore de Decisão': tree_results,
    'Random Forest': rf_results,
}

comparison = pd.DataFrame({
    'Modelo': list(results_by_model),
    'RMSE_medio': [res['RMSE_teste'].mean() for res in results_by_model.values()],
    'MAE_medio': [res['MAE_teste'].mean() for res in results_by_model.values()],
    'R2_medio': [res['R2_teste'].mean() for res in results_by_model.values()],
})

comparison

Unnamed: 0,Modelo,RMSE_medio,MAE_medio,R2_medio
0,Regressão Linear,0.283377,0.162568,0.942721
1,Árvore de Decisão,0.262665,0.080136,0.950377
2,Random Forest,0.272386,0.122561,0.947256


In [18]:
# Train vs. test means per model (RMSE and R2) over the 20 splits, placed
# side by side so the train/test gap of each model can be read off one row.
overfit_rows = []
for model_name, model_results in (
    ('Regressão Linear', linear_results),
    ('Árvore de Decisão', tree_results),
    ('Random Forest', rf_results),
):
    overfit_rows.append({
        'Modelo': model_name,
        'RMSE_treino_medio': model_results['RMSE_treino'].mean(),
        'RMSE_teste_medio': model_results['RMSE_teste'].mean(),
        'R2_treino_medio': model_results['R2_treino'].mean(),
        'R2_teste_medio': model_results['R2_teste'].mean(),
    })

# One row per model; column order follows the key order of the row dicts.
comparison_overfit = pd.DataFrame(overfit_rows)

comparison_overfit


Unnamed: 0,Modelo,RMSE_treino_medio,RMSE_teste_medio,R2_treino_medio,R2_teste_medio
0,Regressão Linear,0.217927,0.283377,0.966362,0.942721
1,Árvore de Decisão,0.160174,0.262665,0.981578,0.950377
2,Random Forest,0.223739,0.272386,0.964491,0.947256


### **Joblib**

In [19]:
# Final model chosen: Random Forest.
# NOTE(review): the comparison tables show the decision tree with the lower
# mean test RMSE and the random forest with the smaller train/test gap;
# presumably the latter motivated this choice. Confirm.
#
# The `best_rf` variable left behind by the evaluation loop is only the
# estimator from the last split (seed 19), fitted on 80% of the rows.
# Instead of persisting that leftover, the same grid search is run once on
# the full dataset and the resulting estimator is saved.
final_search = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid_rf,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1
)
final_search.fit(X, y)
final_rf = final_search.best_estimator_

joblib.dump(final_rf, "sleep_quality_model.pkl")

['sleep_quality_model.pkl']

### **Salvando os resultados**

In [20]:
# Persist every results table as CSV (without the index column) in the
# current working directory.
csv_outputs = {
    "linear_results.csv": linear_results,
    "tree_results.csv": tree_results,
    "rf_results.csv": rf_results,
    "comparison.csv": comparison,
    "comparison_overfit.csv": comparison_overfit,
}
for csv_name, results_frame in csv_outputs.items():
    results_frame.to_csv(csv_name, index=False)