# import library

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# LOAD DATASET

In [2]:
# Load dataset
df = pd.read_csv('/content/Sleep_health_and_lifestyle_dataset.csv')

In [3]:
# Melihat dimensi data
print("Dimensi data:", df.shape)

# Menampilkan 5 baris pertama
print("\n5 baris pertama:")
print(df.head())

Dimensi data: (374, 13)

5 baris pertama:
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200          

# Hipotesis Awal
Beberapa contoh hipotesis awal dari dataset kesehatan tidur dan gaya hidup:

- Terdapat hubungan antara durasi tidur (Sleep Duration) dengan tingkat stres (Stress Level).

- Gaya hidup (seperti konsumsi kafein, olahraga, penggunaan gadget sebelum tidur) memengaruhi kualitas tidur.

- Faktor usia dan jenis kelamin memiliki pengaruh terhadap kualitas tidur.

# EKSPLORASI DATA ANALISIS (EDA)

In [6]:
# Info dataset (cek tipe data & missing values)
print("\nInfo dataset:")
print(df.info())

# Statistik deskriptif
print("\nStatistik deskriptif:")
print(df.describe(include='all'))

# Cek missing value
print("\nJumlah missing value per kolom:")
print(df.isnull().sum())

# Cek duplikat
print("\nJumlah data duplikat:", df.duplicated().sum())


Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB
None

S

# METRIK EVALUASI

## KLASIFIKASI

In [7]:
# Asumsi 'Quality of Sleep' adalah kolom target (ubah sesuai kolom sebenarnya)
# dan sudah diubah menjadi numerik jika perlu

# Buat X dan y
X = df.drop(['Quality of Sleep'], axis=1).select_dtypes(include=['int64', 'float64'])  # Hanya fitur numerik
y = df['Quality of Sleep']

# Bagi data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Random Forest (contoh)
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrik evaluasi
print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Akurasi: 0.9733333333333334

Classification Report:
               precision    recall  f1-score   support

           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00        26
           7       1.00      0.88      0.93        16
           8       0.90      1.00      0.95        18
           9       1.00      1.00      1.00        12

    accuracy                           0.97        75
   macro avg       0.98      0.98      0.98        75
weighted avg       0.98      0.97      0.97        75


Confusion Matrix:
 [[ 2  0  0  0  0  0]
 [ 0  1  0  0  0  0]
 [ 0  0 26  0  0  0]
 [ 0  0  0 14  2  0]
 [ 0  0  0  0 18  0]
 [ 0  0  0  0  0 12]]


## REGRESI

In [10]:
# Target regresi
X = df.drop(['Sleep Duration'], axis=1).select_dtypes(include=['int64', 'float64'])
y = df['Sleep Duration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluasi regresi
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r2)

MAE: 0.2921912629168277
MSE: 0.12416492681871989
RMSE: 0.35237043976292887
R2 Score: 0.8135019564013235
