In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Google Drive file ID, taken from the file's sharing URL
file_id = '1PgdeKv5l5SLOmcll6A5R5Dnu8DmXBvAP'

# Build the direct-download URL
download_url = f'https://drive.google.com/uc?id={file_id}'

# Read the CSV file from the URL
df = pd.read_csv(download_url)

In [None]:
# Display the DataFrame to confirm it was read correctly
df.head()

Unnamed: 0,Date,Person_ID,Age,Gender,Sleep Start Time,Sleep End Time,Total Sleep Hours,Sleep Quality,Exercise (mins/day),Caffeine Intake (mg),Screen Time Before Bed (mins),Work Hours (hrs/day),Productivity Score,Mood Score,Stress Level
0,2024-04-12,1860,32,Other,23.33,4.61,5.28,3,86,87,116,8.80892,8,3,6
1,2024-11-04,1769,41,Female,21.02,2.43,5.41,5,32,21,88,6.329833,10,3,7
2,2024-08-31,2528,20,Male,22.1,3.45,5.35,7,17,88,59,8.506306,10,9,10
3,2024-02-22,8041,37,Other,23.1,6.65,7.55,8,46,34,80,6.07024,8,4,2
4,2024-02-23,4843,46,Other,21.42,4.17,6.75,10,61,269,94,11.374994,8,7,9


In [None]:
# Show the last rows of the DataFrame
df.tail()

Unnamed: 0,Date,Person_ID,Age,Gender,Sleep Start Time,Sleep End Time,Total Sleep Hours,Sleep Quality,Exercise (mins/day),Caffeine Intake (mg),Screen Time Before Bed (mins),Work Hours (hrs/day),Productivity Score,Mood Score,Stress Level
4995,2024-01-03,5192,38,Female,22.78,4.15,5.37,1,55,216,96,9.384504,5,2,1
4996,2024-06-02,7134,55,Male,21.48,5.39,7.91,4,70,81,177,5.016193,5,6,2
4997,2024-08-13,6265,44,Female,22.02,2.82,4.8,6,21,90,139,11.788651,2,7,1
4998,2024-12-26,4205,55,Other,23.47,7.87,8.4,9,15,87,161,7.093145,8,1,6
4999,2024-04-11,2304,58,Male,20.72,1.37,4.65,1,6,26,35,5.615948,6,9,4


In [None]:
# Summarize the columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Date                           5000 non-null   object 
 1   Person_ID                      5000 non-null   int64  
 2   Age                            5000 non-null   int64  
 3   Gender                         5000 non-null   object 
 4   Sleep Start Time               5000 non-null   float64
 5   Sleep End Time                 5000 non-null   float64
 6   Total Sleep Hours              5000 non-null   float64
 7   Sleep Quality                  5000 non-null   int64  
 8   Exercise (mins/day)            5000 non-null   int64  
 9   Caffeine Intake (mg)           5000 non-null   int64  
 10  Screen Time Before Bed (mins)  5000 non-null   int64  
 11  Work Hours (hrs/day)           5000 non-null   float64
 12  Productivity Score             5000 non-null   i

In [None]:
# Check for missing values in every column
print("\nMissing values per fitur:")
print(df.isnull().sum())


Missing values per fitur:
Date                             0
Person_ID                        0
Age                              0
Gender                           0
Sleep Start Time                 0
Sleep End Time                   0
Total Sleep Hours                0
Sleep Quality                    0
Exercise (mins/day)              0
Caffeine Intake (mg)             0
Screen Time Before Bed (mins)    0
Work Hours (hrs/day)             0
Productivity Score               0
Mood Score                       0
Stress Level                     0
dtype: int64


In [None]:
# Count rows that duplicate an earlier row
print(df.duplicated().sum())

0


In [None]:
# Rename all affected columns in one pass instead of eleven separate calls.
# The target names are kept exactly as before (including the un-underscored
# 'ProductivityScore' and 'StressLevel').
df = df.rename(
    columns={
        'Sleep Start Time': 'Sleep_Start_Time',
        'Sleep End Time': 'Sleep_End_Time',
        'Total Sleep Hours': 'Total_Sleep_Hours',
        'Sleep Quality': 'Sleep_Quality',
        'Exercise (mins/day)': 'Exercise(mins/day)',
        'Caffeine Intake (mg)': 'Caffeine_Intake(mg)',
        'Screen Time Before Bed (mins)': 'Screen_Time_Before_Bed(mins)',
        'Work Hours (hrs/day)': 'Work_Hours(hrs/day)',
        'Productivity Score': 'ProductivityScore',
        'Mood Score': 'Mood_Score',
        'Stress Level': 'StressLevel',
    }
)

In [None]:
# Confirm the new column names
df.head()

Unnamed: 0,Date,Person_ID,Age,Gender,Sleep_Start_Time,Sleep_End_Time,Total_Sleep_Hours,Sleep_Quality,Exercise(mins/day),Caffeine_Intake(mg),Screen_Time_Before_Bed(mins),Work_Hours(hrs/day),ProductivityScore,Mood_Score,StressLevel
0,2024-04-12,1860,32,Other,23.33,4.61,5.28,3,86,87,116,8.80892,8,3,6
1,2024-11-04,1769,41,Female,21.02,2.43,5.41,5,32,21,88,6.329833,10,3,7
2,2024-08-31,2528,20,Male,22.1,3.45,5.35,7,17,88,59,8.506306,10,9,10
3,2024-02-22,8041,37,Other,23.1,6.65,7.55,8,46,34,80,6.07024,8,4,2
4,2024-02-23,4843,46,Other,21.42,4.17,6.75,10,61,269,94,11.374994,8,7,9


In [None]:
# Drop the exercise, caffeine, screen-time, work-hours and score columns,
# which are not used in the rest of the notebook
df = df.drop(
    [
        'Exercise(mins/day)',
        'Caffeine_Intake(mg)',
        'Screen_Time_Before_Bed(mins)',
        'Work_Hours(hrs/day)',
        'ProductivityScore',
        'Mood_Score',
        'StressLevel',
    ],
    axis=1,
)

In [None]:
# Display the DataFrame to confirm the columns were removed
df.head()

Unnamed: 0,Date,Person_ID,Age,Gender,Sleep_Start_Time,Sleep_End_Time,Total_Sleep_Hours,Sleep_Quality
0,2024-04-12,1860,32,Other,23.33,4.61,5.28,3
1,2024-11-04,1769,41,Female,21.02,2.43,5.41,5
2,2024-08-31,2528,20,Male,22.1,3.45,5.35,7
3,2024-02-22,8041,37,Other,23.1,6.65,7.55,8
4,2024-02-23,4843,46,Other,21.42,4.17,6.75,10


In [None]:
# Re-check the remaining columns and their dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               5000 non-null   object 
 1   Person_ID          5000 non-null   int64  
 2   Age                5000 non-null   int64  
 3   Gender             5000 non-null   object 
 4   Sleep_Start_Time   5000 non-null   float64
 5   Sleep_End_Time     5000 non-null   float64
 6   Total_Sleep_Hours  5000 non-null   float64
 7   Sleep_Quality      5000 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 312.6+ KB


In [None]:
# Convert `Sleep_Start_Time` and `Sleep_End_Time` from float to `datetime.time`
def convert_float_to_time(time_float):
    """Convert a decimal hour of day (e.g. 23.5) into a `datetime.time`.

    The fractional part is read as a fraction of an hour and truncated to
    whole minutes, so 23.5 becomes 23:30 and 23.33 becomes 23:19.

    Args:
        time_float: Hour of day as a number in the half-open range [0, 24).

    Returns:
        datetime.time: The matching time of day, with seconds set to zero.

    Raises:
        ValueError: If `time_float` is NaN or lies outside [0, 24).
    """
    # NaN fails both comparisons, so it is rejected here as well.
    if not 0 <= time_float < 24:
        raise ValueError(f"hour of day must be in [0, 24), got {time_float!r}")
    # Work in whole minutes and round away binary floating-point noise before
    # truncating; otherwise 1.15 yields 8.999... minutes and becomes 01:08.
    total_minutes = int(round(time_float * 60, 6))
    # A value a hair below 24 can round up to 24:00; clamp to the last minute.
    total_minutes = min(total_minutes, 24 * 60 - 1)
    hours, minutes = divmod(total_minutes, 60)
    return datetime(1900, 1, 1, hours, minutes).time()

In [None]:
# Replace the decimal-hour floats in both columns with datetime.time values.
# Running this cell a second time fails, because the columns no longer hold floats.
df["Sleep_Start_Time"] = df["Sleep_Start_Time"].map(convert_float_to_time)
df["Sleep_End_Time"] = df["Sleep_End_Time"].map(convert_float_to_time)


In [None]:
# The two time columns should now report dtype `object`
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               5000 non-null   object 
 1   Person_ID          5000 non-null   int64  
 2   Age                5000 non-null   int64  
 3   Gender             5000 non-null   object 
 4   Sleep_Start_Time   5000 non-null   object 
 5   Sleep_End_Time     5000 non-null   object 
 6   Total_Sleep_Hours  5000 non-null   float64
 7   Sleep_Quality      5000 non-null   int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 312.6+ KB


In [None]:
# Helper that measures how long a person slept, in hours
def calculate_sleep_duration(bedtime, wakeup_time):
    """Return the hours elapsed between `bedtime` and `wakeup_time`.

    Both arguments are combined with today's date to form datetimes. A
    wake-up time earlier than the bedtime is treated as falling on the
    following day.
    """
    anchor = datetime.today()
    fell_asleep = datetime.combine(anchor, bedtime)
    woke_up = datetime.combine(anchor, wakeup_time)
    # Sleep that crosses midnight: push the wake-up onto the next day
    if woke_up < fell_asleep:
        woke_up = woke_up + timedelta(days=1)
    elapsed = woke_up - fell_asleep
    return elapsed.seconds / 3600

In [None]:
# Add the sleep-duration column, computed pairwise from bedtime and wake-up time
df["Sleep_Duration"] = [
    calculate_sleep_duration(bedtime, wakeup_time)
    for bedtime, wakeup_time in zip(df["Sleep_Start_Time"], df["Sleep_End_Time"])
]

In [None]:
# Compare the computed Sleep_Duration with the provided Total_Sleep_Hours
df.head()

Unnamed: 0,Date,Person_ID,Age,Gender,Sleep_Start_Time,Sleep_End_Time,Total_Sleep_Hours,Sleep_Quality,Sleep_Duration
0,2024-04-12,1860,32,Other,23:19:00,04:36:00,5.28,3,5.283333
1,2024-11-04,1769,41,Female,21:01:00,02:25:00,5.41,5,5.4
2,2024-08-31,2528,20,Male,22:06:00,03:27:00,5.35,7,5.35
3,2024-02-22,8041,37,Other,23:06:00,06:39:00,7.55,8,7.55
4,2024-02-23,4843,46,Other,21:25:00,04:10:00,6.75,10,6.75


In [None]:
# Check the column dtypes after the conversion
print(df.dtypes)

Date                  object
Person_ID              int64
Age                    int64
Gender                object
Sleep_Start_Time      object
Sleep_End_Time        object
Total_Sleep_Hours    float64
Sleep_Quality          int64
Sleep_Duration       float64
dtype: object


In [None]:
print(type(df["Sleep_Start_Time"].iloc[0]))  # Inspect the Python type of the first row's value


<class 'datetime.time'>


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib

# Bucket a numeric score into a coarse sleep label
def categorize_sleep_quality(quality):
    """Map a score to "Baik" (>= 7), "Cukup" (>= 5 and < 7) or "Kurang" (otherwise)."""
    if quality >= 7:
        return "Baik"
    if quality >= 5:
        return "Cukup"
    return "Kurang"

# Derive the category label for every row.
# NOTE(review): the categorizer is applied to Sleep_Duration (hours), not to the
# Sleep_Quality score its name suggests — confirm this is intended.
df["Sleep_Category"] = df["Sleep_Duration"].apply(categorize_sleep_quality)

# Encode the sleep category labels as integers
label_encoder = LabelEncoder()
df["Sleep_Category_Encoded"] = label_encoder.fit_transform(df["Sleep_Category"])

# Select the feature and the target.
# NOTE(review): the target is computed from Sleep_Duration, which is also the
# only feature, so the model only has to recover the two thresholds. A perfect
# accuracy score below is expected and says nothing about generalization.
X = df[["Sleep_Duration"]]
y = df["Sleep_Category_Encoded"]

# Split the data for training and testing (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the Random Forest model (seeded for reproducibility)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out split and report accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Save the model and the label encoder in joblib format; these files must be
# read back with joblib.load, not pickle.load
joblib.dump(model, "sleep_quality_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")


Model Accuracy: 100.00%


['label_encoder.pkl']

In [None]:
# Preview the first 30 rows, including the new category columns
df.head(30)

Unnamed: 0,Date,Person_ID,Age,Gender,Sleep_Start_Time,Sleep_End_Time,Total_Sleep_Hours,Sleep_Quality,Sleep_Duration,Sleep_Category,Sleep_Category_Encoded
0,2024-04-12,1860,32,Other,23:19:00,04:36:00,5.28,3,5.283333,Cukup,1
1,2024-11-04,1769,41,Female,21:01:00,02:25:00,5.41,5,5.4,Cukup,1
2,2024-08-31,2528,20,Male,22:06:00,03:27:00,5.35,7,5.35,Cukup,1
3,2024-02-22,8041,37,Other,23:06:00,06:39:00,7.55,8,7.55,Baik,0
4,2024-02-23,4843,46,Other,21:25:00,04:10:00,6.75,10,6.75,Cukup,1
5,2024-07-08,7439,38,Male,21:46:00,06:24:00,8.64,10,8.633333,Baik,0
6,2024-01-09,6463,18,Other,22:49:00,06:52:00,8.03,3,8.05,Baik,0
7,2024-01-28,7278,26,Female,20:46:00,03:08:00,6.35,8,6.366667,Cukup,1
8,2024-04-10,9110,31,Other,20:04:00,03:22:00,7.31,7,7.3,Baik,0
9,2024-02-21,6116,49,Female,20:22:00,03:53:00,7.52,4,7.516667,Baik,0


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Refit the same seeded forest on the existing split and report test accuracy
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 1.0


In [None]:
# Drop the date, identifier, demographic and provided-duration columns
df = df.drop(
    ['Date', 'Person_ID', 'Gender', 'Age', 'Total_Sleep_Hours'],
    axis=1,
)

# Display the DataFrame to confirm the columns were removed
df.head()

Unnamed: 0,Sleep_Start_Time,Sleep_End_Time,Sleep_Quality,Sleep_Duration,Sleep_Category,Sleep_Category_Encoded
0,23:19:00,04:36:00,3,5.283333,Cukup,1
1,21:01:00,02:25:00,5,5.4,Cukup,1
2,22:06:00,03:27:00,7,5.35,Cukup,1
3,23:06:00,06:39:00,8,7.55,Baik,0
4,21:25:00,04:10:00,10,6.75,Cukup,1


In [None]:
# Encode the sleep categories as integers with a freshly fitted encoder
label_encoder = LabelEncoder().fit(df["Sleep_Category"])
df["Sleep_Category_Encoded"] = label_encoder.transform(df["Sleep_Category"])


In [None]:
# Re-create the 80/20 train/test split from the X and y defined earlier (same seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Standardize the features using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Train a seeded random forest on the standardized training features
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

print("✅ Model Random Forest berhasil dilatih!")


✅ Model Random Forest berhasil dilatih!


In [None]:
# Reuse df as the prediction input (this includes the rows the model was trained on)
test_data = df[["Sleep_Duration", "Sleep_Quality"]]

# Scale with the already-fitted scaler, keeping only the training columns in
# their training order
model_inputs = test_data[X_train.columns]
test_features_scaled = scaler.transform(model_inputs)

# Predict the encoded sleep category
predictions = rf_model.predict(test_features_scaled)

# Translate encoded predictions back to the original category labels
sleep_category_mapping = dict(enumerate(label_encoder.classes_))
df["Predicted_Sleep_Category"] = list(map(sleep_category_mapping.__getitem__, predictions))

# Show the result
result_columns = ["Sleep_Duration", "Sleep_Quality", "Predicted_Sleep_Category"]
print(df[result_columns])


      Sleep_Duration  Sleep_Quality Predicted_Sleep_Category
0           5.283333              3                    Cukup
1           5.400000              5                    Cukup
2           5.350000              7                    Cukup
3           7.550000              8                     Baik
4           6.750000             10                    Cukup
...              ...            ...                      ...
4995        5.383333              1                    Cukup
4996        7.916667              4                     Baik
4997        4.800000              6                   Kurang
4998        8.400000              9                     Baik
4999        4.650000              1                   Kurang

[5000 rows x 3 columns]


In [None]:
pip install --upgrade numpy




In [None]:
import joblib

# Reload the model that was saved earlier with joblib.dump. A joblib file must
# be read back with joblib.load; reading it with pickle.load returned a
# numpy.ndarray instead of the estimator.
# NOTE: loading unpickles arbitrary objects — only load files you trust.
model = joblib.load("sleep_quality_model.pkl")

print(type(model))  # Expected: RandomForestClassifier


<class 'numpy.ndarray'>


In [None]:
import pickle

# Save the model, the scaler (needed to preprocess inputs at prediction time)
# and the label encoder (needed to map predictions back to labels).
# These overwrite the joblib files written earlier under the same names.
for artifact_path, artifact in (
    ("sleep_quality_model.pkl", rf_model),
    ("scaler.pkl", scaler),
    ("label_encoder.pkl", label_encoder),
):
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)


In [None]:
from google.colab import files

# Download the saved model, scaler and label encoder files from Colab
for artifact_name in ("sleep_quality_model.pkl", "scaler.pkl", "label_encoder.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>