In [92]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

  and should_run_async(code)


In [93]:
os.chdir('/content/')
os.listdir()

# fill the Nan value -> no disorder
data = pd.read_csv('/content/drive/MyDrive/Sleep_health_and_lifestyle_dataset.csv')

  and should_run_async(code)


In [94]:
data['Sleep Disorder'] = data['Sleep Disorder'].fillna('no disorder')  # 填充空值

# data preprocessing
data['BMI Category'] = data['BMI Category'].replace('Normal Weight', 'Normal')
data[['Blood Pressure systolic', 'Blood Pressure diastolic']] = data['Blood Pressure'].str.split('/', expand=True)
data['Blood Pressure systolic'] = pd.to_numeric(data['Blood Pressure systolic'])
data['Blood Pressure diastolic'] = pd.to_numeric(data['Blood Pressure diastolic'])
data = data.drop(columns=['Person ID', 'Blood Pressure'])

# spilt data
X_data = data.drop(columns=['Sleep Disorder'])
y_data = data[['Sleep Disorder']]
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

  and should_run_async(code)


In [95]:
X_train['Occupation'].value_counts()

  and should_run_async(code)


Unnamed: 0_level_0,count
Occupation,Unnamed: 1_level_1
Nurse,61
Doctor,53
Engineer,53
Lawyer,36
Teacher,34
Accountant,32
Salesperson,22
Scientist,4
Software Engineer,2
Sales Representative,1


In [96]:
for column in X_train.columns:
    if X_train[column].dtype in ['int64', 'float64']:
        unique_values = X_train[column].nunique()
        print(f"{column}: {unique_values} unique values")

# discretize the data since most of them are continuous value
def discretize_column(column, bins=6):
    """
    seperate to 6 value with max min value (since the minumun kinds of value of all the column is 6)
    """
    if column.dtype in ['int64', 'float64']:
        labels = ['Very Low', 'Low', 'Medium Low', 'Medium High', 'High', 'Very High']
        min_val = column.min()
        max_val = column.max()
        # calculate the range
        bin_edges = [min_val + (max_val - min_val) * i / bins for i in range(bins + 1)]
        return pd.cut(column, bins=bin_edges, labels=labels, include_lowest=True), bin_edges
    return column, None

Age: 30 unique values
Sleep Duration: 27 unique values
Quality of Sleep: 6 unique values
Physical Activity Level: 16 unique values
Stress Level: 6 unique values
Heart Rate: 18 unique values
Daily Steps: 19 unique values
Blood Pressure systolic: 18 unique values
Blood Pressure diastolic: 16 unique values


  and should_run_async(code)


In [97]:
# X_train discretized and record the seperate range
discretized_train = {}
bin_edges = {}
for column in X_train.columns:
    discretized_train[column], bin_edges[column] = discretize_column(X_train[column])

X_train_discrete = pd.DataFrame(discretized_train)

# discretized X_val by the seperate range of X_train
def discretize_using_bins(column, bins, labels):

    if column.dtype in ['int64', 'float64']:
        return pd.cut(column, bins=bins, labels=labels, include_lowest=True)
    return column

discretized_val = {}
for column in X_val.columns:
    if column in bin_edges:
        discretized_val[column] = discretize_using_bins(X_val[column], bin_edges[column],
                                                        labels=['Very Low', 'Low', 'Medium Low', 'Medium High', 'High', 'Very High'])
    else:
        discretized_val[column] = X_val[column]

X_val_discrete = pd.DataFrame(discretized_val)

  and should_run_async(code)


In [98]:
# since TransactionEncoder in mlxtend.preprocessingwould seem the same name as same object even in different col
for column in X_train_discrete.columns:
    X_train_discrete[column] = X_train_discrete[column].apply(lambda x: f"{column}_{x}")

for column in X_val_discrete.columns:
    X_val_discrete[column] = X_val_discrete[column].apply(lambda x: f"{column}_{x}")

# concat y for apriori
train_data_prefixed = pd.concat([X_train_discrete, y_train], axis=1)

#print(train_data_prefixed.head())

  and should_run_async(code)


In [99]:
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

min_support = 0.75

# since the number of data is not uniform
data_no_disorder = train_data_prefixed[train_data_prefixed['Sleep Disorder'] == 'no disorder']
data_insomnia = train_data_prefixed[train_data_prefixed['Sleep Disorder'] == 'Insomnia']
data_sleep_apnea = train_data_prefixed[train_data_prefixed['Sleep Disorder'] == 'Sleep Apnea']


def process_transactions(data):
    transactions = data.apply(lambda row: row.tolist(), axis=1).tolist()
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    return pd.DataFrame(te_ary, columns=te.columns_)

transactions_no_disorder = process_transactions(data_no_disorder)
transactions_insomnia = process_transactions(data_insomnia)
transactions_sleep_apnea = process_transactions(data_sleep_apnea)

frequent_itemsets_no_disorder = apriori(transactions_no_disorder, min_support=min_support, use_colnames=True)
frequent_itemsets_insomnia = apriori(transactions_insomnia, min_support=min_support, use_colnames=True)
frequent_itemsets_sleep_apnea = apriori(transactions_sleep_apnea, min_support=min_support, use_colnames=True)

# filter the one with only y value
def filter_frequent_itemsets(frequent_itemsets, label):
    exclude_itemset = {frozenset({label})}
    return frequent_itemsets[~frequent_itemsets['itemsets'].isin(exclude_itemset)]

filtered_A = frequent_itemsets_insomnia[frequent_itemsets_insomnia['itemsets'].apply(lambda x: 'Insomnia' in x)]
filtered_B = frequent_itemsets_no_disorder[frequent_itemsets_no_disorder['itemsets'].apply(lambda x: 'no disorder' in x)]
filtered_C = frequent_itemsets_sleep_apnea[frequent_itemsets_sleep_apnea['itemsets'].apply(lambda x: 'Sleep Apnea' in x)]

frequent_itemsets_no_disorder = filter_frequent_itemsets(filtered_B, 'no disorder')
frequent_itemsets_insomnia = filter_frequent_itemsets(filtered_A, 'Insomnia')
frequent_itemsets_sleep_apnea = filter_frequent_itemsets(filtered_C, 'Sleep Apnea')

# concat all
frequent_itemsets = pd.concat(
    [frequent_itemsets_no_disorder, frequent_itemsets_insomnia, frequent_itemsets_sleep_apnea]
).reset_index(drop=True)


#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.expand_frame_repr', False)
#print(frequent_itemsets.to_string(index=False, justify="left"))
#print(frequent_itemsets)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)


max_itemsets_length = max(frequent_itemsets['itemsets'].apply(lambda x: len(str(x))))


for index, row in frequent_itemsets.iterrows():
    # support to percentage
    support_percentage = f"{row['support'] * 100:.2f}%"

    itemset_str = str(row['itemsets']).ljust(max_itemsets_length)
    print(f"{support_percentage.ljust(10)} {itemset_str}")

90.91%     frozenset({'BMI Category_Normal', 'no disorder'})                                                                                                                    
88.71%     frozenset({'Gender_Female', 'Sleep Apnea'})                                                                                                                          
87.10%     frozenset({'BMI Category_Overweight', 'Sleep Apnea'})                                                                                                                
85.48%     frozenset({'Gender_Female', 'BMI Category_Overweight', 'Sleep Apnea'})                                                                                               
83.87%     frozenset({'Blood Pressure systolic_Very High', 'Sleep Apnea'})                                                                                                      
83.61%     frozenset({'Insomnia', 'Sleep Duration_Low'})                                                           

  and should_run_async(code)


In [100]:
print(len(frequent_itemsets))

47


  and should_run_async(code)


In [101]:
# remove the y value since there is no y value for test
def remove_labels_from_itemsets(frequent_itemsets, exclude_labels):

    filtered_itemsets = frequent_itemsets['itemsets'].apply(
        lambda x: frozenset(item for item in x if item not in exclude_labels)
    )
    # renew itemsets
    frequent_itemsets['itemsets'] = filtered_itemsets

    frequent_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].apply(len) > 0]
    return frequent_itemsets

exclude_labels = ['Sleep Apnea', 'no disorder', 'Insomnia']

temp = frequent_itemsets.copy()
frequent_itemsets_cleaned = remove_labels_from_itemsets(temp, exclude_labels)
#print(frequent_itemsets_cleaned)
#print(frequent_itemsets)


  and should_run_async(code)


In [102]:
def construct_features(data, frequent_itemsets, frequent_itemsets_cleaned = None):
    """
    change data to feature matrix，if the data has that feature than set to 1 else 0
    """
    if frequent_itemsets_cleaned is None: # tarin
      X_freq = pd.DataFrame(0, index=data.index, columns=frequent_itemsets['itemsets'].apply(lambda x: str(x)))
      for itemset in frequent_itemsets['itemsets']:
          col_name = str(itemset)
          X_freq[col_name] = data.apply(lambda row: 1 if all(item in row.values for item in itemset) else 0, axis=1)
      return X_freq

    else: # valid
      X_freq = pd.DataFrame(0, index=data.index, columns=frequent_itemsets['itemsets'].apply(lambda x: str(x)))


      for i, itemset in enumerate(frequent_itemsets_cleaned['itemsets']):
        col_name = str(frequent_itemsets['itemsets'].iloc[i])
        X_freq[col_name] = data.apply(lambda row: 1 if all(item in row.values for item in itemset) else 0, axis=1)
      return X_freq


X_freq_train = construct_features(train_data_prefixed, frequent_itemsets)
X_freq_val = construct_features(X_val_discrete, frequent_itemsets, frequent_itemsets_cleaned)

  and should_run_async(code)


In [103]:
# Data Augmentation（SMOTE）
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_freq_train, y_train = smote.fit_resample(X_freq_train, y_train)

  and should_run_async(code)


In [104]:
# KNN classifier
#knn = KNeighborsClassifier(n_neighbors=5)
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_freq_train, y_train.values.ravel())  # make sure the shape of y

y_pred = knn.predict(X_freq_val)

print("Classification Report:")
print(classification_report(y_val, y_pred, digits=4))
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")


# check prediction distribution
from collections import Counter
print("Predicted label distribution:", Counter(y_pred))

Classification Report:
              precision    recall  f1-score   support

    Insomnia     0.7059    0.7500    0.7273        16
 Sleep Apnea     0.8000    0.7500    0.7742        16
 no disorder     0.9302    0.9302    0.9302        43

    accuracy                         0.8533        75
   macro avg     0.8120    0.8101    0.8106        75
weighted avg     0.8546    0.8533    0.8536        75

Accuracy: 0.8533
Predicted label distribution: Counter({'no disorder': 43, 'Insomnia': 17, 'Sleep Apnea': 15})


  and should_run_async(code)


In [108]:
label_to_num = {'no disorder': 0, 'Sleep Apnea': 1, 'Insomnia': 2}
num_to_label = {v: k for k, v in label_to_num.items()}

# change to number for auroc library
y_val_label = y_val['Sleep Disorder'].map(label_to_num).values
y_pred_label = [label_to_num[label] for label in y_pred]

# for auroc
y_pred_proba = knn.predict_proba(X_freq_val)
auroc = roc_auc_score(y_val_label, y_pred_proba, multi_class='ovr')

print((f"AUROC: {auroc:.4f}"))

AUROC: 0.4518


  and should_run_async(code)
