## SLEEP DISORDER PREDICTION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Load the dataset. If the CSV is missing, report it and stop with a failing
# exit status so callers (shell scripts, CI, notebook runners) can detect it.
try:
    data = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
except FileNotFoundError:
    print("Error: 'Sleep_health_and_lifestyle_dataset.csv' not found. Make sure the file is in the correct directory.")
    # The bare `exit()` helper is meant for interactive sessions, is not
    # guaranteed to exist (e.g. `python -S`), and with no argument reports
    # success (status 0). Raising SystemExit directly avoids all three issues.
    raise SystemExit(1)


# --- Exploratory overview of the raw data (output only, nothing is modified) ---

# Raw target labels exactly as they were parsed from the CSV.
print("\nUnique values in 'Sleep Disorder' column IMMEDIATELY after loading:")
print(data['Sleep Disorder'].unique())

# First rows and the column/dtype summary.
print("Initial Data:")
print(data.head())
print("\nData Info:")
data.info()

# Cardinality of each text column; the full frequency table is shown only
# when a column has fewer than 10 distinct values.
print("\nUnique values in categorical columns:")
object_columns = data.select_dtypes(include='object').columns
for column_name in object_columns:
    column_values = data[column_name]
    distinct_count = column_values.nunique()
    print(f"{column_name}: {distinct_count} unique values")
    if distinct_count < 10:
        print(column_values.value_counts())

# Class distribution of the target, followed by its distinct labels.
print("\nValue counts of 'Sleep Disorder' column:")
print(data['Sleep Disorder'].value_counts())

print("\nUnique values in 'Sleep Disorder' column:")
print(data['Sleep Disorder'].unique())

# Build a binary target: 1 when a sleep disorder is recorded, 0 otherwise.
# Whether the literal text "None" in the CSV is parsed as NaN or kept as a
# string depends on the pandas version / na_values settings, so both forms
# are treated as "no disorder". Relying on NaN alone would label every row 1
# whenever "None" survives as a string.
sleep_disorder = data['Sleep Disorder']
no_disorder = sleep_disorder.isna() | sleep_disorder.astype(str).str.strip().eq('None')
data['Has_Sleep_Disorder'] = (~no_disorder).astype('int64')
data = data.drop(columns='Sleep Disorder')

# Separate features (X) and target (y). Identifier columns are dropped from
# the features; errors='ignore' tolerates datasets that lack some of them.
columns_to_drop = ['Person ID', 'Name', 'Has_Sleep_Disorder']
X = data.drop(columns=columns_to_drop, errors='ignore')
y = data['Has_Sleep_Disorder']

# Class balance of the binary target before the train/test split.
print("\nValue counts of 'Has_Sleep_Disorder' BEFORE splitting:")
print(y.value_counts())

# Feature engineering runs first so that the categorical/numerical feature
# lists below are derived from the final set of columns.

# Split 'Blood Pressure' (text of the form "systolic/diastolic") into two
# integer columns and drop the original text column.
if 'Blood Pressure' in X.columns:
    X[['BP_Systolic', 'BP_Diastolic']] = X['Blood Pressure'].str.split('/', expand=True).astype(int)
    X = X.drop('Blood Pressure', axis=1)

# Encode 'BMI Category' as an ordinal number.
# 'Normal' and 'Normal Weight' are two spellings of the same category, so
# they share one code; giving them different codes would place
# 'Normal Weight' between 'Normal' and 'Overweight' on the ordinal scale.
if 'BMI Category' in X.columns:
    bmi_mapping = {'Normal': 0, 'Normal Weight': 0, 'Overweight': 1, 'Obese': 2}
    # Series.map turns unmapped values into NaN, which would only surface
    # later as an opaque error during model fitting; fail early instead.
    unmapped = X.loc[~X['BMI Category'].isin(list(bmi_mapping)), 'BMI Category'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected 'BMI Category' values: {list(unmapped)}")
    X['BMI_Encoded'] = X['BMI Category'].map(bmi_mapping)
    X = X.drop('BMI Category', axis=1)

# Identify categorical (text) and numerical features from the engineered
# frame. 'number' is used so the engineered integer columns are picked up
# regardless of the platform's default integer width.
categorical_features = X.select_dtypes(include='object').columns.tolist()
numerical_features = X.select_dtypes(include='number').columns.tolist()

print("\nCategorical Features:", categorical_features)
print("Numerical Features:", numerical_features)

# Hold out 20% of the rows for testing, keeping the class ratio of y in both
# parts (stratify) and fixing the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Per-type preprocessing: standardize numerical columns, one-hot encode
# categorical ones (categories unseen during fit are ignored at predict time).
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by logistic regression.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42)),
    ]
)

# Fit on the training split, then predict the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy on the Test Set: {accuracy:.4f}")

print("\nClassification Report on the Test Set:")
print(classification_report(y_test, y_pred))




Unique values in 'Sleep Disorder' column IMMEDIATELY after loading:
[nan 'Sleep Apnea' 'Insomnia']
Initial Data:
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily St

In [None]:
import joblib


# Persist the fitted pipeline to disk.
# NOTE: the 'Blood Pressure' split and 'BMI Category' encoding are applied to
# X outside of the pipeline, so the saved model expects input frames that
# already contain 'BP_Systolic', 'BP_Diastolic' and 'BMI_Encoded'.
filename = 'sleep_disorder_model.joblib'
joblib.dump(model, filename)
print(f"Model saved as {filename}")

# Round-trip check / usage example: reload the model from the same file.
loaded_model = joblib.load(filename)
print("Model loaded successfully!")

Model saved as sleep_disorder_model.joblib
Model loaded successfully!
