In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'Sleep_health_and_lifestyle_dataset.csv'
dataset = pd.read_csv(file_path)
dataset['Sleep Disorder'] = dataset['Sleep Disorder'].fillna('No Disorder')
# Encode categorical variables
categorical_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']
encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    dataset[col] = encoders[col].fit_transform(dataset[col])

# Separate features and target
X = dataset.drop(columns=['Person ID', 'Sleep Disorder'])  # Drop ID and target column
y = dataset['Sleep Disorder']

# Normalize numerical features
scaler = StandardScaler()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check class balance
print("Class distribution in training set:")
print(y_train.value_counts(normalize=True))

# Define models and hyperparameters
models_and_params = {
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {"C": [0.1, 1, 10, 100]}
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20, 30]}
    },
    "SVC": {
        "model": SVC(),
        "params": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {"max_depth": [None, 10, 20, 30], "min_samples_split": [2, 5, 10]}
    },
    "KNeighborsClassifier": {
        "model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(),
        "params": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}
    }
}

# Perform GridSearchCV for each model
best_models = {}
results = {}

for model_name, config in models_and_params.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(config["model"], config["params"], cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Save the best model and its accuracy
    best_models[model_name] = grid_search.best_estimator_
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} - Best Params: {grid_search.best_params_}")
    print(f"{model_name} - Accuracy: {accuracy}")
    print(f"{model_name} - Classification Report:\n{classification_report(y_test, y_pred)}\n")


Class distribution in training set:
Sleep Disorder
2    0.588629
1    0.207358
0    0.204013
Name: proportion, dtype: float64
Training LogisticRegression...
LogisticRegression - Best Params: {'C': 10}
LogisticRegression - Accuracy: 0.9066666666666666
LogisticRegression - Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        16
           1       0.92      0.75      0.83        16
           2       0.95      0.98      0.97        43

    accuracy                           0.91        75
   macro avg       0.89      0.87      0.87        75
weighted avg       0.91      0.91      0.91        75


Training RandomForestClassifier...
RandomForestClassifier - Best Params: {'max_depth': None, 'n_estimators': 100}
RandomForestClassifier - Accuracy: 0.88
RandomForestClassifier - Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.81      0.76        16
           1

In [None]:
# Check for missing values in the target column (disorder column)
disorder_column = df['Disorder']  # replace 'Disorder' with your actual target column name

# Checking for the percentage of missing values
missing_percentage = disorder_column.isnull().mean() * 100
print(f"Percentage of missing values in Disorder column: {missing_percentage:.2f}%")

# Handle missing values
# Option 1: Drop rows with missing target variable values
df_cleaned = df.dropna(subset=['Disorder'])

# Option 2: Fill missing values (if applicable, depending on the situation)
# For example, fill with the most frequent disorder
most_frequent_disorder = disorder_column.mode()[0]
df_filled = df.fillna({'Disorder': most_frequent_disorder})


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pickle

# Load the dataset
file_path = 'Sleep_health_and_lifestyle_dataset.csv'
dataset = pd.read_csv(file_path)

# Encode categorical variables
categorical_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']
encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    dataset[col] = encoders[col].fit_transform(dataset[col])

# Separate features and target
X = dataset.drop(columns=['Person ID', 'Sleep Disorder'])  # Drop ID and target column
y = dataset['Sleep Disorder']

# Normalize numerical features
scaler = StandardScaler()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check class balance
print("Class distribution in training set:")
print(y_train.value_counts(normalize=True))

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Check class distribution after balancing
print("Class distribution after balancing:")
print(pd.Series(y_train_balanced).value_counts(normalize=True))

# Define models and hyperparameters
models_and_params = {
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {"C": [0.1, 1, 10, 100]}
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20, 30]}
    },
    "SVC": {
        "model": SVC(),
        "params": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {"max_depth": [None, 10, 20, 30], "min_samples_split": [2, 5, 10]}
    },
    "KNeighborsClassifier": {
        "model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(),
        "params": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}
    }
}

# Perform GridSearchCV for each model
best_models = {}
results = {}

for model_name, config in models_and_params.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(config["model"], config["params"], cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train_balanced, y_train_balanced)
    
    # Save the best model and its accuracy
    best_models[model_name] = grid_search.best_estimator_
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} - Best Params: {grid_search.best_params_}")
    print(f"{model_name} - Accuracy: {accuracy}")
    print(f"{model_name} - Classification zReport:\n{classification_report(y_test, y_pred)}\n")

# Save the best-performing model (Logistic Regression example)
best_model = LogisticRegression(C=10, max_iter=1000)
best_model.fit(X_train_balanced, y_train_balanced)

# Save the model as a .pkl file
model_filename = "best_sleep_disorder_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(best_model, file)

print(f"Best model saved as {model_filename}")


Class distribution in training set:
Sleep Disorder
2    0.588629
1    0.207358
0    0.204013
Name: proportion, dtype: float64
Class distribution after balancing:
Sleep Disorder
0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64
Training LogisticRegression...
LogisticRegression - Best Params: {'C': 1}
LogisticRegression - Accuracy: 0.9066666666666666
LogisticRegression - Classification zReport:
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        16
           1       0.92      0.75      0.83        16
           2       0.95      0.98      0.97        43

    accuracy                           0.91        75
   macro avg       0.89      0.87      0.87        75
weighted avg       0.91      0.91      0.91        75


Training RandomForestClassifier...
RandomForestClassifier - Best Params: {'max_depth': 10, 'n_estimators': 100}
RandomForestClassifier - Accuracy: 0.88
RandomForestClassifier - Classification zRepo

In [10]:
for col in categorical_columns:
    with open(f"{col}_encoder.pkl", "wb") as file:
        pickle.dump(encoders[col], file)


In [11]:
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)


In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Load your dataset
file_path = 'Sleep_health_and_lifestyle_dataset.csv'
dataset = pd.read_csv(file_path)

# Define categorical columns
categorical_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']

# Create LabelEncoders and encode the dataset
encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    dataset[col] = encoders[col].fit_transform(dataset[col])

# Save the encoders as a pickle file
with open('encoders.pkl', 'wb') as file:
    pickle.dump(encoders, file)

print("Encoders saved to encoders.pkl")


Encoders saved to encoders.pkl
