In [1]:
import pandas as pd
# Lookup tables used to turn categorical columns into numbers.
gender_mapping = {'Male': 0, 'Female': 1}
bmi_category_mapping = {
    'Normal Weight': 18.0,
    'Normal': 25.0,
    'Overweight': 27.5,
    'Obese': 32.5,
}

# Load the raw data, discard the columns that are not used as features and
# encode the categorical columns. keep_default_na=False turns off pandas'
# default NA-string detection, so cell text is loaded verbatim.
df = (
    pd.read_csv(r"..Sleep Apnea and Insomnia\Sleep Data.csv", keep_default_na=False)
    .drop(columns=['Person ID', 'Occupation', 'Quality of Sleep', 'Stress Level', 'Physical Activity Level'])
    .assign(
        # Values missing from a mapping become NaN (Series.map semantics).
        Gender=lambda frame: frame['Gender'].map(gender_mapping),
        BMI=lambda frame: frame['BMI Category'].map(bmi_category_mapping),
    )
)

# Split the "systolic/diastolic" text in 'Blood Pressure' into two columns,
# then coerce both to numbers (unparseable entries become NaN).
bp_columns = ['Systolic_BP', 'Diastolic_BP']
df[bp_columns] = df['Blood Pressure'].str.split('/', expand=True)
df[bp_columns] = df[bp_columns].apply(pd.to_numeric, errors='coerce')

# Remove the source columns that have been replaced (the numeric BMI column
# is dropped here as well, so it is not part of the final frame) and
# switch to the shorter column names used by the rest of the notebook.
df = (
    df.drop(columns=['Blood Pressure', 'BMI Category', 'BMI'])
    .rename(columns={'Heart Rate': 'Bpm', 'Gender': 'sex'})
)
df.head()

Unnamed: 0,sex,Age,Sleep Duration,Bpm,Daily Steps,Sleep Disorder,Systolic_BP,Diastolic_BP
0,0,27,6.1,77,4200,,126,83
1,0,28,6.2,75,10000,,125,80
2,0,28,6.2,75,10000,,125,80
3,0,28,5.9,85,3000,Sleep Apnea,140,90
4,0,28,5.9,85,3000,Sleep Apnea,140,90


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameter search for a RandomForestClassifier that predicts the
# 'Sleep Disorder' label from every other column of df.
X = df.drop(columns='Sleep Disorder')
y = df['Sleep Disorder']

# Create the hold-out split inside this cell. Previously X_train / X_test /
# y_train / y_test were only available if the regularized-model cell further
# down had already been executed, so a fresh "Restart & Run All" failed here
# with a NameError. The arguments are the same as in that cell, so both
# models are evaluated on the same stratified 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['sqrt', 'log2']
}

# Create a RandomForestClassifier instance (fixed seed for reproducibility)
rf_classifier = RandomForestClassifier(random_state=42)

# Perform GridSearchCV: 5-fold cross-validation on the training part only,
# selecting the parameter combination with the best mean accuracy
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best (cross-validated) score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Use the best model obtained from grid search for predictions
best_rf_classifier = grid_search.best_estimator_
y_pred = best_rf_classifier.predict(X_test)

# Calculate accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate and print classification report on the test set
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


Best Parameters: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.9182732878935409
Accuracy: 0.9166666666666666
Classification Report:
               precision    recall  f1-score   support

    Insomnia       0.85      0.82      0.84        28
        None       0.94      0.98      0.96       113
 Sleep Apnea       0.87      0.74      0.80        27

    accuracy                           0.92       168
   macro avg       0.89      0.85      0.87       168
weighted avg       0.91      0.92      0.91       168



In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Target label and predictor columns.
y = df['Sleep Disorder']
X = df.drop(columns='Sleep Disorder')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# A small, shallow forest: depth, split size and leaf size are all capped to
# regularize the trees, and classes are re-weighted to offset class imbalance.
regularized_rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=6,
    n_estimators=45,
    random_state=42,
)
regularized_rf_classifier.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_regularized = regularized_rf_classifier.predict(X_test)
accuracy_regularized = accuracy_score(y_test, y_pred_regularized)
class_report_regularized = classification_report(y_test, y_pred_regularized)

print("Accuracy (Regularized):", accuracy_regularized)
print("Classification Report (Regularized):\n", class_report_regularized)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_regularized))

# 5-fold cross-validated accuracy computed over the complete dataset
# (not just the training split).
cv_scores = cross_val_score(
    estimator=regularized_rf_classifier,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Accuracy (Regularized): 0.9285714285714286
Classification Report (Regularized):
               precision    recall  f1-score   support

    Insomnia       0.86      0.89      0.88        28
        None       0.94      0.98      0.96       113
 Sleep Apnea       0.95      0.74      0.83        27

    accuracy                           0.93       168
   macro avg       0.92      0.87      0.89       168
weighted avg       0.93      0.93      0.93       168

Confusion Matrix:
[[ 25   3   0]
 [  1 111   1]
 [  3   4  20]]
Cross-Validation Scores: [0.82142857 0.98214286 0.80357143 0.97321429 0.87387387]
Mean CV Accuracy: 0.8908462033462033


In [13]:
import pickle
# Where the serialized model is written.
# NOTE(review): this is an absolute, machine-specific location and differs
# from the path the prediction cell loads from - confirm the intended target.
file_path = r'C:\Users\Ibrahim\Desktop\SA_I_rf.pickle'

# Serialize the regularized random forest to that file in binary mode.
with open(file_path, mode='wb') as model_file:
    pickle.dump(regularized_rf_classifier, model_file)


In [16]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

# Load the pickled sleep-disorder classifier (presumably the random forest
# that was saved as SA_I_rf.pickle - verify the file is the intended model).
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that you created yourself or otherwise trust.
with open(r'..Webpage\model\SA_I_rf.pickle', 'rb') as file:
    sleep_disorder_model = pickle.load(file)

# Backward-compatible alias: the model used to be bound to this name even
# though the pickle file name indicates a random forest, not a voting ensemble.
voting_classifier = sleep_disorder_model

# One sample to classify; the values follow the order of feature_names below.
features_data = np.array([[1, 28, 5.9, 85, 3000, 140, 90]])

# Column names in the same order as the training feature columns
feature_names = ["sex", "Age", "Sleep Duration", "Bpm", "Daily Steps", "Systolic_BP", "Diastolic_BP"]

# Wrap the sample in a DataFrame so the model receives named feature columns
features_df = pd.DataFrame(features_data, columns=feature_names)

# Make a prediction
prediction = sleep_disorder_model.predict(features_df)

# Print the prediction (the model outputs a sleep-disorder class label)
print("Predicted sleep disorder:", prediction)

Predicted stroke outcome: ['Sleep Apnea']
