In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load the dataset
data = pd.read_csv(r'E:\3rd Year 1st semi\Fdm\FDM_Project\Strock Prediction\Dataset\healthcare-dataset-stroke-data.csv')

# Impute missing values in 'bmi' using mean
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit_transform(data[['bmi']])

# Drop 'id' column
data.drop('id', axis=1, inplace=True)

# Label encoding for categorical features
label_encoders = {}
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

In [3]:
# Check for duplicates
data.drop_duplicates(inplace=True)

# Separate features and target variable
X = data.drop(columns=['stroke'])  # Features (independent variables)
y = data['stroke']  # Target (dependent variable)

# Scale numerical features
scaler = StandardScaler()
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

In [11]:
# Step 5: Splitting the Dataset into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining and Test Data Sizes:")
print(f"Training Data: {X_train.shape}, Test Data: {X_test.shape}")


Training and Test Data Sizes:
Training Data: (4088, 10), Test Data: (1022, 10)


In [12]:
# Train Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))





Random Forest Accuracy: 94.03%

Confusion Matrix:
[[960   0]
 [ 61   1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.02      0.03        62

    accuracy                           0.94      1022
   macro avg       0.97      0.51      0.50      1022
weighted avg       0.94      0.94      0.91      1022



In [13]:
# Train Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluate Decision Tree
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt * 100:.2f}%")

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 91.19%

Confusion Matrix:
[[960   0]
 [ 61   1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       960
           1       0.22      0.18      0.20        62

    accuracy                           0.91      1022
   macro avg       0.58      0.57      0.57      1022
weighted avg       0.90      0.91      0.91      1022



In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Scale numerical features after SMOTE
X_smote[numerical_columns] = scaler.fit_transform(X_smote[numerical_columns])

In [15]:
### Model Training and Evaluation

# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_smote, y_smote)
y_pred_log = log_model.predict(X_test)

# Evaluate Logistic Regression
accuracy_log = accuracy_score(y_test, y_pred_log)
print(f"Logistic Regression Accuracy: {accuracy_log * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_log))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_log))


Logistic Regression Accuracy: 58.41%

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.57      0.72       960
           1       0.11      0.87      0.20        62

    accuracy                           0.58      1022
   macro avg       0.55      0.72      0.46      1022
weighted avg       0.93      0.58      0.69      1022


Confusion Matrix:
[[543 417]
 [  8  54]]


In [16]:
# Random Forest with Hyperparameter Tuning
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, None],  # Limiting maximum depth
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_smote, y_smote)

print("Best Hyperparameters for Random Forest:", grid_search_rf.best_params_)
print("Best Cross-Validated Accuracy:", grid_search_rf.best_score_)

Best Hyperparameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Cross-Validated Accuracy: 0.9397600157689844


In [17]:
# Evaluate Random Forest on Test Set
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy after tuning: {accuracy_rf * 100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy after tuning: 80.43%

Confusion Matrix:
[[803 157]
 [ 43  19]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       960
           1       0.11      0.31      0.16        62

    accuracy                           0.80      1022
   macro avg       0.53      0.57      0.52      1022
weighted avg       0.90      0.80      0.84      1022



In [18]:
# Decision Tree with Reduced Complexity
dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=5)  # Limit complexity
dt_model.fit(X_smote, y_smote)
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt * 100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 56.95%

Confusion Matrix:
[[541 419]
 [ 21  41]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.56      0.71       960
           1       0.09      0.66      0.16        62

    accuracy                           0.57      1022
   macro avg       0.53      0.61      0.43      1022
weighted avg       0.91      0.57      0.68      1022



In [19]:
# Support Vector Machine with Regularization
svm_model = SVC(C=1.0, kernel='linear')  # Regularization parameter C
svm_model.fit(X_smote, y_smote)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 56.36%

Confusion Matrix:
[[522 438]
 [  8  54]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.54      0.70       960
           1       0.11      0.87      0.19        62

    accuracy                           0.56      1022
   macro avg       0.55      0.71      0.45      1022
weighted avg       0.93      0.56      0.67      1022

