In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('dataset.csv')


# Check for missing values and handle them
data = data.dropna()

# Convert categorical columns to numerical values
data = pd.get_dummies(data, drop_first=True)

# Separate the dependent variable (e.g., stroke) from the independent variables
X = data.drop('stroke', axis=1)  # Independent variables
Y = data['stroke']               # Dependent variable

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Standardize the data (important for PCA)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA to reduce dimensionality (using 2 components)
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Hyperparameter tuning for Random Forest using Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}
ranfor_model = RandomForestClassifier(n_estimators=100, random_state=42)
ranfor_model.fit(X_train_pca, Y_train)

y_pred = ranfor_model.predict(X_test_pca)
y_ranfor_prob = ranfor_model.predict_proba(X_test_pca)

cm = confusion_matrix(Y_test, Y_pred)

accuracy = accuracy_score(Y_test, Y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1])  # Adjust labels accordingly
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title(f"Confusion Matrix (Accuracy: {accuracy:.2f})")
plt.show()

# Print best hyperparameters
print(f"Model Accuracy: {accuracy:.2f}")



In [11]:
ranfor_model = RandomForestClassifier(n_estimators=100, random_state=42)
ranfor_model.fit(X_train_pca, Y_train)

y_pred = ranfor_model.predict(X_test_pca)
y_ranfor_prob = ranfor_model.predict_proba(X_test_pca)

cm = confusion_matrix(Y_test, Y_pred)

accuracy = accuracy_score(Y_test, Y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1])  # Adjust labels accordingly
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title(f"Confusion Matrix (Accuracy: {accuracy:.2f})")
plt.show()

# Print best hyperparameters
print(f"Model Accuracy: {accuracy:.2f}")


ValueError: Found input variables with inconsistent numbers of samples: [5815, 982]

In [13]:
data.shape

(29072, 17)

In [15]:
data.sample(5)

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_never smoked,smoking_status_smokes
17315,33152,69.0,0,0,69.65,27.8,0,False,False,True,False,True,False,False,False,True,False
20252,36632,69.0,0,0,62.08,33.9,0,True,False,False,False,False,False,False,True,True,False
10774,43859,48.0,0,0,83.04,64.2,0,False,False,True,False,False,True,False,True,True,False
24101,33429,26.0,0,0,88.16,31.8,0,False,False,True,False,False,True,False,True,True,False
32861,56256,41.0,0,0,75.39,31.7,0,False,False,True,False,False,False,False,False,True,False
