In [None]:
import pandas as pd
import os

# Report where the notebook is running so a wrong relative path is easy to spot
print("Current working directory:", os.getcwd())

# List the files in the current directory
print("Files in the current directory:", os.listdir())

# Load the dataset. Fail loudly when the file is missing or empty: every later
# cell needs `df`, and merely printing a message here would only resurface as a
# confusing NameError on `df.shape` below.
file_path = '../data/brain.csv'
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

try:
    # Load the CSV file
    df = pd.read_csv(file_path)
except pd.errors.EmptyDataError:
    print("No columns to parse from file")
    raise
print("File loaded successfully")
df.shape

In [None]:
# Display the loaded DataFrame as this cell's output
df

In [None]:
# Examine data types and missing values
print(df.shape)
# info() writes its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df.info()
# Percentage of missing values per column
print(df.isnull().sum() / len(df) * 100)

# Summary statistics
print(df.describe())
print(df['stroke'].value_counts())

import matplotlib.pyplot as plt
import seaborn as sns

# Visualize distributions: three numeric histograms plus the target counts on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 6))
ax_age, ax_glucose, ax_bmi, ax_stroke = axes.ravel()

histogram_panels = [
    (ax_age, 'age', 'Distribution of Age'),
    (ax_glucose, 'avg_glucose_level', 'Distribution of Average Glucose Level'),
    (ax_bmi, 'bmi', 'Distribution of BMI'),
]
for panel_ax, col_name, panel_title in histogram_panels:
    sns.histplot(df[col_name], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

# The target is categorical, so it gets a count plot instead of a histogram
sns.countplot(x='stroke', data=df, ax=ax_stroke)
ax_stroke.set_title('Distribution of Stroke')

fig.tight_layout()
plt.show()

# Correlation analysis (numeric columns only)
correlation_matrix = df.corr(numeric_only=True)
print(correlation_matrix)

# Categorical feature analysis: class counts per category, split by stroke outcome
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
cat_fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(categorical_features, start=1):
    cat_ax = cat_fig.add_subplot(2, 3, position)
    sns.countplot(x=feature, hue='stroke', data=df, ax=cat_ax)
    cat_ax.set_title(f'Relationship between {feature} and Stroke')
    # Rotate the category labels so longer names do not overlap
    plt.setp(cat_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

# Correlation Analysis (Preliminary): annotated heatmap of the numeric correlations.
# Best-effort step: any failure is reported and the rest of the cell continues.
try:
    import seaborn as sns
    correlation_matrix = df.corr(numeric_only=True)
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(10,8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Matrix')
    plt.show()
except Exception as err:
    print(f"An error occurred during correlation analysis: {err}")

# Outlier Detection (Preliminary) - one box plot per numeric column
numerical_cols = df.select_dtypes(include='number').columns
df.loc[:, numerical_cols].plot.box(
    subplots=True,
    layout=(2, 4),
    figsize=(15, 10),
    title='Box Plots of Numerical Features',
)
plt.tight_layout()
plt.show()

# Target Variable Analysis: absolute counts, percentages, and a bar chart of the classes
stroke_counts = df['stroke'].value_counts()
stroke_share = df['stroke'].value_counts(normalize=True) * 100
print("\nStroke Distribution:\n", stroke_counts)
print("\nPercentage of Stroke:\n", stroke_share)
stroke_counts.plot(kind='bar', title='Stroke Distribution')
plt.show()

In [None]:

def remove_outliers_IQR(data, column):
    """Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR (both inclusive), with the
    quartiles computed from ``data[column]``. Rows whose value is missing are
    dropped as well, because a NaN never satisfies the range check.

    Args:
        data: DataFrame to filter; it is not modified.
        column: Name of the numeric column to test.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    first_quartile, third_quartile = data[column].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread
    within_fences = data[column].between(lower_fence, upper_fence)
    return data[within_fences]

# Filter the frame column by column; each pass recomputes the fences on the
# rows that survived the previous pass.
columns_to_clean = ['bmi', 'avg_glucose_level']  # Replace with your outlier columns
for outlier_col in columns_to_clean:
    df = remove_outliers_IQR(df, column=outlier_col)

print("Data shape after outlier removal:", df.shape)

In [None]:
# Box plots again, this time on the outlier-filtered data
import matplotlib.pyplot as plt
numerical_cols = df.select_dtypes(include='number').columns
df.loc[:, numerical_cols].plot.box(
    subplots=True,
    layout=(2, 4),
    figsize=(15, 10),
    title='Box Plots of Numerical Features',
)
plt.tight_layout()
plt.show()

In [None]:
# Calculate skewness for numerical columns
# numeric_only=True restricts the calculation to the numeric columns of df.
skewness = df.skew(numeric_only=True)
print("Skewness of numerical columns:\n", skewness)

In [None]:
import numpy as np
# Apply log transformation to a column
def log_transform(data, column):
    """Return a copy of ``data`` with ``column`` replaced by ``log1p`` of its values.

    Working on a copy keeps the caller's DataFrame untouched, so re-running the
    calling cell does not stack a second transformation onto an object that
    other variables may still reference, and assigning into a filtered slice
    no longer happens.

    Args:
        data: Input DataFrame; it is not modified.
        column: Name of the numeric column to transform.

    Returns:
        A new DataFrame, identical to ``data`` except for ``column``.
    """
    transformed = data.copy()
    # np.log1p computes log(1 + x), which handles values close to zero
    transformed[column] = np.log1p(transformed[column])
    return transformed

# Columns to log transform (e.g., if avg_glucose_level is skewed)
columns_to_log = ['bmi', 'avg_glucose_level'] # Replace with your skewed columns
for skewed_col in columns_to_log:
    df = log_transform(df, column=skewed_col)

In [None]:
# Display the DataFrame after the log transformation of the skewed columns
df

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Plot distributions of all numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns

# Derive the grid from the number of columns (3 per row, never fewer than the
# original 2 rows). A fixed 2x3 grid raises as soon as a seventh numeric
# column is present.
grid_cols = 3
grid_rows = max(2, -(-len(numerical_cols) // grid_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Display the current DataFrame as this cell's output
df

In [None]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# String-valued columns that must be turned into numeric indicators
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# drop='first' leaves out one indicator per feature to avoid multicollinearity
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_data = encoder.fit_transform(df[categorical_columns])

# Wrap the encoded array in a DataFrame labelled with the generated indicator names
indicator_names = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_data, columns=indicator_names)

# Put the numeric columns and the indicators side by side. The numeric part
# gets a fresh 0..n-1 index first so its rows line up with encoded_df.
numerical_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']
numeric_part = df[numerical_columns].reset_index(drop=True)
df = pd.concat([numeric_part, encoded_df], axis=1)

In [None]:
# Display the DataFrame after one-hot encoding
df

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Plot distributions of all numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns

# Derive the grid from the number of columns (3 per row, never fewer than the
# original 5 rows). The number of one-hot indicators depends on the categories
# found in the data, so a fixed 5x3 grid raises once there are more than 15 columns.
grid_cols = 3
grid_rows = max(5, -(-len(numerical_cols) // grid_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Separate features (X) and target (y)
X = df.drop('stroke', axis=1)
y = df['stroke']

# Split BEFORE resampling. Oversampling the full dataset first would evaluate
# the models on synthetic samples and let points interpolated from test rows
# end up in the training data, which inflates every reported metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training data only; the test set keeps its real class balance
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# copy=True keeps df_smote independent of X_train, so adding the target column
# below cannot end up inside the training features.
df_smote = pd.DataFrame(X_train, columns=X.columns, copy=True)
df_smote['stroke'] = list(y_train)  # positional assignment, no index alignment involved
# Display the new balance of the (training) dataset
stroke_counts_balanced = df_smote['stroke'].value_counts()
print("Count of Stroke occurrences after balancing:")
print(stroke_counts_balanced)
# Proportions after balancing
stroke_proportion_balanced = stroke_counts_balanced / stroke_counts_balanced.sum()
print("\nProportion of Stroke occurrences after balancing:")
print(stroke_proportion_balanced)


In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import numpy as np

# Define objective functions for each model
def objective_rf(trial):
    """Optuna objective for the random forest.

    Samples a hyperparameter set from ``trial`` and returns the mean ROC AUC of
    a 5-fold cross-validation on ``X_train`` / ``y_train``.
    """
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 5, 30, log=True),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        random_state=42,
    )
    auc_per_fold = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return auc_per_fold.mean()

def objective_xgb(trial):
    """Optuna objective for XGBoost.

    Samples a hyperparameter set from ``trial`` and returns the mean ROC AUC of
    a 5-fold cross-validation on ``X_train`` / ``y_train``.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
    )
    # The seed and the evaluation metric are fixed, not tuned
    model = XGBClassifier(random_state=42, eval_metric='logloss', **search_space)
    auc_per_fold = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return auc_per_fold.mean()


# Create and run Optuna studies. The samplers are seeded so that a fresh
# kernel run explores the same trials and reports the same best parameters.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=100)

study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=100)


# Train the best models with their optimal parameters.
# best_params only contains the values suggested through `trial`; the fixed
# settings used inside the objectives (random_state, eval_metric) have to be
# passed again, otherwise the final models differ from the tuned configuration.
best_rf = RandomForestClassifier(**study_rf.best_params, random_state=42)
best_rf.fit(X_train, y_train)

best_xgb = XGBClassifier(**study_xgb.best_params, random_state=42, eval_metric='logloss')
best_xgb.fit(X_train, y_train)



# Print the best hyperparameters and scores
print("Random Forest:")
print(f"Best Hyperparameters: {study_rf.best_params}")
print(f"Best Score: {study_rf.best_value}")

print("\nXGBoost:")
print(f"Best Hyperparameters: {study_xgb.best_params}")
print(f"Best Score: {study_xgb.best_value}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc
)

# Assuming you have the models trained and X_test, y_test available
# If not, add the code to split your data into train/test sets

# Function to make predictions
def get_predictions(model, X_test):
    """Return the predicted labels and a continuous score for ``X_test``.

    The score is the second column of ``predict_proba`` when the model offers
    that method, otherwise the output of ``decision_function``, otherwise
    ``None``. Availability is checked up front rather than with a bare
    ``except``, so a genuine failure inside ``predict_proba`` is raised instead
    of being silently replaced by a fallback.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Samples to score.

    Returns:
        Tuple ``(y_pred, y_pred_proba)``; ``y_pred_proba`` may be ``None``.
    """
    # Predict class labels
    y_pred = model.predict(X_test)

    # Get a continuous score for the ROC curve
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = None
    return y_pred, y_pred_proba

# Function to calculate and print metrics
def calculate_metrics(y_true, y_pred, model_name):
    """Print a performance summary for one model and return its headline scores.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Label used in the printed header.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )
    accuracy, precision, recall, f1 = scores

    print(f"--- {model_name} Performance Metrics ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    return scores

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap."""
    counts = confusion_matrix(y_true, y_pred)
    _, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
    cm_ax.set_title(f'Confusion Matrix - {model_name}')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')
    plt.show()

# Function to plot ROC curve
def plot_roc_curve(y_true, y_scores, model_names):
    """Overlay the ROC curves of several models on a single figure.

    Args:
        y_true: Ground-truth labels shared by all models.
        y_scores: One score array per model; ``None`` entries are skipped.
        model_names: Legend labels, in the same order as ``y_scores``.
    """
    plt.figure(figsize=(10, 8))

    for model_name, y_score in zip(model_names, y_scores):
        if y_score is None:
            continue
        fpr, tpr, _ = roc_curve(y_true, y_score)
        plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {auc(fpr, tpr):.4f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

# Function to compare models
def compare_models(metrics_dict):
    """Draw a grouped bar chart comparing the headline metrics of several models.

    Args:
        metrics_dict: Maps a model name to its four scores in the order
            accuracy, precision, recall, F1.
    """
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    fig, ax = plt.subplots(figsize=(12, 8))
    bar_width = 0.2
    index = np.arange(len(metrics))

    for i, (model_name, model_metrics) in enumerate(metrics_dict.items()):
        ax.bar(index + i * bar_width, model_metrics, bar_width, label=model_name)

    # Put each tick under the middle of its bar group. A fixed offset of one
    # bar width is only centred when exactly three models are compared.
    group_offset = bar_width * max(len(metrics_dict) - 1, 0) / 2

    ax.set_xlabel('Metrics')
    ax.set_ylabel('Score')
    ax.set_title('Model Comparison')
    ax.set_xticks(index + group_offset)
    ax.set_xticklabels(metrics)
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Make predictions for each model
rf_pred, rf_pred_proba = get_predictions(best_rf, X_test)
xgb_pred, xgb_pred_proba = get_predictions(best_xgb, X_test)

model_names = ["Random Forest", "XGBoost"]

# Calculate metrics for each model, separated by a horizontal rule
section_rule = "\n" + "="*50
print(section_rule)
rf_metrics = calculate_metrics(y_test, rf_pred, "Random Forest")
print(section_rule)
xgb_metrics = calculate_metrics(y_test, xgb_pred, "XGBoost")


# Plot confusion matrices
for label, predictions in zip(model_names, (rf_pred, xgb_pred)):
    plot_confusion_matrix(y_test, predictions, label)


# Plot ROC curves
pred_probas = [rf_pred_proba, xgb_pred_proba]
plot_roc_curve(y_test, pred_probas, model_names)

# Compare models
metrics_dict = dict(zip(model_names, (rf_metrics, xgb_metrics)))
compare_models(metrics_dict)

# Plot feature importance for tree-based models (RF and XGB)
def plot_feature_importance(model, feature_names, model_name, top_n=10):
    """Plot the largest feature importances of a fitted model as a bar chart.

    Args:
        model: Estimator; it must expose ``feature_importances_``, otherwise a
            message is printed and nothing is plotted.
        feature_names: Names indexed in the same order as the importances.
        model_name: Label used in the title and in the fallback message.
        top_n: Maximum number of features to show. It is capped at the number
            of available features so the bar positions always match the data.
    """
    if not hasattr(model, 'feature_importances_'):
        print(f"{model_name} does not have feature_importances_ attribute")
        return

    importances = np.asarray(model.feature_importances_)

    # Cap the bar count: range(top_n) with fewer than top_n importances would
    # give matplotlib mismatched x and height arrays.
    n_shown = max(0, min(top_n, len(importances)))

    # Indices of the n_shown largest importances, in descending order
    top_indices = np.argsort(importances)[::-1][:n_shown]
    top_importances = importances[top_indices]
    top_features = [feature_names[i] for i in top_indices]

    # Plot feature importances
    plt.figure(figsize=(10, 6))
    plt.title(f'Top {n_shown} Feature Importances - {model_name}')
    plt.bar(range(n_shown), top_importances, align='center')
    plt.xticks(range(n_shown), top_features, rotation=90)
    plt.tight_layout()
    plt.show()

# Label the bars with the real column names so the plots are interpretable.
# Positional placeholder names are only used if X_test carries no column labels.
if hasattr(X_test, 'columns'):
    feature_names = list(X_test.columns)
else:
    feature_names = [f'feature_{i}' for i in range(X_test.shape[1])]

# Plot feature importance
plot_feature_importance(best_rf, feature_names, "Random Forest")
plot_feature_importance(best_xgb, feature_names, "XGBoost")