# Import

In [2]:
import pandas as pd  # For handling and manipulating data
from sklearn.preprocessing import MinMaxScaler  # For normalizing data
from sklearn.model_selection import train_test_split  # For splitting data into training and validation sets

# Load

In [3]:
train_data = pd.read_csv(r'C:\Users\aksha\Documents\ML_Python\Main\train.csv')
test_data = pd.read_csv(r'C:\Users\aksha\Documents\ML_Python\Main\test.csv')
# drop irrelevant columns
train_data_cleaned = train_data.drop(columns=['id', 'ever_married_No', 'Residence_type_Rural'])
# remove outliers
train_data_cleaned = train_data_cleaned[
    (train_data_cleaned['bmi'] >= 10) & (train_data_cleaned['bmi'] <= 50) &
    (train_data_cleaned['avg_glucose_level'] >= 55) & (train_data_cleaned['avg_glucose_level'] <= 250)
]
from sklearn.preprocessing import MinMaxScaler

# Columns to normalize
columns_to_standardize = ['age', 'bmi', 'avg_glucose_level']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply the scaler to the relevant columns
train_data_cleaned[columns_to_standardize] = scaler.fit_transform(train_data_cleaned[columns_to_standardize])
# train test split here
# Defining features and target
X = train_data_cleaned.drop(columns='stroke')
y = train_data_cleaned['stroke']

# Splitting into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

# Test 1

In [None]:
# Import necessary libraries
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.neural_network import MLPClassifier

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32), (128, 64)],  # Different architectures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],       # Optimizers
    'learning_rate_init': [0.001, 0.01],  # Initial learning rates
    'max_iter': [200, 300]           # Training iterations
}

# Define the model
mlp = MLPClassifier(random_state=42)

# Define the F1 scorer
f1_scorer = make_scorer(f1_score)

# Define GridSearchCV
grid_search = GridSearchCV(estimator=mlp, 
                           param_grid=param_grid, 
                           scoring=f1_scorer, 
                           cv=skf, 
                           verbose=2, 
                           n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score (Train CV): {grid_search.best_score_}")

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print(f"F1 Score on the Test Set: {test_f1:.10f}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.001, 'max_iter': 300, 'solver': 'adam'}
Best F1 Score (Train CV): 0.026060400688542613
F1 Score on the Test Set: 0.0


# Test 2

In [13]:
# Import necessary libraries for balancing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.neural_network import MLPClassifier

# Define the target sampling strategy
smote = SMOTE(random_state=42, sampling_strategy=0.99)
undersample = RandomUnderSampler(sampling_strategy=0.99, random_state=42)  # Target ratio of majority:minority = 0.99

# Combine SMOTE and undersampling in a pipeline
resampling_pipeline = Pipeline([
    ('smote', smote),
    ('undersample', undersample)
])

# Resample the training data
X_train_balanced, y_train_balanced = resampling_pipeline.fit_resample(X_train, y_train)

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32), (128, 64)],  # Different architectures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],       # Optimizers
    'learning_rate_init': [0.001, 0.01],  # Initial learning rates
    'max_iter': [200, 300]           # Training iterations
}

# Define the model
mlp = MLPClassifier(random_state=42)

# Define the F1 scorer
f1_scorer = make_scorer(f1_score)

# Define GridSearchCV
grid_search = GridSearchCV(estimator=mlp, 
                           param_grid=param_grid, 
                           scoring=f1_scorer, 
                           cv=skf, 
                           verbose=2, 
                           n_jobs=-1)

# Perform the grid search on the resampled data
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score (Train CV): {grid_search.best_score_}")

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print(f"F1 Score on the Test Set: {test_f1}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.01, 'max_iter': 300, 'solver': 'adam'}
Best F1 Score (Train CV): 0.936839081173334
F1 Score on the Test Set: 0.07792207792207792


# Test 3

In [8]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Resampling
smote = BorderlineSMOTE(random_state=42, sampling_strategy=0.99)
undersample = RandomUnderSampler(sampling_strategy=0.99, random_state=42)

# Create a resampling pipeline
resampling_pipeline = Pipeline([
    ('smote', smote),
    ('undersample', undersample)
])

# Resample the training data
X_train_balanced, y_train_balanced = resampling_pipeline.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32), (128, 64)],  # Neural network architectures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],       # Optimizers
    'learning_rate_init': [0.001, 0.01],  # Initial learning rates
    'max_iter': [200, 300]           # Training iterations
}

# Define the model
mlp = MLPClassifier(random_state=42)

# Define the F1 scorer
f1_scorer = make_scorer(f1_score)

# Define GridSearchCV
grid_search = GridSearchCV(estimator=mlp, 
                           param_grid=param_grid, 
                           scoring=f1_scorer, 
                           cv=skf, 
                           verbose=2, 
                           n_jobs=-1)

# Perform the grid search on the resampled data
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score (Train CV): {grid_search.best_score_}")

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print(f"F1 Score on the Test Set: {test_f1}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'activation': 'tanh', 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.001, 'max_iter': 200, 'solver': 'adam'}
Best F1 Score (Train CV): 0.9759722406681961
F1 Score on the Test Set: 0.07058823529411765




# Test 4

In [9]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score, classification_report
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Tip 1: Resampling (SMOTE + Undersampling Pipeline)
smote = BorderlineSMOTE(random_state=42, sampling_strategy=0.99)
undersample = RandomUnderSampler(sampling_strategy=0.99, random_state=42)
resampling_pipeline = Pipeline([
    ('smote', smote),
    ('undersample', undersample)
])

X_train_balanced, y_train_balanced = resampling_pipeline.fit_resample(X_train, y_train)

# Tip 2: Feature Scaling (RobustScaler handles outliers better than StandardScaler)
scaler = RobustScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

# Tip 3: Dynamic Feature Selection
num_features = X_train.shape[1]
k_best_features = min(num_features, 10)  # Dynamically adjust for available features
feature_selector = SelectKBest(mutual_info_classif, k=k_best_features)
X_train_balanced = feature_selector.fit_transform(X_train_balanced, y_train_balanced)
X_test = feature_selector.transform(X_test)

# Tip 4: Stratified K-Fold for Stability
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tip 5: Define Parameter Grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(64, 32), (128, 64, 32), (256, 128, 64)],  # Larger architectures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam'],  # Use adam for stable convergence
    'learning_rate_init': [0.001, 0.005, 0.01],  # Learning rate
    'max_iter': [300, 500],  # Allow more training iterations
    'alpha': [0.0001, 0.001]  # L2 regularization for preventing overfitting
}

# Tip 6: Use a Custom F1 Scorer
f1_scorer = make_scorer(f1_score, average='binary')

# Tip 7: Use GridSearchCV for Hyperparameter Optimization
mlp = MLPClassifier(random_state=42)
grid_search = GridSearchCV(estimator=mlp, 
                           param_grid=param_grid, 
                           scoring=f1_scorer, 
                           cv=skf, 
                           verbose=2, 
                           n_jobs=-1)

# Train the model
grid_search.fit(X_train_balanced, y_train_balanced)

# Display the best parameters and best training F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score (Train CV):", grid_search.best_score_)

# Tip 8: Test the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print("\nF1 Score on the Test Set:", test_f1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Tip 9 (Bonus): Save the model for future use
import joblib
joblib.dump(best_model, "best_mlp_model.pkl")




Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (256, 128, 64), 'learning_rate_init': 0.001, 'max_iter': 300, 'solver': 'adam'}
Best F1 Score (Train CV): 0.9603167148538707

F1 Score on the Test Set: 0.04838709677419355

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.96      3246
           1       0.03      0.12      0.05        50

    accuracy                           0.93      3296
   macro avg       0.51      0.53      0.51      3296
weighted avg       0.97      0.93      0.95      3296





['best_mlp_model.pkl']

In [10]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import f1_score, classification_report, make_scorer
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
import joblib
from scipy.stats import randint, uniform

# Assuming X_train, X_test, y_train, y_test are already defined
# Example: X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

# Resampling technique
smote_enn = SMOTEENN(smote=SMOTE(sampling_strategy=0.99, random_state=42, n_jobs=-1),
                     enn=EditedNearestNeighbours(n_jobs=-1))

# Preprocessing steps
scaler = RobustScaler()
normalizer = PowerTransformer()

# Feature Selection
num_features = X_train.shape[1]
k_best = SelectKBest(mutual_info_classif, k=min(num_features, 10))  # Limited to top 10 features

# Conditional PCA (Disabled here to reduce complexity, can be re-enabled if needed)
pca = None  # Remove PCA for simplicity

# Models for ensemble learning with simplified settings
mlp = MLPClassifier(random_state=42, max_iter=500, hidden_layer_sizes=(128, 64), alpha=0.001)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, tree_method='hist', n_estimators=100)
catboost = CatBoostClassifier(verbose=0, random_state=42, thread_count=-1, depth=6, learning_rate=0.05)
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=200, max_depth=10)
svc_clf = SVC(random_state=42, probability=True, kernel='linear', C=1)

# Voting Classifier (Reduced model complexity)
voting_clf = VotingClassifier(estimators=[
    ('mlp', mlp),
    ('xgb', xgb),
    ('catboost', catboost),
    ('rf', rf_clf),
    ('svc', svc_clf)
], voting='soft', n_jobs=-1)

# Pipeline definition
steps = [
    ('scaler', scaler),
    ('normalizer', normalizer),
    ('resample', smote_enn),
    ('feature_selector', k_best),
]

# Add PCA if applicable (not included here)
if pca:
    steps.append(('pca', pca))

# Add the model at the end of the pipeline
steps.append(('model', voting_clf))

# Create the pipeline
final_pipeline = Pipeline(steps)

# RandomizedSearchCV Hyperparameter Tuning for faster optimization
param_distributions = {
    'model__mlp__hidden_layer_sizes': [(128, 64), (256, 128)],
    'model__mlp__alpha': uniform(0.0001, 0.01),
    'model__xgb__n_estimators': randint(100, 200),
    'model__xgb__learning_rate': uniform(0.01, 0.1),
    'model__xgb__max_depth': randint(3, 6),
    'model__catboost__depth': randint(6, 8),
    'model__catboost__learning_rate': uniform(0.01, 0.05),
    'model__rf__n_estimators': randint(100, 200),
    'model__rf__max_depth': randint(5, 10),
    'model__svc__C': uniform(0.1, 10),
    'model__svc__gamma': ['scale', 'auto']
}

# Custom F1 Scorer
f1_scorer = make_scorer(f1_score, average='binary')

# Stratified K-Fold Cross-Validation (Reduced to 3 folds to speed up computation)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# RandomizedSearchCV with reduced iterations
random_search = RandomizedSearchCV(estimator=final_pipeline, 
                                   param_distributions=param_distributions, 
                                   scoring=f1_scorer, 
                                   cv=skf, 
                                   verbose=2, 
                                   n_iter=20,  # Reduced iterations for faster tuning
                                   n_jobs=-1,
                                   random_state=42)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Output the best parameters and F1 score
print("\nBest Parameters:", random_search.best_params_)
print("\nBest F1 Score (Train CV):", random_search.best_score_)

# Evaluate on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print("\nF1 Score on the Test Set:", test_f1)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the model
joblib.dump(best_model, "optimized_f1_model_1hour.pkl")

Fitting 3 folds for each of 20 candidates, totalling 60 fits





Best Parameters: {'model__catboost__depth': 7, 'model__catboost__learning_rate': 0.040351712384334236, 'model__mlp__alpha': 0.0028599918202254337, 'model__mlp__hidden_layer_sizes': (128, 64), 'model__rf__max_depth': 9, 'model__rf__n_estimators': 164, 'model__svc__C': 0.2563640674119393, 'model__svc__gamma': 'scale', 'model__xgb__learning_rate': 0.08722447692966574, 'model__xgb__max_depth': 5, 'model__xgb__n_estimators': 110}

Best F1 Score (Train CV): 0.09792910223424484

F1 Score on the Test Set: 0.11252268602540835

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.86      0.92      3246
           1       0.06      0.62      0.11        50

    accuracy                           0.85      3296
   macro avg       0.53      0.74      0.52      3296
weighted avg       0.98      0.85      0.91      3296



['optimized_f1_model_1hour.pkl']