In [8]:
import pandas as pd  # For handling and manipulating data
from sklearn.preprocessing import MinMaxScaler  # For normalizing data
from sklearn.model_selection import train_test_split  # For splitting data into training and validation sets

In [9]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# drop irrelevant columns
train_data_cleaned = train_data.drop(columns=['id', 'ever_married_No', 'Residence_type_Rural'])
# remove outliers
train_data_cleaned = train_data_cleaned[
    (train_data_cleaned['bmi'] >= 10) & (train_data_cleaned['bmi'] <= 50) &
    (train_data_cleaned['avg_glucose_level'] >= 55) & (train_data_cleaned['avg_glucose_level'] <= 250)
]
from sklearn.preprocessing import MinMaxScaler

# Columns to normalize
columns_to_standardize = ['age', 'bmi', 'avg_glucose_level']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply the scaler to the relevant columns
train_data_cleaned[columns_to_standardize] = scaler.fit_transform(train_data_cleaned[columns_to_standardize])
# train test split here
# Defining features and target
X = train_data_cleaned.drop(columns='stroke')
y = train_data_cleaned['stroke']

# Splitting into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [None]:
# Import necessary libraries
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.neural_network import MLPClassifier

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32), (128, 64)],  # Different architectures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],       # Optimizers
    'learning_rate_init': [0.001, 0.01],  # Initial learning rates
    'max_iter': [200, 300]           # Training iterations
}

# Define the model
mlp = MLPClassifier(random_state=42)

# Define the F1 scorer
f1_scorer = make_scorer(f1_score)

# Define GridSearchCV
grid_search = GridSearchCV(estimator=mlp, 
                           param_grid=param_grid, 
                           scoring=f1_scorer, 
                           cv=skf, 
                           verbose=2, 
                           n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score (Train CV): {grid_search.best_score_}")

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print(f"F1 Score on the Test Set: {test_f1:.10f}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.001, 'max_iter': 300, 'solver': 'adam'}
Best F1 Score (Train CV): 0.026060400688542613
F1 Score on the Test Set: 0.0


In [13]:
# Import necessary libraries for balancing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.neural_network import MLPClassifier

# Define the target sampling strategy
smote = SMOTE(random_state=42, sampling_strategy=0.99)
undersample = RandomUnderSampler(sampling_strategy=0.99, random_state=42)  # Target ratio of majority:minority = 0.99

# Combine SMOTE and undersampling in a pipeline
resampling_pipeline = Pipeline([
    ('smote', smote),
    ('undersample', undersample)
])

# Resample the training data
X_train_balanced, y_train_balanced = resampling_pipeline.fit_resample(X_train, y_train)

# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32), (128, 64)],  # Different architectures
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],       # Optimizers
    'learning_rate_init': [0.001, 0.01],  # Initial learning rates
    'max_iter': [200, 300]           # Training iterations
}

# Define the model
mlp = MLPClassifier(random_state=42)

# Define the F1 scorer
f1_scorer = make_scorer(f1_score)

# Define GridSearchCV
grid_search = GridSearchCV(estimator=mlp, 
                           param_grid=param_grid, 
                           scoring=f1_scorer, 
                           cv=skf, 
                           verbose=2, 
                           n_jobs=-1)

# Perform the grid search on the resampled data
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score (Train CV): {grid_search.best_score_}")

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)

print(f"F1 Score on the Test Set: {test_f1}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.01, 'max_iter': 300, 'solver': 'adam'}
Best F1 Score (Train CV): 0.936839081173334
F1 Score on the Test Set: 0.07792207792207792
