In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek

# Step 1: Load the dataset
# Replace 'your_dataset.csv' with the actual file path
data = pd.read_csv("Dev_data_to_be_shared.csv")

# Step 2: Separate features (X) and target variable (y)
# Replace 'bad_flag' with the name of your target column
X = data.drop(columns=["bad_flag","account_number"])  # Features
y = data["bad_flag"] 
X.fillna(0, inplace=True)# Target variable

# Step 3: Train-Test Split
# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Step 4: Apply SMOTE-Tomek to balance the classes in the training set
# smt = SMOTETomek(random_state=42)
# X_train_resampled, y_train_resampled = smt.fit_resample(X_train, y_train)

# Step 5: Scale the features
scaler = StandardScaler()
X_train_original = scaler.fit_transform(X_train)  # Scale the resampled training data
X_test_original = scaler.transform(X_test)
X_original = scaler.transform(X)# Scale the test data

# Print data shapes for verification
print(f"Original Training Set Shape (X_train): {X_train.shape}, (y_train): {y_train.shape}")
# print(f"Resampled Training Set Shape (X_train_resampled): {X_train_resampled.shape}, (y_train_resampled): {y_train.shape}")
print(f"Test Set Shape (X_test): {X_test.shape}, (y_test): {y_test.shape}")



Original Training Set Shape (X_train): (48403, 1214), (y_train): (48403,)
Test Set Shape (X_test): (48403, 1214), (y_test): (48403,)


In [2]:
print(X_resampled_scaled.shape)
print(y_resampled.shape)

NameError: name 'X_resampled_scaled' is not defined

In [None]:
import joblib  # Library for saving and loading models
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import numpy as np

# Resample using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# # Convert the dataset to float32 to reduce memory usage
# X_resampled = X_resampled.astype(np.float32)
# X_test = X_test.astype(np.float32)

# Standardize features
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Define parameter grid
param_grid = {
    'hidden_layer_sizes': [(128,), (256,), (512,), (256, 128), (512, 256)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [10],
    'early_stopping': [True]
}

# Initialize MLPClassifier
mlp = MLPClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit Grid Search
grid_search.fit(X_resampled_scaled, y_resampled)

# Save all results
results = pd.DataFrame(grid_search.cv_results_)
results.to_csv('mlp_hyperparameter_tuning_results.csv', index=False)

# Print total models tested and top 5 models
print(f"Total models tested: {len(results)}")
print("Top 5 models:")
print(results[['mean_test_score', 'std_test_score', 'params']].sort_values(by='mean_test_score', ascending=False).head())

# Save the best model
best_model = grid_search.best_estimator_
joblib.dump(best_model, "best_mlp_model.joblib")
print("Best model saved as 'best_mlp_model.joblib'")

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
