In [63]:
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import datetime
import warnings
warnings.filterwarnings('ignore')

In [64]:
RANDOM_STATE = 42
TARGET = 'FLAG_APPROVED'

In [65]:
# Create a results directory if it doesn't exist
os.makedirs('model_results', exist_ok=True)

# Generate a timestamp for filenames
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

In [66]:
# Load the training and testing data
X_train = pd.read_csv('PreprocessedData/Train_data.csv')
y_train = X_train.pop(TARGET)  # Assuming the target column is named 'target'

X_test = pd.read_csv('PreprocessedData/Test_data.csv')
y_test = X_test.pop(TARGET)  # Assuming the target column is named 'target'

In [67]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25519 entries, 0 to 25518
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FLAG_MALE            25519 non-null  bool   
 1   FLAG_OWN_CAR         25519 non-null  bool   
 2   FLAG_OWN_REALTY      25519 non-null  bool   
 3   CNT_CHILDREN         25519 non-null  int64  
 4   AMT_INCOME_TOTAL     25519 non-null  float64
 5   NAME_INCOME_TYPE     25519 non-null  object 
 6   NAME_EDUCATION_TYPE  25519 non-null  object 
 7   NAME_FAMILY_STATUS   25519 non-null  object 
 8   NAME_HOUSING_TYPE    25519 non-null  object 
 9   DAYS_EMPLOYED        25519 non-null  int64  
 10  FLAG_WORK_PHONE      25519 non-null  bool   
 11  FLAG_PHONE           25519 non-null  bool   
 12  FLAG_EMAIL           25519 non-null  bool   
 13  OCCUPATION_TYPE      17589 non-null  object 
 14  CNT_FAM_MEMBERS      25519 non-null  float64
 15  FLAG_EMPLOYED        25519 non-null 

In [68]:
# Identify categorical columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Perform one-hot encoding on categorical columns
X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

# Align columns of X_test with X_train to ensure consistency
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [69]:
# Save the dataframes
X_train.to_csv('PreprocessedData/X_train_OHE.csv', index=False)
X_test.to_csv('PreprocessedData/X_test_OHE.csv', index=False)

In [None]:
classifier_dict = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'SVC': SVC(random_state=RANDOM_STATE),
}

In [71]:
param_grid = {
    'LogisticRegression': {
        'clf__C': [0.01, 0.1, 1.0, 10.0],
        'clf__penalty': ['l1', 'l2', None],
        'clf__solver': ['saga', 'liblinear'],
        'clf__class_weight': ['balanced'],
    },
    
    'SVC': {
        'clf__C': [0.1, 1.0, 10.0, 100.0],
        'clf__kernel': ['linear', 'rbf'],
        'clf__gamma': ['scale', 'auto'],
        'clf__class_weight': ['balanced'],
    },
    
    'RandomForestClassifier': {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [5, 10, 15, None],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__max_features': ['sqrt', 'log2', None],
        'clf__class_weight': ['balanced'],
    },
    
    'GradientBoostingClassifier': {
        'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [3, 5, 7, 9],
        'clf__min_samples_split': [2, 5, 10],
        'clf__subsample': [0.8, 0.9, 1.0]
    }
}

In [72]:
# Define scoring metric
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}
# Store results for all classifiers
all_results = {}
all_cv_results_dfs = []
best_configs = []

In [None]:

for name, classifier in classifier_dict.items():
    print('-' * 60)
    print(f"Training {name}")
    print('-' * 60)
    try:
        # Create the pipeline
        pipeline = Pipeline([
            ('normalizer', MinMaxScaler()),
            ('clf', classifier)
        ])
        # Create and run GridSearchCV with accuracy as the main metric
        grid_search = GridSearchCV(
            pipeline,
            param_grid=param_grid[name],
            cv=5,  # 5-fold cross-validation
            scoring=scoring,
            refit='accuracy',  # Now using accuracy as the primary metric
            return_train_score=True,
            verbose=1,
            n_jobs=-1  # Use all available cores
        )
        
        # Assign grid_search to random_search for compatibility with the rest of the code
        random_search = grid_search
        
        # Fit the random search
        random_search.fit(X_train, y_train)
        
        # Save results
        print(f"\nBest parameters for {name}:")
        print(random_search.best_params_)
        print(f"Best Accuracy: {random_search.best_score_:.4f}")
        
        # Get all results
        cv_results = pd.DataFrame(random_search.cv_results_)
        
        # Add classifier name to the results
        cv_results['classifier'] = name
        
        # Collect all CV results
        all_cv_results_dfs.append(cv_results)
        
        # Store best configuration
        all_results[name] = {
            'best_params': random_search.best_params_,
            'best_score': random_search.best_score_,
            'best_estimator': random_search.best_estimator_
        }
        
        # Add best configuration to our list
        best_idx = random_search.best_index_
        best_config = {
            'classifier': name,
            'rank': 1,  # It's the best for this classifier
            'accuracy': random_search.best_score_
        }
        
        # Add all metrics
        for metric in scoring.keys():
            best_config[f'{metric}'] = cv_results[f'mean_test_{metric}'][best_idx]
            best_config[f'{metric}_std'] = cv_results[f'std_test_{metric}'][best_idx]
        
        # Add parameters
        for param, value in random_search.best_params_.items():
            best_config[param] = value
        
        best_configs.append(best_config)
        
        # Print detailed metrics for the best configuration
        print("\nDetailed metrics for best configuration:")
        for metric in scoring.keys():
            mean = cv_results[f'mean_test_{metric}'][best_idx]
            std = cv_results[f'std_test_{metric}'][best_idx]
            print(f"{metric}: {mean:.4f} (±{std:.4f})")
    except Exception as e:
        print(f"Error occurred while training {name}: {e}")

# Find the best overall classifier based on accuracy
best_classifier = max(all_results.items(), key=lambda x: x[1]['best_score'])
print("\n" + "=" * 60)
print(f"Best overall classifier: {best_classifier[0]}")
print(f"Best parameters: {best_classifier[1]['best_params']}")
print(f"Best Accuracy: {best_classifier[1]['best_score']:.4f}")
print("=" * 60)

# Combine all CV results
all_cv_results = pd.concat(all_cv_results_dfs, ignore_index=True)

# Create a DataFrame of best configurations
best_configs_df = pd.DataFrame(best_configs)

# Sort by accuracy
best_configs_df = best_configs_df.sort_values('accuracy', ascending=False)

# Save the results to CSV
all_cv_results.to_csv(f'model_results/all_cv_results_{timestamp}.csv', index=False)
best_configs_df.to_csv(f'model_results/best_configs_{timestamp}.csv', index=False)

print(f"\nResults saved to CSV files in the 'model_results' directory")

# Print summary table
print("\nSummary of best configurations for each classifier:")
summary_columns = ['classifier', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
print(best_configs_df[summary_columns].to_string(index=False))

# Save the best overall model
from joblib import dump
best_model = best_classifier[1]['best_estimator']
dump(best_model, f'model_results/best_model_{timestamp}.joblib')
print(f"\nBest model saved to 'model_results/best_model_{timestamp}.joblib'")

# Create a run configuration file with all parameters
run_config = {
    'timestamp': timestamp,
    'best_classifier': best_classifier[0],
    'best_score': best_classifier[1]['best_score'],
    'best_params': best_classifier[1]['best_params'],
    'metric': 'accuracy'  # Document which metric was used for selection
}

# Save as a JSON file
import json
with open(f'model_results/run_config_{timestamp}.json', 'w') as f:
    json.dump(run_config, f, indent=4)

print(f"Run configuration saved to 'model_results/run_config_{timestamp}.json'")

------------------------------------------------------------
Training LogisticRegression
------------------------------------------------------------
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best parameters for LogisticRegression:
{'clf__C': 0.1, 'clf__class_weight': 'balanced', 'clf__penalty': 'l1', 'clf__solver': 'saga'}
Best Accuracy: 0.5160

Detailed metrics for best configuration:
accuracy: 0.5160 (±0.0056)
precision: 0.5339 (±0.0051)
recall: 0.4907 (±0.0291)
f1: 0.5110 (±0.0166)
roc_auc: 0.5210 (±0.0071)
------------------------------------------------------------
Training SVC
------------------------------------------------------------
Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 