In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping
import optuna
from optuna.integration import TFKerasPruningCallback
import tensorflow as tf

# Load dataset
train_data = pd.read_csv('Training_dt.csv')
train_data = train_data.drop('Mask No', axis=1)
test_data = pd.read_csv('Testing_dt.csv')
test_data = test_data.drop('Mask No', axis=1)

# # Define columns
# numerical_cols = ['LEASE_TENOR_INCLUDING_HP', 'YOM', 'CUSTOMER AGE', 'Exp']
# categorical_cols = ['PRODUCT_NAME', 'Sub_purpose_code_based_on_risk', 'CRIB_SCORE', 'TOTAL INCOME',
#                     'Percentage_of_Total_Current_Balance_to_Total_Amount_Granted_Limit_slabs',
#                     'Percentage_of_Total_Arrears_Amount_to_Total_Amount_Granted_Limit_slabs',
#                     'Percentage_of_Total_Installments_to_Total_Current_Balance_slabs']
# target_col = 'Cluster'


# Define columns
numerical_cols = ['LEASE_TENOR_INCLUDING_HP', 'CUSTOMER AGE', 'Exp', 'YOM']
categorical_cols = ['PRODUCT_NAME', 'Sub_purpose_code_based_on_risk', 'CRIB_SCORE','TOTAL INCOME',
                    'Percentage_of_Total_Installments_to_Total_Current_Balance_slabs',
                    'Percentage_of_Total_Current_Balance_to_Total_Amount_Granted_Limit_slabs',
                    'Percentage_of_Total_Arrears_Amount_to_Total_Amount_Granted_Limit_slabs']
target_col = 'Cluster'



# Define X and y
X_train = train_data[numerical_cols + categorical_cols]
y_train = train_data[target_col]
X_test = test_data[numerical_cols + categorical_cols]
y_test = test_data[target_col]

# Encode categorical columns
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTEENN to handle class imbalance
smoteenn = SMOTEENN(sampling_strategy='auto')
X_resampled, y_resampled = smoteenn.fit_resample(X_train_processed, y_train)

# Encode target variable
label_encoder = LabelEncoder()
y_resampled = label_encoder.fit_transform(y_resampled)
y_test_encoded = label_encoder.transform(y_test)

# Define ML algorithms
ml_algorithms = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Cross-validation to select the best ML algorithm
best_algorithm = None
best_score = -np.inf

for name, clf in ml_algorithms.items():
    try:
        scores = cross_val_score(clf, X_resampled, y_resampled, 
                                 cv=StratifiedKFold(n_splits=5), 
                                 scoring='roc_auc_ovr')
        mean_score = np.mean(scores)
        print(f"{name}: ROC AUC = {mean_score:.4f}")
        if mean_score > best_score:
            best_score = mean_score
            best_algorithm = clf
    except Exception as e:
        print(f"Error with {name}: {e}")

# Check if a valid algorithm was selected
if best_algorithm is None:
    raise ValueError("No valid machine learning algorithm was selected.")

print(f"Best Algorithm: {best_algorithm.__class__.__name__}")

# Fit the best ML algorithm on the entire training set
best_algorithm.fit(X_resampled, y_resampled)

# Use the predictions of the best ML algorithm as additional input to the ANN
ml_train_predictions = best_algorithm.predict_proba(X_resampled)
ml_test_predictions = best_algorithm.predict_proba(X_test_processed)

# Concatenate the ML algorithm's predictions with the original input features
X_train_combined = np.hstack([X_resampled, ml_train_predictions])
X_test_combined = np.hstack([X_test_processed, ml_test_predictions])

# Define the model using the functional API and Optuna for hyperparameter tuning
def create_model(trial):
    # Suggest the number of layers and neurons
    n_layers = trial.suggest_int('n_layers', 1, 4)
    inputs = Input(shape=(X_train_combined.shape[1],))
    x = inputs

    for i in range(n_layers):
        num_neurons = trial.suggest_int(f'n_units_l{i}', 32, 512, step=32)
        x = Dense(num_neurons, activation='relu')(x)
        x = BatchNormalization()(x)  # Added batch normalization
        dropout_rate = trial.suggest_float(f'dropout_l{i}', 0.2, 0.5)
        x = Dropout(dropout_rate)(x)

    # Output layer
    outputs = Dense(len(np.unique(y_resampled)), activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)

    # Suggest optimizer and learning rate
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'rmsprop'])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2, log=True)
    
    if optimizer_name == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    else:
        optimizer = RMSprop(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the objective function for Optuna
def objective(trial):
    model = create_model(trial)
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    history = model.fit(X_train_combined, y_resampled, 
                        validation_split=0.2, 
                        epochs=100, 
                        batch_size=trial.suggest_int('batch_size', 16, 128, step=16),
                        callbacks=[TFKerasPruningCallback(trial, 'val_loss'), early_stopping],
                        verbose=0)
    
    # Evaluate on the validation set
    val_loss = history.history['val_loss'][-1]
    return val_loss

# Run the hyperparameter optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  # Increased number of trials

# Train the best model on the entire training data
best_trial = study.best_trial
model = create_model(best_trial)

# Final training with the best hyperparameters
model.fit(X_train_combined, y_resampled, 
          epochs=100,  # Fixed number of epochs
          batch_size=best_trial.params['batch_size'],
          verbose=1)

# Evaluate on the test data
y_pred_prob = model.predict(X_test_combined)
y_pred = np.argmax(y_pred_prob, axis=1)

# Metrics
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print("ROC AUC:", roc_auc_score(tf.keras.utils.to_categorical(y_test_encoded), y_pred_prob, multi_class='ovr'))
print(classification_report(y_test_encoded, y_pred))

# Save the model
model.save('best_model.h5')

# Save the best hyperparameters
with open('best_hyperparameters.txt', 'w') as f:
    f.write(str(best_trial.params))


RandomForest: ROC AUC = 0.9848
GradientBoosting: ROC AUC = 0.9479
LogisticRegression: ROC AUC = 0.8828
Best Algorithm: RandomForestClassifier


[I 2024-08-26 16:20:24,651] A new study created in memory with name: no-name-654d4cae-a9ad-4046-b1a8-b18412b95b46
[I 2024-08-26 16:20:36,234] Trial 0 finished with value: 3.352075594875714e-08 and parameters: {'n_layers': 1, 'n_units_l0': 160, 'dropout_l0': 0.404155752934665, 'optimizer': 'rmsprop', 'learning_rate': 0.0004081178139328964, 'batch_size': 48}. Best is trial 0 with value: 3.352075594875714e-08.
[I 2024-08-26 16:20:59,472] Trial 1 finished with value: 5.778598222150322e-08 and parameters: {'n_layers': 4, 'n_units_l0': 448, 'dropout_l0': 0.3607145334191786, 'n_units_l1': 448, 'dropout_l1': 0.45335102015555645, 'n_units_l2': 416, 'dropout_l2': 0.26126579835307473, 'n_units_l3': 352, 'dropout_l3': 0.4270749502374224, 'optimizer': 'rmsprop', 'learning_rate': 0.002135148403626089, 'batch_size': 16}. Best is trial 0 with value: 3.352075594875714e-08.
[I 2024-08-26 16:21:04,620] Trial 2 finished with value: 4.9190287398914734e-08 and parameters: {'n_layers': 2, 'n_units_l0': 352, 

Epoch 1/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9569 - loss: 0.1130
Epoch 2/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9986 - loss: 0.0046
Epoch 3/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0022
Epoch 4/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9992 - loss: 0.0051
Epoch 5/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0019
Epoch 6/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9988 - loss: 0.0040
Epoch 7/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0010    
Epoch 8/100
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0017
Epoch 9/100
[1m188/188[0m 



Accuracy: 0.6440677966101694
ROC AUC: 0.7002647223268179
              precision    recall  f1-score   support

           0       0.80      0.58      0.67       669
           1       0.51      0.76      0.61       393

    accuracy                           0.64      1062
   macro avg       0.66      0.67      0.64      1062
weighted avg       0.69      0.64      0.65      1062



In [12]:
import joblib

# Save the LabelEncoder
joblib.dump(label_encoder, 'label_encoder.pkl')
print("LabelEncoder saved as 'label_encoder.pkl'")

# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor.pkl')
print("Preprocessor saved as 'preprocessor.pkl'")

# Save the best ML model
joblib.dump(best_algorithm, 'best_ml_model.pkl')
print("Best ML model saved as 'best_ml_model.pkl'")

LabelEncoder saved as 'label_encoder.pkl'
Preprocessor saved as 'preprocessor.pkl'
Best ML model saved as 'best_ml_model.pkl'


In [5]:
# Generate predictions on the test set
y_pred_prob = model.predict(X_test_combined)
y_pred = np.argmax(y_pred_prob, axis=1)

# Inverse transform the encoded target variable to get the original labels
y_pred_actual = label_encoder.inverse_transform(y_pred)
y_test_actual = label_encoder.inverse_transform(y_test_encoded)

# Retrieve the original categorical values from the test set
X_test_original = X_test.copy()

# Create a DataFrame with the predictions and actual values
predictions_df = X_test_original.copy()
predictions_df[target_col] = y_test_actual  # Actual target values from the test set
predictions_df['Predicted_' + target_col] = y_pred_actual  # Predicted target values

# Save the DataFrame to a CSV file
predictions_df.to_csv('test_set_predictions.csv', index=False)

print("Predictions saved to 'test_set_predictions.csv'")


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Predictions saved to 'test_set_predictions.csv'


In [19]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf

# Load the trained model, preprocessor, and label encoder
model = tf.keras.models.load_model('best_model.h5')
preprocessor = joblib.load('preprocessor.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Load the new dataset (assuming it has the same structure as the test set)
new_data = pd.read_csv('Testing535.csv')

# Drop any unnecessary columns (e.g., 'Mask No' if present)
#new_data = new_data.drop('Mask No', axis=1)

# Define the columns used in the model
numerical_cols = ['LEASE_TENOR_INCLUDING_HP', 'CUSTOMER AGE', 'Exp', 'YOM']
categorical_cols = ['PRODUCT_NAME', 'Sub_purpose_code_based_on_risk', 'CRIB_SCORE', 'TOTAL INCOME',
                    'Percentage_of_Total_Installments_to_Total_Current_Balance_slabs',
                    'Percentage_of_Total_Current_Balance_to_Total_Amount_Granted_Limit_slabs',
                    'Percentage_of_Total_Arrears_Amount_to_Total_Amount_Granted_Limit_slabs']
target_col = 'Cluster'

# Preprocess the new data
X_new_processed = preprocessor.transform(new_data[numerical_cols + categorical_cols])

# Load the best ML model
best_algorithm = joblib.load('best_ml_model.pkl')

# Generate predictions using the best ML model
ml_new_predictions = best_algorithm.predict_proba(X_new_processed)

# Concatenate the ML algorithm's predictions with the original input features
X_new_combined = np.hstack([X_new_processed, ml_new_predictions])

# Make predictions using the trained ANN model
y_pred_prob = model.predict(X_new_combined)
y_pred = np.argmax(y_pred_prob, axis=1)

# Inverse transform the encoded target variable to get the original labels
y_pred_actual = label_encoder.inverse_transform(y_pred)

# Add predictions to the new data
new_data['Predicted_' + target_col] = y_pred_actual

# Save the new dataset with predictions to a CSV file
new_data.to_csv('new_data_with_predictions.csv', index=False)

print("Bulk predictions saved to 'new_data_with_predictions.csv'")




[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


PermissionError: [Errno 13] Permission denied: 'new_data_with_predictions.csv'

In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Assuming you have y_test_encoded (actual labels) and y_pred (predicted labels)

# Generate the confusion matrix
cm = confusion_matrix(y_test_encoded, y_pred)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Optionally, you can use ConfusionMatrixDisplay for a more customized plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(cmap='Blues')
plt.show()


ValueError: Found input variables with inconsistent numbers of samples: [1062, 534]