# Financial Client Classifier


## Reading the Dataset

**Reading the Dataset**

In [41]:
import pandas as pd
train_data = pd.read_csv('data/train_set.csv', index_col=0)
test_data = pd.read_csv('data/test_set.csv', index_col=0)

## Data preprocessing

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def encode_columns(dataframe, columns):
    """
    Encodes the specified columns of a DataFrame using Label Encoding.
    Returns the transformed DataFrame and the fitted LabelEncoders for each column.
    """
    encoders = {}
    for col in columns:
        if col in dataframe:
            encoder = LabelEncoder()
            dataframe[col] = encoder.fit_transform(dataframe[col])
            encoders[col] = encoder
        else:
            raise KeyError(f"Column '{col}' not found in the DataFrame.")
    return dataframe, encoders

# Columns to encode
columns_to_encode = ['Payment_Behaviour', 'Credit_Mix', 'Type_of_Loan', 'Occupation', 'Payment_of_Min_Amount', 'Credit_Score']

# Apply encoding to train_data and store encoders
train_data, encoders = encode_columns(train_data, columns_to_encode)

# Use the same encoders for test_data
for col, encoder in encoders.items():
    if col in test_data:
        test_data[col] = encoder.transform(test_data[col])
        

# Columns to train
columns_to_train = ['Age',
                    'Num_Bank_Accounts',
                    'Num_Credit_Card',
                    'Num_of_Loan',
                    'Delay_from_due_date',
                    'Num_of_Delayed_Payment',
                    'Credit_History_Age',
                    'Credit_Utilization_Ratio',
                    'Annual_Income',
                    'Monthly_Inhand_Salary',
                    'Payment_Behaviour',
                    'Type_of_Loan',
                    'Num_Credit_Inquiries',
                    'Changed_Credit_Limit',
                    'Outstanding_Debt',
                    'Total_EMI_per_month',
                    'Amount_invested_monthly',
                    'Monthly_Balance',
                    'Occupation',
                    'Payment_of_Min_Amount',
                    'Credit_Mix']

# Extracting features and label
X_train = train_data[columns_to_train]
y_train = train_data['Credit_Score']

# Splitting data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Extracting test features
X_test = test_data[columns_to_train]

## Training and valuating

Our strategy is to first try out many models to find the top 3 algorithm, and then fine-tune these 3 estimators to find out the best estimator in each algorithm, and finally stack all the best estimator to form our stacking model.

## model selection

In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Define the models
models = {
    'KNN': KNeighborsClassifier(),
    'SVC': SVC(),
    'MLP': MLPClassifier(),
    'LG': LogisticRegression(),
    'RF': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'Naive_Bayes': GaussianNB()
}

# Dictionary to store test scores
test_scores = {}

# Loop through the models, train and get test score
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    predictions = model.predict(X_val)
    
    # Calculate the accuracy
    accuracy = accuracy_score(y_val, predictions)
    test_scores[model_name] = accuracy

# Output the test scores
print("Test Scores:")
for model_name, score in test_scores.items():
    print(f"{model_name}: {score}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Scores:
KNN: 0.7505
SVC: 0.5257142857142857
MLP: 0.5685714285714286
LG: 0.5312142857142857
RF: 0.8048571428571428
XGBoost: 0.7657142857142857
Naive_Bayes: 0.5875714285714285


from the result, RF, XGBoost and KNN are the top 3 algorithm on accuracy. 

## fine-tuning 

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Define the parameter grid for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10]
}

# Create the GridSearchCV objects
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5, n_jobs=-1, verbose=2)
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_xgb = GridSearchCV(XGBClassifier(), param_grid_xgb, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data for KNN
grid_search_knn.fit(X_train, y_train)

# Fit the grid search to the data for Random Forest
grid_search_rf.fit(X_train, y_train)

# Fit the grid search to the data for XGBoost
grid_search_xgb.fit(X_train, y_train)

# Print the best parameters and scores for each model
print("Best parameters for KNN:", grid_search_knn.best_params_)
print("Best score for KNN:", grid_search_knn.best_score_)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best score for Random Forest:", grid_search_rf.best_score_)

print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best score for XGBoost:", grid_search_xgb.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits




Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Best score for KNN: 0.7742142857142857
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best score for Random Forest: 0.7990357142857143
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300}
Best score for XGBoost: 0.7893035714285715


In [45]:
import joblib
joblib.dump(grid_search_knn, "models/grid_search_knn.pkl")
joblib.dump(grid_search_rf, "models/grid_search_rf.pkl")
joblib.dump(grid_search_xgb, "models/grid_search_xgb.pkl")

['models/grid_search_xgb.pkl']

In [46]:
import joblib 
grid_search_knn = joblib.load('models/grid_search_knn.pkl')
grid_search_rf = joblib.load('models/grid_search_rf.pkl')
grid_search_xgb = joblib.load('models/grid_search_xgb.pkl')


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid_lg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Create the GridSearchCV object for Logistic Regression
grid_search_lg = GridSearchCV(LogisticRegression(), param_grid_lg, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data for Logistic Regression
grid_search_lg.fit(X_train, y_train)

# Print the best parameters for Logistic Regression
print("Best parameters for Logistic Regression:", grid_search_lg.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}




### stacking

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier


# Using the best models from previous grid searches
best_knn = grid_search_knn.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_xgb = grid_search_xgb.best_estimator_

# Using the best LG model from the new grid search as the final estimator
best_lg = grid_search_lg.best_estimator_

# Define the stacking ensemble
stacking_ensemble = StackingClassifier(
    estimators=[('KNN', best_knn), ('RF', best_rf), ('XGBoost', best_xgb)],
    final_estimator=best_lg,
    cv=5
)

# Fit the stacking ensemble
stacking_ensemble.fit(X_train, y_train)

# valuate the model
stacking_score = stacking_ensemble.score(X_val, y_val)
print("Stacking Ensemble Test Score:", stacking_score)

Stacking Ensemble Test Score: 0.8042142857142857


In [49]:
from sklearn.metrics import classification_report

y_pred = stacking_ensemble.predict(X_val)
classification_rep = classification_report(y_val, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      2546
           1       0.80      0.82      0.81      4112
           2       0.81      0.81      0.81      7342

    accuracy                           0.80     14000
   macro avg       0.80      0.80      0.80     14000
weighted avg       0.80      0.80      0.80     14000



In [50]:
joblib.dump(stacking_ensemble, 'models/stacking_ensemble.pkl')

['models/stacking_ensemble.pkl']

## Generating the Submission

Below you will find a function that does the predictions for the test set and generates a submission, this submission is to be uploaded to kaggle to update the leaderboard

In [52]:
def generate_submission():
    list_of_predictions = stacking_ensemble.predict(X_test)
    preds = encoders["Credit_Score"].inverse_transform(list_of_predictions)
    df = pd.DataFrame({'Credit_Score': preds}, index=X_test.index)
    df.to_csv('sandbox_submission3.csv')

In [53]:
generate_submission()