# Dataset Preprocessing

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook point below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install category_encoders

In [None]:
# Import the packages and set the global plotting defaults for the notebook.

import pandas as pd
import numpy as np
import itertools

import matplotlib.pyplot as plt
# Base matplotlib style sheet; the seaborn theme below is applied after it.
plt.style.use('fivethirtyeight')
import matplotlib.gridspec as gridspec
import seaborn as sns
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
from pylab import rcParams
# Default figure size (width, height) for every figure that does not set its own.
rcParams['figure.figsize'] = 12, 8
import os

# Any results you write to the current directory are saved as output.

In [None]:
# Locations of the training and test CSV files on the mounted Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Datasets/Project/train - train.csv.csv'
TEST_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Datasets/Project/test - test.csv.csv'

# Read both files into DataFrames.
df = pd.read_csv(TRAIN_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)

# Report the size of the training frame.
n_rows, n_cols = df.shape
print("The dataset has %d rows and %d columns." % (n_rows, n_cols))

In [None]:
# Summary statistics of the training and test frames.
# A cell renders only its last expression, so each summary is shown explicitly
# with display() (provided by the IPython kernel); otherwise the training
# summary would be computed and silently thrown away.
display(df.describe())
display(df_test.describe())

In [None]:
# First ten rows of the training and test frames.
# display() (provided by the IPython kernel) is used for both because a cell
# renders only its last expression; without it the training preview is lost.
display(df.head(10))
display(df_test.head(10))

**Finding Missing Values**

In [None]:
# Rows that contain at least one missing value, for each frame.
# The two results get separate names and are both displayed explicitly; a bare
# expression in the middle of a cell is not rendered, and reusing one name
# would discard the training result.
missing_values_train = df[pd.isnull(df).any(axis=1)]
display(missing_values_train)

missing_values_test = df_test[pd.isnull(df_test).any(axis=1)]
display(missing_values_test)

# `missing_values` keeps its previous final value (the test-set rows).
missing_values = missing_values_test

### Removing unnecessary columns

In [None]:
# Columns removed from both the training and the test frame.
columns_to_drop = ["ID", "Customer_ID", "Month", "Name", "SSN"]

df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)


In [None]:
import pandas as pd

# Map every column name of df to the array of distinct values it contains.
unique_values_per_column = {column: df[column].unique() for column in df.columns}

# Print each column name and its distinct values, followed by a blank line.
for column, values in unique_values_per_column.items():
    print(f"Column: {column}", f"Unique Values: {values}", "", sep="\n")

In [None]:
# Training rows holding any of these exact values are removed, one column at a
# time. Only df is filtered here; df_test is left untouched.
excluded_value_by_column = {
    'Occupation': '_______',
    'Credit_Mix': '_',
    'Payment_of_Min_Amount': 'NM',
    'Payment_Behaviour': '!@9#%8',
}

for column_name, excluded_value in excluded_value_by_column.items():
    keep_row = df[column_name] != excluded_value
    df = df[keep_row]


### Category Encoding

In [None]:
# Integer coding of the target: "Poor" -> 0, "Standard" -> 1, anything else -> 2.
# NOTE: every value other than "Poor"/"Standard" (not only "Good") becomes 2, so
# running this cell a second time on the already-encoded column maps every row to 2.
score_codes = {"Poor": 0, "Standard": 1}
df["Credit_Score"] = df["Credit_Score"].apply(lambda label: score_codes.get(label, 2))

In [None]:
import category_encoders as ce

In [None]:
# Columns that are replaced by integer codes.
ordinal_columns = [
    "Occupation",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Num_of_Loan",
    "Type_of_Loan",
    "Num_of_Delayed_Payment",
    "Num_Credit_Inquiries",
    "Credit_Mix",
    "Credit_History_Age",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
]

# Learn the category -> integer mapping on the training frame and apply it.
encoder = ce.OrdinalEncoder(cols=ordinal_columns)
df = encoder.fit_transform(df)

In [None]:
# Preview and schema of the encoded training frame.
# df.head(10) is not the last expression of the cell, so it must be displayed
# explicitly (display() is provided by the IPython kernel); df.info() prints
# its report itself.
display(df.head(10))
df.info()

In [None]:
# Encode the test frame with the category -> integer mapping that `encoder`
# learned from the training frame. Fitting an independent encoder on df_test
# would give the same category different integer codes in train and test, so a
# model trained on df would misread every encoded test feature.
#
# NOTE(review): `encoder` was fitted on a frame that still contains the
# Credit_Score column, which df_test presumably lacks, so encoder.transform()
# is not applied to df_test directly; the learned mapping is handed to a second
# encoder instead.
# Categories that never occurred in the training frame are encoded as -1
# (handle_unknown='value').
test_encoder = ce.OrdinalEncoder(
    cols=encoder.cols,
    mapping=encoder.mapping,
    handle_unknown='value',
)

df_test = test_encoder.fit_transform(df_test)

In [None]:
# Print the column dtypes and non-null counts of the encoded test frame.
df_test.info()

In [None]:
# Check missing values in variables: number of nulls per column in each frame.
# Both results are displayed explicitly (display() is provided by the IPython
# kernel) because a bare expression in the middle of a cell is not rendered.
display(df.isnull().sum())
display(df_test.isnull().sum())

In [None]:
# Treat a cell that is exactly '_' as missing, then force every column to a
# numeric dtype; anything that cannot be parsed as a number becomes NaN.
df = df.replace('_', np.nan).apply(pd.to_numeric, errors='coerce')
df_test = df_test.replace('_', np.nan).apply(pd.to_numeric, errors='coerce')

# Impute BOTH frames with the column means of the training frame. Statistics
# used for preprocessing must come from the training data only, so that the
# test frame goes through exactly the transformation the model was trained on.
# Entries of train_means for columns that df_test does not have are ignored.
train_means = df.mean()
df = df.fillna(train_means)
df_test = df_test.fillna(train_means)

### Removing Outliers

In [None]:
import pandas as pd

# Replace outliers in the feature columns with the column median (IQR rule).
# The target column is excluded: it holds class labels, and treating a rare
# class as an "outlier" would silently overwrite labels with the median class.
TARGET_COLUMN = "Credit_Score"

# A value is an outlier when it lies more than iqr_factor * IQR outside [Q1, Q3].
iqr_factor = 1.5

feature_columns = [column for column in df.columns if column != TARGET_COLUMN]
features = df[feature_columns]

# First and third quartile, and the inter-quartile range, per feature column.
Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1

# Boolean mask of outliers; columns outside feature_columns are never flagged.
outliers_mask = (features < (Q1 - iqr_factor * IQR)) | (features > (Q3 + iqr_factor * IQR))
outliers_mask = outliers_mask.reindex(columns=df.columns, fill_value=False)

# Replace flagged cells with the median of their column.
df = df.where(~outliers_mask, df.median(axis=0), axis=1)

# Preview the result instead of printing the whole frame.
df.head(10)

# Model Evaluation

### Determining Feature Importance with Random Forest Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Target is the "Credit_Score" column; every other column of df is a feature.
y = df["Credit_Score"]
X = df.drop(columns="Credit_Score")

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with 100 trees, trained on the scaled training rows.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Accuracy on the held-out rows.
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# One importance value per feature, ordered from most to least important.
feature_importances = rf_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Bar chart of the sorted importances.
plt.figure(figsize=(10, 6))
plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.title('Random Forest Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## ROC Curves and AUC

In [None]:
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score,
                             classification_report, f1_score, average_precision_score, precision_recall_fscore_support)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Features are every column of df except the "Credit_Score" target.
X = df.drop(columns="Credit_Score")
y = df["Credit_Score"]

# 80/20 train/test split with a fixed seed.
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Fit each classifier and keep its class-probability estimates for the test rows.
modelDT = DecisionTreeClassifier(random_state=0)
modelDT.fit(train_X_scaled, train_Y)
y_score_dt = modelDT.predict_proba(test_X_scaled)

modelNN = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=0)
modelNN.fit(train_X_scaled, train_Y)
y_score_nn = modelNN.predict_proba(test_X_scaled)

modelRF = RandomForestClassifier(n_estimators=100, random_state=0)
modelRF.fit(train_X_scaled, train_Y)
y_score_rf = modelRF.predict_proba(test_X_scaled)

# Number of classes: the column count for a 2-D target, otherwise the number of
# distinct label values.
if len(test_Y.shape) > 1:
    num_classes = test_Y.shape[1]
else:
    num_classes = len(np.unique(test_Y))

# Plain NumPy view of the test labels.
test_Y_array = test_Y.values if isinstance(test_Y, pd.DataFrame) else np.asarray(test_Y)

# (display name, probability matrix) pairs; later cells iterate over this list.
models = [
    ('Decision Tree', y_score_dt),
    ('Neural Network', y_score_nn),
    ('Random Forest', y_score_rf)
]

plt.figure(figsize=(10, 8))

# One one-vs-rest ROC curve per (model, class) pair, all on the same axes.
# Column i of a probability matrix is scored against the label value i.
for model_name, y_score_model in models:
    fpr, tpr, roc_auc = {}, {}, {}

    for i in range(num_classes):
        if len(test_Y_array.shape) > 1:
            true_labels = test_Y_array[:, i]
        else:
            true_labels = (test_Y_array == i).astype(int)
        y_score_i = y_score_model[:, i]

        fpr[i], tpr[i], _ = roc_curve(true_labels, y_score_i)
        roc_auc[i] = auc(fpr[i], tpr[i])

        plt.plot(fpr[i], tpr[i], label=f'{model_name} - Class {i} (AUC = {roc_auc[i]:.2f})')

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.title('ROC Curves for Multiclass Classification')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


**Precision-Recall Curve**

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

plt.figure(figsize=(10, 8))

# One one-vs-rest precision-recall curve per (model, class) pair.
for model_name, y_score_model in models:
    precision, recall, average_precision = {}, {}, {}

    for i in range(num_classes):
        # Binary indicator of class i: column i of a 2-D target, otherwise
        # equality of the label with i.
        if len(test_Y_array.shape) > 1:
            true_labels = test_Y_array[:, i]
        else:
            true_labels = (test_Y_array == i).astype(int)
        y_score_i = y_score_model[:, i]

        precision[i], recall[i], _ = precision_recall_curve(true_labels, y_score_i)
        average_precision[i] = average_precision_score(true_labels, y_score_i)

        plt.plot(recall[i], precision[i], label=f'{model_name} - Class {i} (Avg Precision = {average_precision[i]:.2f})')

plt.title('Precision-Recall Curves for Multiclass Classification')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(fontsize=8)
plt.show()

**Classification Report**

In [None]:
from sklearn.metrics import classification_report

from sklearn.metrics import precision_score, recall_score, f1_score

# Per-class one-vs-rest metrics: class i is predicted when its estimated
# probability exceeds 0.5.
for model_name, y_score_model in models:
    for i in range(num_classes):
        true_labels = test_Y_array[:, i] if len(test_Y_array.shape) > 1 else (test_Y_array == i).astype(int)
        y_score_i = y_score_model[:, i]

        # Convert probabilities to class predictions (assuming threshold of 0.5, adjust as needed)
        y_pred_i = (y_score_i > 0.5).astype(int)

        # Print precision, recall, and F1-score for each class
        print(f'{model_name} - Class {i}:')
        print(f'Precision: {precision_score(true_labels, y_pred_i)}')
        print(f'Recall: {recall_score(true_labels, y_pred_i)}')
        print(f'F1-score: {f1_score(true_labels, y_pred_i)}\n')

# Multiclass predictions: for every model, the class with the highest estimated
# probability. y_pred_all has one row per test sample and one column per model,
# in the order of `models`.
y_pred_all = np.argmax(np.array([y_score_model for _, y_score_model in models]), axis=2).T
true_labels_all = np.argmax(test_Y_array, axis=1) if len(test_Y_array.shape) > 1 else test_Y_array

# Full classification report (per-class metrics plus macro and weighted
# averages) for each model. Like the loop above, this relies on probability
# column i corresponding to the label value i.
for model_index, (model_name, _scores) in enumerate(models):
    print(f'{model_name} - classification report (argmax predictions):')
    print(classification_report(true_labels_all, y_pred_all[:, model_index]))


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Per-model results: each list below receives one inner list per model,
# holding one value per class.
model_names = []
class_precision = []
class_recall = []
class_f1 = []

for model_name, y_score_model in models:
    model_names.append(model_name)

    class_precision_model = []
    class_recall_model = []
    class_f1_model = []

    for i in range(num_classes):
        # Binary indicator of class i and the model's probability for it.
        if len(test_Y_array.shape) > 1:
            true_labels = test_Y_array[:, i]
        else:
            true_labels = (test_Y_array == i).astype(int)
        y_score_i = y_score_model[:, i]

        # Class i is predicted when its probability exceeds 0.5.
        y_pred_i = (y_score_i > 0.5).astype(int)

        precision = precision_score(true_labels, y_pred_i)
        recall = recall_score(true_labels, y_pred_i)
        f1 = f1_score(true_labels, y_pred_i)

        class_precision_model.append(precision)
        class_recall_model.append(recall)
        class_f1_model.append(f1)

        print(f'{model_name} - Class {i}:')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-score: {f1}\n')

    class_precision.append(class_precision_model)
    class_recall.append(class_recall_model)
    class_f1.append(class_f1_model)

# Three stacked panels (precision, recall, F1), one line per model in each.
fig, axs = plt.subplots(3, 1, figsize=(10, 15), sharex=True)
x_labels = [f"Class {i}" for i in range(num_classes)]

panels = [
    ('Class-wise Precision', class_precision),
    ('Class-wise Recall', class_recall),
    ('Class-wise F1-score', class_f1),
]

for ax, (panel_title, values_per_model) in zip(axs, panels):
    for model_name, metric_values in zip(model_names, values_per_model):
        ax.plot(x_labels, metric_values, label=model_name)
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

# Predicting with Random Forest Classifier

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Features are every column of df except 'Credit_Score', which is the target.
all_features = df.drop(columns='Credit_Score')
all_labels = df['Credit_Score']

# Keep 20% of the labelled rows aside as a validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    all_features, all_labels, test_size=0.2, random_state=42
)

# Random forest with 100 trees, fitted on the unscaled training features.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Accuracy on the validation rows.
y_val_pred = rf_classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

# Predict the test frame; df_test is passed to the model as-is.
X_test = df_test
y_test_pred = rf_classifier.predict(X_test)

# y_test_pred holds one predicted class code per row of df_test.


In [None]:
# Show the array of predicted class codes for the test frame.
y_test_pred

In [None]:
# Work on a copy so that adding a prediction column leaves df_test unchanged.
predicted_df_test = df_test.copy()

In [None]:
# Attach the predicted class codes as a new "Credit_Score" column.
predicted_df_test["Credit_Score"] = y_test_pred

In [None]:
# Preview the first ten rows, including the "Credit_Score" column.
predicted_df_test.head(10)

In [None]:
# Decode the predicted class codes with the coding the target was trained on:
# 0 = Poor, 1 = Standard, 2 = Good.
# (Decoding with 1/2/else shifted every label: Poor came out as "Good",
# Standard as "Poor" and Good as "Standard".)
# The labels are derived from y_test_pred rather than from the column itself,
# so re-running this cell gives the same result.
credit_score_labels = {0: "Poor", 1: "Standard", 2: "Good"}
predicted_df_test["Credit_Score"] = pd.Series(
    y_test_pred, index=predicted_df_test.index
).map(credit_score_labels)

In [None]:
# Preview the first ten rows with "Credit_Score" shown as text labels.
predicted_df_test.head(10)