In [None]:
# import tensorflow as tf
# tf.keras.backend.clear_session()

# Final Project

`Use Case` : The credit score classification model is designed to predict an individual's creditworthiness by analyzing a range of financial and credit-related features. By leveraging historical data on financial behavior, payment history, and other relevant factors, the model provides a credit score prediction that helps financial institutions, such as banks, credit unions, and lending agencies, assess the potential risk of lending. This predictive tool enhances decision-making, reduces default risks, and streamlines the credit approval process, enabling institutions to make data-driven decisions when extending credit to individuals.

Here’s a refined version of your statements with improved clarity and flow while maintaining the same format:

---

`Description of Features` :
1. **Annual Income**: The individual’s total income earned annually.
2. **Monthly Inhand Salary**: The individual’s take-home salary after all deductions on a monthly basis.
3. **Number of Bank Accounts**: The total number of bank accounts owned by the individual.
4. **Number of Credit Cards**: The total number of credit cards held by the individual.
5. **Interest Rate**: The interest rate applicable to any loans or credit cards held by the individual.
6. **Number of Loans**: The total number of loans taken out by the individual.
7. **Delay from Due Date**: The number of days by which the individual's payments have been delayed past the due date.
8. **Number of Delayed Payments**: The total number of times the individual has made payments after the due date.
9. **Changed Credit Limit**: Any changes in the individual’s credit limit over time, either increases or decreases.
10. **Number of Credit Inquiries**: The total number of times a lender has accessed the individual's credit report for review.
11. **Outstanding Debt**: The total amount of unpaid debt owed by the individual.
12. **Credit Utilization Ratio**: The ratio of credit currently being used by the individual compared to their total available credit.
13. **Credit History Age in Months**: The duration of the individual’s credit history, measured in months.
14. **Total EMI per Month**: The total amount paid by the individual each month as Equated Monthly Installments (EMIs) for loans or credit cards.
15. **Amount Invested Monthly**: The total amount of money the individual invests monthly in various financial products.
16. **Monthly Balance**: The individual’s remaining balance in bank accounts at the end of each month.
17. **Payment of Minimum Amount (Encoded)**: Encoded data indicating whether the individual consistently pays the minimum required amount on their credit bills.
18. **Payment Behaviour (Encoded)**: Encoded data representing the individual's overall payment behavior, such as timeliness and consistency.
19. **Credit Mix (Encoded)**: Encoded data showing the variety of credit products (e.g., loans, credit cards) used by the individual.

---
`Prediction Errors` :
1. **False Negative**: A false negative occurs when the model incorrectly classifies an individual with a 'Good' credit score as having a 'Bad' credit score. This results in the denial of credit to individuals who are actually creditworthy, causing the lending institution to miss out on potential profitable customers and business opportunities.
   
2. **False Positive**: A false positive happens when the model incorrectly classifies an individual with a 'Bad' credit score as having a 'Good' credit score. This leads to credit approval for high-risk individuals, increasing the possibility of default and financial losses for the lending institution.

Both types of errors carry significant risks for lending institutions. Striking a balance between false positives and false negatives is crucial to making sound lending decisions that optimize business growth while minimizing financial risks.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# from sklearn.model_selection import ParameterGrid

# Ignore convergence warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_csv('./preprocessed_data.csv')
df.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Occupation_Writer,Payment_Behaviour_High spent Large value payments,Payment_Behaviour_High spent Medium value payments,Payment_Behaviour_High spent Small value payments,Payment_Behaviour_Low spent Large value payments,Payment_Behaviour_Low spent Medium value payments,Payment_Behaviour_Low spent Small value payments,Credit_Mix_Bad,Credit_Mix_Good,Credit_Mix_Standard
0,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,11.27,...,0,0,0,0,0,0,1,0,1,0
1,24.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,9.0,13.27,...,0,0,1,0,0,0,0,0,1,0
2,24.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,23.6,4.0,12.27,...,0,0,0,0,0,1,0,0,1,0
3,36.4,19114.12,1456.4625,3.0,4.0,3.0,4.0,4.0,5.0,11.27,...,0,0,1,0,0,0,0,0,1,0
4,28.0,34847.84,3037.986667,2.0,4.0,6.0,1.0,3.0,3.0,5.42,...,0,0,0,0,1,0,0,0,1,0


In [3]:
numerical_columns = ['Age','Annual_Income', 
                     'Monthly_Inhand_Salary',         
                     'Num_Bank_Accounts',           
                     'Num_Credit_Card',          
                     'Interest_Rate',               
                     'Num_of_Loan',                 
                     'Delay_from_due_date',        
                     'Num_of_Delayed_Payment',     
                     'Changed_Credit_Limit',        
                     'Num_Credit_Inquiries',        
                     'Outstanding_Debt',           
                     'Credit_Utilization_Ratio',    
                     'Credit_History_Age_in_months',
                     'Payment_of_Min_Amount',       
                     'Total_EMI_per_month',         
                     'Amount_invested_monthly',            
                     'Monthly_Balance'] 

In [4]:
# Call the StandardScaler
scaler = StandardScaler()

# Apply scaling on numerical columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df_all = pd.DataFrame(scaler.fit_transform(df))

In [5]:
df.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Occupation_Writer,Payment_Behaviour_High spent Large value payments,Payment_Behaviour_High spent Medium value payments,Payment_Behaviour_High spent Small value payments,Payment_Behaviour_Low spent Large value payments,Payment_Behaviour_Low spent Medium value payments,Payment_Behaviour_Low spent Small value payments,Credit_Mix_Bad,Credit_Mix_Good,Credit_Mix_Standard
0,-0.101318,-0.110979,-0.746198,-0.923114,-0.750401,-1.416221,0.192117,-1.231201,-0.111866,0.133165,...,0,0,0,0,0,0,1,0,1,0
1,-0.082386,-0.110979,-0.746198,-0.923114,-0.750401,-1.416221,0.192117,-1.231201,-0.10242,0.429791,...,0,0,1,0,0,0,0,0,1,0
2,-0.082386,-0.110979,-0.746198,-0.923114,-0.750401,-1.416221,0.192117,0.166539,-0.126034,0.281478,...,0,0,0,0,0,1,0,0,1,0
3,0.152372,-0.110979,-0.865595,-0.923114,-0.750401,-1.416221,0.192117,-1.16335,-0.121311,0.133165,...,0,0,1,0,0,0,0,0,1,0
4,-0.006657,-0.099136,-0.353001,-1.30992,-0.750401,-1.021405,-1.046721,-1.231201,-0.130757,-0.734467,...,0,0,0,0,1,0,0,0,1,0


In [6]:
df_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,-0.101318,-0.110979,-0.746198,-0.923114,-0.750401,-1.416221,0.192117,-1.231201,-0.111866,0.133165,...,-0.269779,-0.415802,-0.488821,-0.373808,-0.358725,-0.415238,1.619409,-0.562513,1.514688,-0.915697
1,-0.082386,-0.110979,-0.746198,-0.923114,-0.750401,-1.416221,0.192117,-1.231201,-0.10242,0.429791,...,-0.269779,-0.415802,2.045739,-0.373808,-0.358725,-0.415238,-0.617509,-0.562513,1.514688,-0.915697
2,-0.082386,-0.110979,-0.746198,-0.923114,-0.750401,-1.416221,0.192117,0.166539,-0.126034,0.281478,...,-0.269779,-0.415802,-0.488821,-0.373808,-0.358725,2.408258,-0.617509,-0.562513,1.514688,-0.915697
3,0.152372,-0.110979,-0.865595,-0.923114,-0.750401,-1.416221,0.192117,-1.16335,-0.121311,0.133165,...,-0.269779,-0.415802,2.045739,-0.373808,-0.358725,-0.415238,-0.617509,-0.562513,1.514688,-0.915697
4,-0.006657,-0.099136,-0.353001,-1.30992,-0.750401,-1.021405,-1.046721,-1.231201,-0.130757,-0.734467,...,-0.269779,-0.415802,-0.488821,-0.373808,2.787653,-0.415238,-0.617509,-0.562513,1.514688,-0.915697


In [7]:
# Define the categorical columns made from Credit_Mix column
credit_mix_columns = ['Credit_Mix_Bad', 'Credit_Mix_Good', 'Credit_Mix_Standard']


# This separates the features (input data) from the dataframe
features = df.drop(credit_mix_columns, axis=1)

# The target (output) labels consist of the three Credit Mix categories
labels = df[credit_mix_columns]

In [8]:
# # Define the mapping for one-hot encoded labels back to the original values
# label_mapping = {
#     'Credit_Mix_Good': 'Good',
#     'Credit_Mix_Standard': 'Standard',
#     'Credit_Mix_Bad': 'Bad'
# }

# # Convert the encoded labels back to single labels
# labels = df[credit_mix_columns].idxmax(axis=1).map(label_mapping)

# labels.head()

In [9]:
# Split the original dataset into training (80%) and temporary (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(features, labels,
                                                    test_size=0.2, random_state=42)

# Then split the tempporary (20%) set into testing (10%) and validation (10%) sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp,
                                                test_size=0.5, random_state=42)

# Create a dictionary with dataset names and their shapes
data_shapes = {
    'Dataset': ['X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test'],
    'Shape': [X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape]
}

# Create a DataFrame to display the shapes
df_shapes = pd.DataFrame(data_shapes)

# Display the DataFrame
df_shapes

Unnamed: 0,Dataset,Shape
0,X_train,"(25800, 38)"
1,X_val,"(3226, 38)"
2,X_test,"(3225, 38)"
3,y_train,"(25800, 3)"
4,y_val,"(3226, 3)"
5,y_test,"(3225, 3)"


In [10]:
y_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3226 entries, 17873 to 23341
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Credit_Mix_Bad       3226 non-null   int64
 1   Credit_Mix_Good      3226 non-null   int64
 2   Credit_Mix_Standard  3226 non-null   int64
dtypes: int64(3)
memory usage: 100.8 KB


In [11]:
y_val.head()

Unnamed: 0,Credit_Mix_Bad,Credit_Mix_Good,Credit_Mix_Standard
17873,0,0,1
512,1,0,0
4794,0,0,1
288,1,0,0
6888,1,0,0


In [12]:
# Initialize the Sequential model
model = Sequential()

# Add the first hidden layer with 200 neurons and ReLU activation function
# The input_shape parameter specifies the shape of the input data
model.add(Dense(500, activation='relu', input_shape=(X_train.shape[1],)))

# Add a Dropout layer to prevent overfitting
# This randomly sets 30% of the input units to 0 during training
model.add(Dropout(0.3))

# Add the second hidden layer with 100 neurons and ReLU activation function
model.add(Dense(250, activation='relu'))

# Add another Dropout layer to further prevent overfitting
model.add(Dropout(0.3))

# Add the output layer with the number of classes determined by the unique labels
# Softmax activation is used for multi-class classification
model.add(Dense(labels['Class'].nunique(), activation='softmax'))

# Initialize the Adam optimizer with a specified learning rate
optimizer = Adam(learning_rate=0.01)

# Compile the model specifying the optimizer, loss function, and evaluation metrics
# 'sparse_categorical_crossentropy' is used for multi-class classification
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print a summary of the model's architecture
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-10-11 20:30:25.776700: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-10-11 20:30:25.776767: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-10-11 20:30:25.776795: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-10-11 20:30:25.776823: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-11 20:30:25.776846: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


KeyError: 'Class'

In [None]:
# Generate predictions
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the class with the highest probability
# y_val_classes = np.argmax(y_val, axis=1)
# Compute the confusion matrix
cm = confusion_matrix(y_val, y_pred_classes)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.arange(labels['Class'].nunique()))
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()