# DATA PREPARATION

In [218]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier


## Loading data and checking first 10 entries

In [219]:
df_test_csv = pd.read_csv("C:/Users/HP/OneDrive/Desktop/projectWorks/Credit_score/credit score classification/test.csv")
df_test_csv.head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,0x160a,CUS_0xd40,September,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,2022.0,Good,809.98,35.030402,22 Years and 9 Months,No,49.574949,236.64268203272132,Low_spent_Small_value_payments,186.26670208571767
1,0x160b,CUS_0xd40,October,Aaron Maashoh,24,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,33.053114,22 Years and 10 Months,No,49.574949,21.465380264657146,High_spent_Medium_value_payments,361.444003853782
2,0x160c,CUS_0xd40,November,Aaron Maashoh,24,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,33.811894,,No,49.574949,148.23393788500923,Low_spent_Medium_value_payments,264.67544623343
3,0x160d,CUS_0xd40,December,Aaron Maashoh,24_,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,32.430559,23 Years and 0 Months,No,49.574949,39.08251089460281,High_spent_Medium_value_payments,343.82687322383634
4,0x1616,CUS_0x21b1,September,Rick Rothackerj,28,004-07-5839,_______,34847.84,3037.986667,2,...,5.0,Good,605.03,25.926822,27 Years and 3 Months,No,18.816215,39.684018417945296,High_spent_Large_value_payments,485.2984336755923
5,0x1617,CUS_0x21b1,October,Rick Rothackerj,28,#F%$D@*&8,Teacher,34847.84,3037.986667,2,...,5.0,Good,605.03,30.1166,27 Years and 4 Months,No,18.816215,251.6273687501761,Low_spent_Large_value_payments,303.3550833433617
6,0x1618,CUS_0x21b1,November,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,5.0,_,605.03,30.996424,27 Years and 5 Months,No,18.816215,72.68014533363515,High_spent_Large_value_payments,452.30230675990265
7,0x1619,CUS_0x21b1,December,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,5.0,_,605.03,33.875167,27 Years and 6 Months,No,18.816215,153.53448761392983,!@9#%8,421.44796447960783
8,0x1622,CUS_0x2dbc,September,Langep,35,486-85-3974,Engineer,143162.64,,1,...,3.0,Good,1303.01,35.229707,18 Years and 5 Months,No,246.992319,397.5036535440465,Low_spent_Medium_value_payments,854.2260270022115
9,0x1623,CUS_0x2dbc,October,Langep,35,486-85-3974,Engineer,143162.64,12187.22,1,...,3.0,Good,1303.01,35.685836,18 Years and 6 Months,No,246.992319,453.6151305781054,Low_spent_Large_value_payments,788.1145499681528


In [220]:
df_test_csv.shape

(50000, 27)

### Findings from .head()
- unnecessary columns
- Columns have mixed types.
- outliers
- duplicate entries
- nan values


In [221]:
df_test_csv.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
dtype: object

In [222]:
# dropping unnecessary columns
df_test = df_test_csv.drop(columns=["ID", "Customer_ID", "Name", "SSN"])

## Checking the datatype

In [223]:
df_test.dtypes

Month                        object
Age                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
dtype: object

In [224]:
df_test['Monthly_Balance'].unique()[:10]  # Peek at unique values


array(['186.26670208571772', '361.44400385378196', '264.67544623342997',
       '343.82687322383634', '485.2984336755923', '303.3550833433617',
       '452.30230675990265', '421.44796447960783', '854.2260270022115',
       '788.1145499681528'], dtype=object)

### Findings from ,dtypes
- cols like 'Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment','Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance'
- are in objects
- should be in numeric value
    - month in object so convert to numeric position.
    -Credit_History_Age in object

In [225]:
# Converting months into numeric
df_test["Month"] = pd.to_datetime(df_test.Month, format='%B').dt.month

In [226]:
# Converting to numerics and NaN the errors
numeric_cols = [
    'Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance'
]

for col in numeric_cols:
    df_test[col] = pd.to_numeric(df_test[col], errors='coerce')


In [227]:
# Credit_History_Age has values like this >> 22 Years and 1 Months
def month_convertor(x):
    if pd.notnull(x): # check if the value is not null then only do these
        num1 = int(x.split(" ")[0]) # extract 22 (years)
        num2 = int(x.split(" ")[3]) # extract 1 (months)
        return (num1 * 12) + num2   # final convert everything in months.
    else:
        return x # if the value is null return null

In [228]:
df_test["Credit_History_Age"] = df_test["Credit_History_Age"].apply(month_convertor)

## Checking the Object dtypes

In [229]:
object_cols = df_test.select_dtypes(include='object').columns.tolist()
print(object_cols)

['Occupation', 'Type_of_Loan', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']


#### For Occupation

In [230]:
df_test['Occupation'].unique()

array(['Scientist', '_______', 'Teacher', 'Engineer', 'Entrepreneur',
       'Developer', 'Lawyer', 'Media_Manager', 'Doctor', 'Journalist',
       'Manager', 'Accountant', 'Musician', 'Mechanic', 'Writer',
       'Architect'], dtype=object)

In [231]:
df_test['Occupation'] = df_test['Occupation'].replace(['_______', '', ' '], np.nan)

In [232]:
# for Occupation, Label Encoding:
le = LabelEncoder()
df_test['Occupation_Encoded'] = le.fit_transform(df_test['Occupation'].astype(str))


In [233]:
# checking
df_test['Occupation'].unique()

array(['Scientist', nan, 'Teacher', 'Engineer', 'Entrepreneur',
       'Developer', 'Lawyer', 'Media_Manager', 'Doctor', 'Journalist',
       'Manager', 'Accountant', 'Musician', 'Mechanic', 'Writer',
       'Architect'], dtype=object)

#### For Type_of_loan

In [234]:
df_test["Type_of_Loan"].tolist()

['Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan',
 'Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan',
 'Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan',
 'Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan',
 'Credit-Builder Loan',
 'Credit-Builder Loan',
 'Credit-Builder Loan',
 'Credit-Builder Loan',
 'Auto Loan, Auto Loan, and Not Specified',
 'Auto Loan, Auto Loan, and Not Specified',
 'Auto Loan, Auto Loan, and Not Specified',
 'Auto Loan, Auto Loan, and Not Specified',
 'Not Specified',
 'Not Specified',
 'Not Specified',
 'Not Specified',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Credit-Builder Loan, and Mortgage Loan',
 'Credit-Builder Loan, and Mortgage Loan',
 'Credit-Builder Loan, and Mortgage Loan',
 'Credit-Builder Loan, and Mortgage Loan',
 'Not Specified, Auto Loan, and Student Loan',
 'Not Specified, Auto Loan, and Student Loan',
 'Not Specified, Auto Loan, and Student Loan',
 'Not S

In [235]:
def clean_loans(x):
    if pd.notna(x):
        x = x.replace(" and ", ",")  # Replace "and" with comma
        x = x.replace(", ", ",")     # Remove comma+space
        # Split, strip, and skip empty strings
        return [loan.strip() for loan in x.split(",") if loan.strip() != '']
    return []
    
df_test["Type_of_Loan"] = df_test["Type_of_Loan"].apply(clean_loans)



In [236]:
# multiple values in one cell so, use 
mlb = MultiLabelBinarizer()
loan_encoded = mlb.fit_transform(df_test["Type_of_Loan"])


In [237]:
# Add to dataframe
loan_df = pd.DataFrame(loan_encoded, columns=mlb.classes_, index=df_test.index)
df_test = pd.concat([df_test, loan_df], axis=1)

Converts Eg:
- Auto Loan, Personal Loan                    into: 1001
- Credit-Builder Loan, Home Equity Loan       into: 0110

In [238]:
df_test["Type_of_Loan"].tolist()

[['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan'],
 ['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan'],
 ['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan'],
 ['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan'],
 ['Credit-Builder Loan'],
 ['Credit-Builder Loan'],
 ['Credit-Builder Loan'],
 ['Credit-Builder Loan'],
 ['Auto Loan', 'Auto Loan', 'Not Specified'],
 ['Auto Loan', 'Auto Loan', 'Not Specified'],
 ['Auto Loan', 'Auto Loan', 'Not Specified'],
 ['Auto Loan', 'Auto Loan', 'Not Specified'],
 ['Not Specified'],
 ['Not Specified'],
 ['Not Specified'],
 ['Not Specified'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Credit-Builder Loan', 'Mortgage Loan'],
 ['Credit-Builder Loan', 'Mortgage Loan'],
 ['Credit-Builder Loan', 'Mortgage Loan'],
 ['Credit-Builder Loan', 'Mortgage Loan'],
 ['Not Specified', 'Auto Loan', 'Student Loan'],
 ['Not Specified', 'Auto Loan', 'Student Loan'],
 ['Not Specified',

In [239]:
df_test[mlb.classes_].head()

Unnamed: 0,Auto Loan,Credit-Builder Loan,Debt Consolidation Loan,Home Equity Loan,Mortgage Loan,Not Specified,Payday Loan,Personal Loan,Student Loan
0,1,1,0,1,0,0,0,1,0
1,1,1,0,1,0,0,0,1,0
2,1,1,0,1,0,0,0,1,0
3,1,1,0,1,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0


#### For Credit_Mix

In [240]:
df_test['Credit_Mix'].unique()

array(['Good', '_', 'Standard', 'Bad'], dtype=object)

In [241]:
df_test['Credit_Mix'] = df_test['Credit_Mix'].replace(['_', '', ' '], np.nan)

In [242]:
df_test['Credit_Mix'].fillna(df_test['Credit_Mix'].mode()[0], inplace=True)  # Most common category

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Credit_Mix'].fillna(df_test['Credit_Mix'].mode()[0], inplace=True)  # Most common category


In [243]:
df_test['Credit_Mix'].unique()

array(['Good', 'Standard', 'Bad'], dtype=object)

In [244]:
# for Credit_Mix, Label Encoding:
le = LabelEncoder()
df_test['Credit_Mix_Encoded'] = le.fit_transform(df_test['Credit_Mix'].astype(str))

#### For Payment_of_Min_Amount

In [245]:
print(df_test['Payment_of_Min_Amount'].unique())

['No' 'Yes' 'NM']


In [246]:
mapping = {'No': 0, 'NM': 1, 'Yes': 2}
df_test['Payment_of_Min_Amount_Encoded'] = df_test['Payment_of_Min_Amount'].map(mapping)

#### For Payment_Behaviour

In [247]:
print(df_test['Payment_Behaviour'].unique())

['Low_spent_Small_value_payments' 'High_spent_Medium_value_payments'
 'Low_spent_Medium_value_payments' 'High_spent_Large_value_payments'
 'Low_spent_Large_value_payments' '!@9#%8'
 'High_spent_Small_value_payments']


In [248]:
# Replace junk with NaN
df_test['Payment_Behaviour'] = df_test['Payment_Behaviour'].replace('!@9#%8', np.nan)

In [249]:
# Now encode
le = LabelEncoder()

# Fill NaN temporarily to avoid errors in encoding
df_test['Payment_Behaviour'] = df_test['Payment_Behaviour'].fillna('Unknown')

df_test['Payment_Behaviour_Encoded'] = le.fit_transform(df_test['Payment_Behaviour'])

# If you want to see the mapping
print(dict(zip(le.classes_, le.transform(le.classes_))))

{'High_spent_Large_value_payments': np.int64(0), 'High_spent_Medium_value_payments': np.int64(1), 'High_spent_Small_value_payments': np.int64(2), 'Low_spent_Large_value_payments': np.int64(3), 'Low_spent_Medium_value_payments': np.int64(4), 'Low_spent_Small_value_payments': np.int64(5), 'Unknown': np.int64(6)}


# Checking missing values

In [250]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Month                          50000 non-null  int32  
 1   Age                            47523 non-null  float64
 2   Occupation                     46562 non-null  object 
 3   Annual_Income                  46480 non-null  float64
 4   Monthly_Inhand_Salary          42502 non-null  float64
 5   Num_Bank_Accounts              50000 non-null  int64  
 6   Num_Credit_Card                50000 non-null  int64  
 7   Interest_Rate                  50000 non-null  int64  
 8   Num_of_Loan                    47564 non-null  float64
 9   Type_of_Loan                   50000 non-null  object 
 10  Delay_from_due_date            50000 non-null  int64  
 11  Num_of_Delayed_Payment         45075 non-null  float64
 12  Changed_Credit_Limit           48941 non-null 

### Findings from .info()
- Total entries: 100000
- Some columns like Name(90015/100000), Type_of_loan(88592/100000), etc have null

In [251]:
df_test.shape

(50000, 36)

## Checking Duplicates

In [252]:
# drop Type_of_loan as duplicated cannot check list
df_test_no_lists = df_test.drop(columns=['Type_of_Loan'])

In [253]:
print(df_test_no_lists.duplicated().sum())

0


### Findings of .duplicated().sum()
- np.int64(0)>0   (false)
- so no duplicated value traced

## Checking null values' columns
- all columns with true and false, `df.isna().sum()`
- gives values in True shells, `df.isna().sum()>0`
- index to find the column names-> gives in array
- list () gives in list

In [254]:
missing_cols=list(df_test.isna().sum()[df_test.isna().sum()>0].index)
missing_cols

['Age',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_of_Loan',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Outstanding_Debt',
 'Credit_History_Age',
 'Amount_invested_monthly',
 'Monthly_Balance']

### Taking a column for reference for the nan value say, Credit_History_Age 

In [255]:
df_test[df_test['Credit_History_Age'].isna()]

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Debt Consolidation Loan,Home Equity Loan,Mortgage Loan,Not Specified,Payday Loan,Personal Loan,Student Loan,Credit_Mix_Encoded,Payment_of_Min_Amount_Encoded,Payment_Behaviour_Encoded
2,11,24.0,Scientist,19114.12,1824.843333,3,4,3,4.0,"[Auto Loan, Credit-Builder Loan, Personal Loan...",...,0,1,0,0,0,1,0,1,0,4
11,12,35.0,Engineer,143162.64,12187.220000,1,5,8,3.0,"[Auto Loan, Auto Loan, Not Specified]",...,0,0,0,1,0,0,0,1,0,1
18,11,22.0,Developer,35547.71,2853.309167,7,5,5,-100.0,[],...,0,0,0,0,0,0,0,2,2,1
24,9,34.0,Lawyer,131313.40,10469.207759,0,1,8,2.0,"[Credit-Builder Loan, Mortgage Loan]",...,0,0,1,0,0,0,0,1,0,1
43,12,,Teacher,33751.27,2948.605833,5,5,20,3.0,"[Credit-Builder Loan, Personal Loan, Auto Loan]",...,0,0,0,0,0,1,0,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49959,12,55.0,Media_Manager,20059.98,1523.665000,8,5,12,4.0,"[Home Equity Loan, Payday Loan, Not Specified,...",...,0,1,0,1,1,1,0,1,1,5
49973,10,33.0,Media_Manager,59146.36,4908.863333,3,6,6,1.0,[Personal Loan],...,0,0,0,0,0,1,0,1,1,1
49979,12,21.0,Architect,38321.39,3106.647859,4,4,3,4.0,"[Student Loan, Debt Consolidation Loan, Studen...",...,1,0,1,0,0,0,1,2,0,2
49986,11,5826.0,Media_Manager,16680.35,1528.029167,1,1,5,4.0,"[Payday Loan, Student Loan, Mortgage Loan, Not...",...,0,0,1,1,1,0,1,1,0,5


In [256]:
df_test[df_test['Monthly_Balance'].isna()]

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Debt Consolidation Loan,Home Equity Loan,Mortgage Loan,Not Specified,Payday Loan,Personal Loan,Student Loan,Credit_Mix_Encoded,Payment_of_Min_Amount_Encoded,Payment_Behaviour_Encoded
135,12,44.0,Writer,58317.00,4664.750000,8,5,9,6.0,"[Mortgage Loan, Personal Loan, Home Equity Loa...",...,0,1,1,0,0,1,0,2,2,5
195,12,44.0,Lawyer,81842.28,,10,9,25,9.0,"[Student Loan, Auto Loan, Personal Loan, Auto ...",...,0,1,0,0,1,1,1,2,1,3
245,10,19.0,Lawyer,73737.36,5936.780000,7,3,18,4.0,"[Not Specified, Credit-Builder Loan, Payday Lo...",...,0,0,0,1,1,0,0,2,2,5
256,9,26.0,Scientist,60162.10,5197.508333,5,7,5,,"[Student Loan, Student Loan, Student Loan, Deb...",...,1,1,0,0,1,0,1,2,2,5
309,10,25.0,Teacher,129067.59,10795.632500,1,3,1,4.0,"[Student Loan, Auto Loan, Auto Loan, Payday Loan]",...,0,0,0,0,1,0,1,1,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49501,10,34.0,Writer,59650.23,4677.852500,5,7,15,6.0,"[Auto Loan, Personal Loan, Personal Loan, Not ...",...,1,0,1,1,0,1,0,2,2,5
49520,9,41.0,Entrepreneur,105931.17,,8,7,11,,"[Mortgage Loan, Auto Loan, Mortgage Loan, Stud...",...,0,0,1,0,0,0,1,2,2,5
49591,12,37.0,Engineer,62756.20,5194.683333,10,10,17,8.0,"[Home Equity Loan, Auto Loan, Credit-Builder L...",...,0,1,0,0,1,1,1,0,2,5
49910,11,20.0,Scientist,65858.48,5421.206667,7,7,30,5.0,"[Mortgage Loan, Auto Loan, Payday Loan, Not Sp...",...,0,0,1,1,1,0,1,2,2,5


### Findings from checking null values
- There are null values
    - 'Age',
    - 'Annual_Income',
    - 'Monthly_Inhand_Salary',
    - 'Num_of_Loan',
    - 'Num_of_Delayed_Payment',
    - 'Changed_Credit_Limit',
    - 'Num_Credit_Inquiries',
    - 'Outstanding_Debt',
    - 'Credit_History_Age',
    - 'Amount_invested_monthly',
    - 'Monthly_Balance'

## Handling NaN values

In [257]:
numeric_missing_cols=['Age',
              'Annual_Income',
              'Monthly_Inhand_Salary', 
              'Num_of_Loan',
              'Num_of_Delayed_Payment',
              'Changed_Credit_Limit',
              'Num_Credit_Inquiries',
              'Outstanding_Debt',
              'Amount_invested_monthly',
              'Monthly_Balance',
              'Credit_History_Age'];

In [258]:
# NaN is replaced by median value, by 'where' which works like ternary operator.
for col in numeric_missing_cols:
    col_median = df_test[col].median()
    df_test[col] = np.where(df_test[col].isna(), col_median, df_test[col])

In [259]:
# checking
df_test[df_test['Credit_History_Age'].isna()]

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Debt Consolidation Loan,Home Equity Loan,Mortgage Loan,Not Specified,Payday Loan,Personal Loan,Student Loan,Credit_Mix_Encoded,Payment_of_Min_Amount_Encoded,Payment_Behaviour_Encoded


In [260]:
df_test[df_test['Monthly_Balance'].isna()]

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Debt Consolidation Loan,Home Equity Loan,Mortgage Loan,Not Specified,Payday Loan,Personal Loan,Student Loan,Credit_Mix_Encoded,Payment_of_Min_Amount_Encoded,Payment_Behaviour_Encoded


### Missing values Handled

# Handling Outliers

In [261]:
for col in numeric_missing_cols:
    q1 = df_test[col].quantile(0.25)
    q3 = df_test[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outlier_condition = (df_test[col] < lower_bound) | (df_test[col] > upper_bound)
    res_median = df_test[col].median()
    df_test.loc[outlier_condition, col] = res_median

In [262]:
for col in numeric_missing_cols:
    q1 = df_test[col].quantile(0.25)
    q3 = df_test[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    out_condition = (df_test[col] < lower_bound) | (df_test[col] > upper_bound)
    print(f"{col}: {out_condition.sum()} outliers remaining")

Age: 0 outliers remaining
Annual_Income: 1193 outliers remaining
Monthly_Inhand_Salary: 1718 outliers remaining
Num_of_Loan: 0 outliers remaining
Num_of_Delayed_Payment: 0 outliers remaining
Changed_Credit_Limit: 404 outliers remaining
Num_Credit_Inquiries: 0 outliers remaining
Outstanding_Debt: 2557 outliers remaining
Amount_invested_monthly: 3292 outliers remaining
Monthly_Balance: 3383 outliers remaining
Credit_History_Age: 0 outliers remaining


In [263]:
df_test.columns

Index(['Month', 'Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
       'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Occupation_Encoded', 'Auto Loan', 'Credit-Builder Loan',
       'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan',
       'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan',
       'Credit_Mix_Encoded', 'Payment_of_Min_Amount_Encoded',
       'Payment_Behaviour_Encoded'],
      dtype='object')

In [264]:
#Droping original columns of onject and replacing with its encoded version
df_testPred = df_test.drop(['Occupation', 'Credit_Mix', 'Type_of_Loan',
                               'Payment_of_Min_Amount', 'Payment_Behaviour'], axis=1)

# TARGET COLUMN PREDICTION

In [265]:
# predicting with Random Forest Classifier as it was the best model
import joblib

# Load the trained model
rf_clf = joblib.load('random_forest_model.pkl')

# Now use it to predict
y_pred_test = rf_clf.predict(df_testPred)
y_pred_test


array([1, 1, 2, ..., 0, 2, 0], shape=(50000,))

In [267]:
output = pd.DataFrame({'Customer_ID': df_test_csv['Customer_ID'], 'Predicted_Credit_Score': y_pred_test})
output.to_csv('credit_score_predictions.csv', index=False)