Python File 1

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [3]:
# Importing the data
# Read the credit-score CSV (path is relative to the notebook's working directory) into `cs`,
# then preview the first five rows.
cs = pd.read_csv("credit_score_actual_dataset.csv")
cs.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [4]:
# Checking for null values
# Count missing cells per column, then add the per-column counts into one total for the whole frame.
cs.isna().sum().sum()

0

In [11]:
# Checking for duplicates
# Flag every row that repeats an earlier row across all columns, then keep only the flagged rows
duplicate_mask = cs.duplicated()
duplicate_rows = cs.loc[duplicate_mask]

# Display the duplicate rows
print("Duplicate rows based on all columns:")
print(duplicate_rows) # In short, no duplicates.

Duplicate rows based on all columns:
Empty DataFrame
Columns: [ID, Customer_ID, Month, Name, Age, SSN, Occupation, Annual_Income, Monthly_Inhand_Salary, Num_Bank_Accounts, Num_Credit_Card, Interest_Rate, Num_of_Loan, Type_of_Loan, Delay_from_due_date, Num_of_Delayed_Payment, Changed_Credit_Limit, Num_Credit_Inquiries, Credit_Mix, Outstanding_Debt, Credit_Utilization_Ratio, Credit_History_Age, Payment_of_Min_Amount, Total_EMI_per_month, Amount_invested_monthly, Payment_Behaviour, Monthly_Balance, Credit_Score]
Index: []

[0 rows x 28 columns]


In [4]:
# Dropping "ID", "Name", and "SSN" due to unimportance
dropped = ['ID', 'Name', 'SSN']
# Rebind `cs` instead of dropping in place, and tolerate columns that are already gone,
# so that re-running this cell does not raise a KeyError.
cs = cs.drop(columns=dropped, errors='ignore')


Encoding Features

In [5]:
# Inspecting the distinct values of the 'Occupation' column
occupation_column = cs['Occupation']

# How many distinct occupations there are
num_unique_occupations = occupation_column.nunique()
print("The number of unique values in occupation is: ", num_unique_occupations)

print("===============================")
# Which occupations they are
unique_occupations = occupation_column.unique()
print("The unique entries in the occupation column are: \n", unique_occupations)

The number of unique values in occupation is:  15
The unique entries in the occupation column are: 
 ['Scientist' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer' 'Lawyer'
 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant' 'Musician'
 'Mechanic' 'Writer' 'Architect']


In [6]:
# Giving each occupation a unique integer code.

# Fit a label encoder on the occupations and store the resulting codes
# in a new column named "Occupation_encoded".
label_encoder = LabelEncoder()
occupation_codes = label_encoder.fit_transform(cs['Occupation'])
cs['Occupation_encoded'] = occupation_codes

# Tabulate which code each occupation received: classes_ is listed in code order,
# so the position of a class in classes_ is its encoded value.
occupation_classes = label_encoder.classes_
occupation_mapping = pd.DataFrame({
    'Occupation': occupation_classes,
    'Encoded_Value': range(len(occupation_classes)),
})

print(occupation_mapping)

       Occupation  Encoded_Value
0      Accountant              0
1       Architect              1
2       Developer              2
3          Doctor              3
4        Engineer              4
5    Entrepreneur              5
6      Journalist              6
7          Lawyer              7
8         Manager              8
9        Mechanic              9
10  Media_Manager             10
11       Musician             11
12      Scientist             12
13        Teacher             13
14         Writer             14


In [7]:
# Dropping the "Occupation" column (its values are kept in "Occupation_encoded").
# Rebind `cs` instead of dropping in place, and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
cs = cs.drop(columns='Occupation', errors='ignore')

In [8]:
# Working on the "Type_of_Loan" column

# Split each entry at the commas, then create one binary column per loan. The entry in a column is 1 when the
# customer has taken the corresponding loan.

# Placeholder DataFrame for the loan columns; it is overwritten later, when the encoder is applied to "Type_of_Loan".
loan_df = pd.DataFrame()

# Creating a function to handle the values in the list after splitting
def clean_and_encode_loan(row):
    """Turn one "Type_of_Loan" entry into a Series of per-loan indicator flags.

    The entry is split at commas and every name is trimmed. A leading
    "and " / "and_" connector is removed so that, for example,
    "and_Mortgage_Loan" and "Mortgage_Loan" end up in the same column.
    Only a leading connector is removed; the same letters inside a loan
    name (e.g. "Land Loan") are left alone.

    Parameters
    ----------
    row : str
        Comma-separated loan names, e.g.
        "Auto Loan, Personal Loan, and Student Loan".
        A non-string value (such as NaN) is treated as "no loans listed".

    Returns
    -------
    pandas.Series
        Indexed by cleaned loan name, integer dtype. Real loans are flagged 1.
        The placeholder entries "No Data" and "Not Specified" are flagged 0,
        so their columns still exist for callers that drop them afterwards,
        while real loans listed next to a placeholder keep their flag.
    """
    placeholders = ("No Data", "Not Specified")
    connectors = ("and_", "and ")

    # Nothing to split: report no loans instead of failing on `.split`.
    if not isinstance(row, str):
        return pd.Series(dtype="int64")

    loan_info = {}
    for raw_name in row.split(','):
        name = raw_name.strip()

        # Remove the list connector only when it starts the name.
        for connector in connectors:
            if name.startswith(connector):
                name = name[len(connector):].strip()
                break

        # Skip empty tokens (e.g. from a trailing comma) so they do not create a "" column.
        if not name:
            continue

        loan_info[name] = 0 if name in placeholders else 1

    # An explicit dtype keeps the result well-defined even when no loan was found.
    return pd.Series(loan_info, dtype="int64")


In [9]:
# Apply the function to each "Type_of_Loan" entry in the original DataFrame.
# Because the function returns a Series, the results are assembled into a DataFrame with one
# column per loan name; a row gets NaN in the columns of loans it did not list.
loan_df = cs['Type_of_Loan'].apply(clean_and_encode_loan)

In [10]:
# Put the encoded loan columns next to the original columns, then leave out
# the raw 'Type_of_Loan' text column that they replace
final_df = pd.concat([cs, loan_df], axis=1).drop(columns='Type_of_Loan')

In [11]:
# Preview the first five rows of the combined frame, including the new loan columns
final_df.head()

Unnamed: 0,Customer_ID,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Auto Loan,Credit-Builder Loan,Personal Loan,Home Equity Loan,Not Specified,No Data,Mortgage Loan,Student Loan,Debt Consolidation Loan,Payday Loan
0,3392,1,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1.0,1.0,1.0,1.0,,,,,,
1,3392,2,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1.0,1.0,1.0,1.0,,,,,,
2,3392,3,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1.0,1.0,1.0,1.0,,,,,,
3,3392,4,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,5.0,...,1.0,1.0,1.0,1.0,,,,,,
4,3392,5,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,...,1.0,1.0,1.0,1.0,,,,,,


In [12]:
# After studying the data, it turned out that more modifications were needed.
# They are summarized as follows:
# in each newly created loan column (listed in `features` below), entries equal to 0 are converted to 1,
# entries equal to 1 are left unchanged, and NaN entries are filled with 0.


features = ["Auto Loan", "Credit-Builder Loan", "Personal Loan", "Home Equity Loan", "Mortgage Loan", "Student Loan", "Debt Consolidation Loan", "Payday Loan"]

# Handle all loan columns at once. The order matters: 0 -> 1 first, then NaN -> 0,
# otherwise the freshly filled zeros would be turned into ones as well.
final_df[features] = final_df[features].replace(0, 1).fillna(0)

In [13]:
# Drop the placeholder columns ("No Data", "Not Specified"): they do not describe a loan.
placeholder_columns = ["No Data", "Not Specified"]
# Rebind `final_df` instead of dropping in place, and tolerate columns that are already gone,
# so that re-running this cell does not raise a KeyError.
final_df = final_df.drop(columns=placeholder_columns, errors='ignore')
# Save the modified dataframe to a CSV file (without the index column)
final_df.to_csv('modified_final_df.csv', index=False)

The second part of the data modification continues in the notebook named "Modifying_2".