## Initial Setup

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make plots appear in the notebook
%matplotlib inline

# Set style for better visualization
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(font_scale=1.2)

# Load the dataset
# Assuming the dataset folder is in the current directory
df = pd.read_csv('dataset/credit_score_dataset.csv')

# Display the first few rows of the dataset
print("Dataset Overview:")
df.head()

Dataset Overview:


Unnamed: 0,ID,Customer_ID,Month,Name,City,Street,Age,SSN,Occupation,Annual_Income,...,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly,Credit_Score
0,,CUS_0xd40,,Aaron Maashoh,Lonton,Oxford Street,,821-00-0265,Manager,19114.12,...,7.0,11.27,4.0,_,,26.82262,22 Years and 1 Months,No,80.41529543900253,3
1,0x1603,CUS_0xd40,February,Aaron Maashoh,Lonton,Oxford Street,23.0,821-00-0265,Manager,19114.12,...,,11.27,4.0,,809.98,31.94496,,No,118.28022162236736,3
2,0x1604,CUS_0xd40,,,Lonton,Oxford Street,-500.0,821-00-0265,Manager,19114.12,...,7.0,,4.0,Good,809.98,28.609352,,No,81.699521264648,3
3,0x1605,CUS_0xd40,April,Aaron Maashoh,Lonton,Oxford Street,23.0,821-00-0265,Manager,19114.12,...,4.0,6.27,4.0,Good,809.98,,22 Years and 4 Months,No,199.4580743910713,3
4,,CUS_0xd40,May,Aaron Maashoh,Lonton,Oxford Street,23.0,821-00-0265,Manager,19114.12,...,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,41.420153086217326,3


## Initial Data Exploration

In [7]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")

Dataset shape: (100000, 27)


In [8]:
# Print a heading, then the per-column dtype / non-null count summary
info_heading = "\nDataset Information:"
print(info_heading)
df.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        90058 non-null   object 
 1   Customer_ID               90107 non-null   object 
 2   Month                     90062 non-null   object 
 3   Name                      81113 non-null   object 
 4   City                      90149 non-null   object 
 5   Street                    90080 non-null   object 
 6   Age                       85512 non-null   float64
 7   SSN                       89999 non-null   object 
 8   Occupation                90056 non-null   object 
 9   Annual_Income             89906 non-null   object 
 10  Monthly_Inhand_Salary     76405 non-null   float64
 11  Num_Bank_Accounts         90167 non-null   float64
 12  Num_Credit_Card           89938 non-null   float64
 13  Interest_Rate          

In [9]:
# Descriptive statistics (count, mean, std, quartiles, extremes) for numeric columns
print("\nStatistical Summary:")
numeric_summary = df.describe()
numeric_summary


Statistical Summary:


Unnamed: 0,Age,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_Score
count,85512.0,76405.0,90167.0,89938.0,90151.0,90012.0,88102.0,90025.0,100000.0
mean,110.226845,4185.789272,17.105172,22.44941,72.343923,21.081156,27.846882,32.290967,1.88064
std,684.907588,3178.560506,117.728215,129.037233,465.684822,14.862707,193.610843,5.11467,0.683065
min,-500.0,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.10077,0.0
25%,24.0,1625.02375,3.0,4.0,8.0,10.0,3.0,28.064418,1.0
50%,33.0,3086.683333,6.0,5.0,13.0,18.0,6.0,32.31828,2.0
75%,42.0,5940.3175,7.0,7.0,20.0,28.0,9.0,36.502272,2.0
max,8698.0,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,3.0


In [10]:
# Count the missing entries in every column
print("\nMissing Values:")
missing_per_column = df.isna().sum()
missing_per_column


Missing Values:


ID                           9942
Customer_ID                  9893
Month                        9938
Name                        18887
City                         9851
Street                       9920
Age                         14488
SSN                         10001
Occupation                   9944
Annual_Income               10094
Monthly_Inhand_Salary       23595
Num_Bank_Accounts            9833
Num_Credit_Card             10062
Interest_Rate                9849
Num_of_Loan                 10191
Type_of_Loan                20312
Delay_from_due_date          9988
Num_of_Delayed_Payment      16218
Changed_Credit_Limit        10067
Num_Credit_Inquiries        11898
Credit_Mix                   9915
Outstanding_Debt             9963
Credit_Utilization_Ratio     9975
Credit_History_Age          18209
Payment_of_Min_Amount        9957
Amount_invested_monthly     14120
Credit_Score                    0
dtype: int64

## Data Cleaning

In [None]:
# Strategy in use (Option 1): keep only the rows that have a value in every column.
# NOTE(review): the missing-value counts show roughly 10-24% of entries missing in
# almost every column, so requiring all columns to be present will likely discard
# most rows - confirm the resulting shape is acceptable before relying on df_cleaned.
df_cleaned = df.dropna(axis=0, how='any')

# Alternative strategies, left disabled:
#   Option 2 - numeric column: fill gaps with the column mean
#       df['numeric_column'].fillna(df['numeric_column'].mean(), inplace=True)
#   Option 3 - categorical column: fill gaps with the most frequent value
#       df['categorical_column'].fillna(df['categorical_column'].mode()[0], inplace=True)

# Report how many rows/columns survive the cleaning step
print(f"Shape after cleaning: {df_cleaned.shape}")

# Optional dtype conversion (example: string column to category), left disabled:
#   df['column_name'] = df['column_name'].astype('category')

## Exploratory Data Analysis (EDA)

### Distribution of Credit Scores

In [None]:
# The target column is 'Credit_Score' (capitalised); the dataset has no
# 'credit_score' column, so indexing with the lowercase name raises KeyError.
# NOTE(review): the describe() output shows Credit_Score spanning 0-3 with integer
# quartiles, so it looks like a discrete class label - confirm. For that reason the
# bars are drawn one per integer value (discrete=True) and no KDE curve is overlaid,
# since a smooth density over a handful of integer levels would be misleading.
plt.figure(figsize=(10, 6))
sns.histplot(df['Credit_Score'], discrete=True)
plt.title('Distribution of Credit Scores')
plt.xlabel('Credit Score')
plt.ylabel('Frequency')
plt.show()

# Basic statistics of credit score
print(df['Credit_Score'].describe())

### Correlation Analysis

In [None]:
# Names of the int64 / float64 columns; only these take part in the correlation
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Pairwise correlation between the numeric columns
numeric_frame = df[numeric_cols]
correlation = numeric_frame.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

### Visualize Important Features

In [None]:
# Bar plot for a categorical feature.
# The placeholder 'payment_history' is not a column of this dataset; the existing
# categorical column 'Payment_of_Min_Amount' is used instead.
plt.figure(figsize=(10, 6))
sns.countplot(x='Payment_of_Min_Amount', data=df)
plt.title('Count of Payment of Minimum Amount Categories')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Box plot for a numerical feature by credit score category.
# The original bins (300-850) assume a FICO-style score, but describe() shows
# Credit_Score only spans 0-3, so pd.cut with those bins would turn every row into
# NaN. The score levels themselves are used as the categories instead.
# NOTE(review): what each level 0-3 means is not visible here, so no
# Poor/Fair/Good/Excellent labels are attached - add them once confirmed.
df['credit_score_category'] = df['Credit_Score'].astype('category')

# 'debt_to_income_ratio' is not a column of this dataset; the numeric column
# 'Credit_Utilization_Ratio' is plotted instead.
plt.figure(figsize=(12, 6))
sns.boxplot(x='credit_score_category', y='Credit_Utilization_Ratio', data=df)
plt.title('Credit Utilization Ratio by Credit Score Category')
plt.xlabel('Credit Score Category')
plt.ylabel('Credit Utilization Ratio')
plt.show()

### Pair Plots for Key Features

In [None]:
# Select a subset of features for the pair plot.
# The original placeholder names (credit_score, annual_income, ...) do not exist in
# this dataset and would raise KeyError. The columns below all appear in the
# describe() output, i.e. they are already numeric and can be plotted directly.
important_features = [
    'Credit_Score',
    'Age',
    'Monthly_Inhand_Salary',
    'Interest_Rate',
    'Credit_Utilization_Ratio',
]

# Create pair plots
sns.pairplot(df[important_features])
plt.suptitle('Pair Plots of Key Features', y=1.02)
plt.show()

### Feature Relationships Analysis

In [None]:
# Scatter plot with regression line: annual income vs. credit score.
# The lowercase names 'annual_income' / 'credit_score' are not columns of this
# dataset. In addition, df.info() reports 'Annual_Income' as object dtype, so it is
# converted to numbers first.
# NOTE(review): values that cannot be parsed as numbers become NaN (errors='coerce')
# and are left out of the plot - inspect the raw strings if many rows disappear.
income_vs_score = pd.DataFrame({
    'Annual_Income': pd.to_numeric(df['Annual_Income'], errors='coerce'),
    'Credit_Score': df['Credit_Score'],
})
plt.figure(figsize=(10, 6))
sns.regplot(x='Annual_Income', y='Credit_Score', data=income_vs_score,
            scatter_kws={'alpha': 0.5})
plt.title('Relationship between Annual Income and Credit Score')
plt.xlabel('Annual Income')
plt.ylabel('Credit Score')
plt.show()

# Categorical analysis: credit score distribution per occupation.
# 'employment_status' is not a column of this dataset; 'Occupation' is the
# employment-related categorical column that is available.
plt.figure(figsize=(12, 6))
sns.violinplot(x='Occupation', y='Credit_Score', data=df)
plt.title('Credit Score Distribution by Occupation')
plt.xlabel('Occupation')
plt.ylabel('Credit Score')
plt.xticks(rotation=45)
plt.show()

### Feature Importance Analysis (Optional)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Prepare the data
# Select features (X) and target variable (y) from the fully populated rows.
# The original placeholder names do not exist in this dataset (KeyError). The
# features below are real columns: three are numeric per describe(), and
# 'Payment_of_Min_Amount' is a string column that gets label-encoded below.
feature_columns = [
    'Monthly_Inhand_Salary',
    'Delay_from_due_date',
    'Payment_of_Min_Amount',
    'Credit_Utilization_Ratio',
]
# .copy() makes X an independent frame, so the encoding below neither triggers
# SettingWithCopyWarning nor risks writing through to df_cleaned.
X = df_cleaned[feature_columns].copy()
y = df_cleaned['Credit_Score']

# Encode categorical variables as integer codes so the model can consume them
for column in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

# Create and fit the model (fixed random_state for reproducible importances)
# NOTE(review): Credit_Score looks like a 0-3 class label per describe(); a
# classifier may be the better fit - the regressor is kept to preserve the
# original approach.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Get feature importance, most important first
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance)
plt.title('Feature Importance for Credit Score')
plt.tight_layout()
plt.show()