In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw bank personal-loan dataset into the working frame `df`.
raw_csv_path = "../data/raw/Bank_Personal_Loan_Modelling.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Display the frame exactly as loaded, before any cleaning step has run.
df

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Per-column dtypes as inferred by read_csv; useful for spotting numeric
# columns that were parsed as text.
print(df.dtypes)

In [None]:
# Frequency of each distinct full row.
# NOTE(review): the `ID` column is still present at this point, so every row is
# presumably unique and each count would be 1 — confirm whether per-column
# counts were intended instead.
df.value_counts()

In [None]:
# Missing-value count for every column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

In [None]:
# Normalise CCAvg to a float and scale it by 12.
#
# Series.replace("/", ".") only swaps cells whose ENTIRE value equals "/", so it
# never touches a "/" embedded inside a value. The .str accessor performs the
# intended character-level substitution (presumably the raw file uses "/" as a
# decimal separator, e.g. "1/60" — confirm against the CSV).
# astype(str) first keeps this working when the column was already parsed as float.
#
# NOTE: this cell is not idempotent — re-running it without reloading `df`
# multiplies CCAvg by 12 again.
# NOTE(review): the factor 12 presumably converts a monthly figure to an annual
# one — confirm.
df['CCAvg'] = (
    df['CCAvg']
    .astype(str)
    .str.replace("/", ".", regex=False)
    .astype(float)
    * 12
)
df

In [None]:
# Replace every Experience value with its magnitude, removing negative entries.
df["Experience"] = df["Experience"].abs()
df

In [None]:
# Remove the `ID` column from the working frame.
df = df.drop(columns=['ID'])
df

In [None]:
# Summary statistics without the first (`count`) row, one feature per line,
# shaded row-wise so the larger statistics of each feature stand out.
summary_stats = df.describe().iloc[1:]
summary_stats.T.style.background_gradient(axis=1, cmap='Greens')

In [None]:
# Scatter of ZIP Code against the loan flag, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))

sns.scatterplot(data=df, x='ZIP Code', y='Personal Loan', color='firebrick', alpha=0.6, ax=ax)

ax.grid(True, linestyle='--', alpha=0.7)
ax.set_title('Relationship between ZIP Code and Personal Loan', fontsize=14)
ax.set_xlabel('ZIP Code')
ax.set_ylabel('Personal Loan')
fig.tight_layout()

plt.show()

In [None]:
# Rows whose ZIP Code is below 20000 and whose loan flag is below 0.1.
low_zip = df['ZIP Code'].lt(20000)
no_loan = df['Personal Loan'].lt(.1)
noise = df.loc[low_zip & no_loan]
noise

In [None]:
# Remove the row(s) isolated in `noise` (low ZIP Code, no loan).
# Taking the labels from `noise.index` instead of a hard-coded 384 keeps the drop
# tied to the filter that found the row, and errors="ignore" lets the cell be
# re-run without a KeyError once the row is already gone.
df = df.drop(index=noise.index, errors="ignore")

In [None]:
# Pairwise correlations between the columns of the working frame.
corr_matrix = df.corr()

# Annotated heatmap of those correlations on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Kernel-density curves of each listed feature, split by loan outcome,
# laid out on a 2x3 grid (one panel per feature).
#
# sns.distplot is deprecated; sns.kdeplot is its replacement for the
# hist=False / kde=True usage. The legend labels also fix the "Lone" typo, and
# each panel now gets a title and a legend so it can be read on its own.
nums = ["Age", "Experience", "Income", "CCAvg", "Mortgage", "ZIP Code"]

# The two class subsets do not depend on the feature, so build them once.
loan_rejected = df[df['Personal Loan'] == 0]
loan_accepted = df[df['Personal Loan'] == 1]

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, col in zip(axes.flat, nums):
    sns.kdeplot(data=loan_rejected, x=col, ax=ax, label='No Personal Loan', color='red')
    sns.kdeplot(data=loan_accepted, x=col, ax=ax, label='Personal Loan', color='darkorange')
    ax.set_title(f'{col} by Personal Loan')
    ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Discrete columns whose category frequencies are printed one after another,
# each followed by a 25-underscore rule.
cols_count = ['Family', 'Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
for col in cols_count:
    frequencies = df[col].value_counts()
    print(f"\n{frequencies}", '_'*25, sep="\n")

In [None]:
# One bar chart of category counts per discrete column, each in its own figure.
for col in cols_count:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=col, palette='viridis', ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

In [None]:
# Income distribution for each loan outcome.
ax = sns.boxplot(data=df, x='Personal Loan', y='Income')
ax.set_title('Income vs Loan Acceptance')
plt.show()

# Customer counts per education level, split by loan outcome.
ax = sns.countplot(data=df, x='Education', hue='Personal Loan')
ax.set_title('Education Level vs Loan Acceptance')
plt.show()

# Customer counts per family size, split by loan outcome.
ax = sns.countplot(data=df, x='Family', hue='Personal Loan')
ax.set_title('Family Size vs Loan Acceptance')
plt.show()

In [None]:
# Define age bins and labels
# Decade-wide age brackets from 20 to 70; the labels are derived from the edges
# ("20-30", "30-40", ...).
bins = list(range(20, 71, 10))
labels = [f"{lower}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]

# right=False makes each bracket closed on the left and open on the right,
# e.g. [20, 30).
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

print(df[['Age', 'Age Group']].head())

In [None]:
# Customer counts per age bracket, split by loan outcome.
ax = sns.countplot(data=df, x='Age Group', hue='Personal Loan')
ax.set_title('Age Group vs Loan Acceptance')
plt.show()

In [None]:
# Customer counts by the CreditCard flag, split by loan outcome.
ax = sns.countplot(data=df, x='CreditCard', hue='Personal Loan')
ax.set_title('Credit Card vs Loan Acceptance')
plt.show()

In [None]:
# Remove the `Age Group` column again before the frame is used further.
df = df.drop(columns=['Age Group'])

In [None]:
# Write the cleaned frame to disk, leaving out the index column.
clean_csv_path = "../data/clean/loan_accpt_clean_2.csv"
df.to_csv(clean_csv_path, index=False)