Loan Prediction

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Read the loan dataset from a CSV file addressed relative to the working directory.
df = pd.read_csv("loan_data.csv")
# Show the first rows as a quick sanity check of the load.
df.head()

In [None]:
# Summary of the loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
print(df.shape)

In [4]:
# Remove the Loan_ID column before analysis (presumably a row identifier with
# no predictive value - confirm against the dataset description).
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
df = df.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Column dtypes and non-null counts again, for comparison with the earlier summary.
df.info()

In [None]:
# Columns handled as numeric are listed by hand; every other column currently
# present in df is treated as categorical.
numeric_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
]
categorical_features = [name for name in df.columns if name not in numeric_features]

def impute_and_encode(df, numeric_features, categorical_features):
  """Fill missing values in every column of ``df`` and return the frame.

  Despite the name, no encoding is performed: the function only imputes.
  Columns listed in ``numeric_features`` are filled with the column mean;
  columns listed in ``categorical_features`` are filled with the column's
  most frequent value. A column that appears in both lists is treated as
  numeric.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to impute. It is modified in place, so pass a copy if the
      original must be preserved.
  numeric_features : collection of column labels
      Columns to fill with their mean.
  categorical_features : collection of column labels
      Columns to fill with their mode.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, with missing values filled where possible.
      A categorical column with no observed values has no mode and is left
      unchanged; an all-missing numeric column keeps its NaNs because its
      mean is NaN.

  Raises
  ------
  ValueError
      If a column of ``df`` is in neither feature list. The check runs
      before any imputation, so ``df`` is untouched when this is raised.
  """
  # Validate every column first so a bad feature list never leaves the frame
  # half-imputed.
  for col in df.columns:
    if col not in numeric_features and col not in categorical_features:
      raise ValueError(f"Column '{col}' not found in either numeric_features or categorical_features lists.")

  for col in df.columns:
    if col in numeric_features:
      df[col] = df[col].fillna(df[col].mean())  # Impute with mean for numerical columns
    else:
      modes = df[col].mode()
      # mode() is empty when the column holds no observed values; indexing
      # it would raise, so such a column is skipped.
      if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])  # Impute with mode for categorical columns
  return df

# Impute on a copy so the frame previously bound to `df` is not modified in place.
df = impute_and_encode(df.copy(), numeric_features, categorical_features)

# Preview the imputed frame. As the last expression of the cell it gets the
# notebook's rich table display instead of a plain-text print of the frame.
df.head()


In [None]:
# One row per numeric feature: a boxplot split by Loan_Status on the left and a
# stacked histogram with KDE, coloured by Loan_Status, on the right.
n_numeric_rows = max(len(numeric_features), 1)
# squeeze=False keeps `ax` two-dimensional even with a single feature, so the
# ax[i, j] indexing below works for any list length. The height scales with the
# row count and still yields the 10x14 figure for three features.
fig, ax = plt.subplots(n_numeric_rows, 2, figsize=(10, 14 * n_numeric_rows / 3),
                       squeeze=False)
fig.suptitle('Distribution of Numerical Features By Loan_Status', color='#3C3744',
             fontsize=20, fontweight='bold', ha='center')
fig.set_facecolor('#DDDDDD')
for i, col in enumerate(numeric_features):
    sns.boxplot(data=df, x='Loan_Status', y=col, palette="Set2_r", ax=ax[i, 0])
    ax[i, 0].set_title(f'Boxplot of {col}', fontsize=12)
    sns.histplot(data=df, x=col, hue='Loan_Status', bins=20, kde=True,
                 multiple='stack', palette="Set2_r", ax=ax[i, 1])
    # Same title size as the boxplot so both panels in a row match.
    ax[i, 1].set_title(f'Histogram of {col}', fontsize=12)
fig.tight_layout()
# Leave room at the top for the suptitle, which tight_layout does not reserve.
fig.subplots_adjust(top=0.90)
# plt.savefig('images/multivariate_num.png')

In [None]:
# Grid of count plots, one per categorical feature, each split by Loan_Status.
n_grid_cols = 3
# Ceiling division: enough rows for every feature, and at least one row so
# plt.subplots never receives a zero-sized grid. Nine features give the
# original 3x3 layout and 14x12 figure.
n_grid_rows = max(1, -(-len(categorical_features) // n_grid_cols))
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(14, 4 * n_grid_rows),
                       squeeze=False)
fig.set_facecolor('#DDDDDD')

for index, cat_col in enumerate(categorical_features):
    row, col = divmod(index, n_grid_cols)
    sns.countplot(x=cat_col, data=df, hue='Loan_Status', ax=ax[row, col],
                  palette="Set2_r")  # Use light colored palette

# Hide grid cells left over when the feature count is not a multiple of the
# column count, so no empty axes are drawn.
for unused_ax in ax.flat[len(categorical_features):]:
    unused_ax.set_visible(False)

plt.subplots_adjust(hspace=1)

# save plot (do this before plt.show(); saving afterwards may write a blank image)
# fig.savefig('./images/Univariate_Cat.png')
plt.show()

In [None]:
# Number of distinct values in each column of df.
distinct_value_counts = df.nunique()
# Bare expression so the notebook displays the result.
distinct_value_counts 


Handling the missing values in the dataset

In [None]:
# Per-column count of missing entries in the current df.
df.isna().sum()


In [None]:
# Percentage share of each distinct value, one categorical column at a time,
# with a dashed separator line after every column.
for cat in categorical_features:
    print(df[cat].value_counts(normalize=True).mul(100),
          '---------------------------', sep='\n')

In [None]:
# List the columns currently treated as numeric.
print(numeric_features)

In [None]:
# List the columns currently treated as categorical.
print(categorical_features)

In [None]:
# Call head() so the first rows are displayed; the bare attribute `df.head`
# only shows the bound-method repr.
df.head()