In [None]:
import numpy as np
import pandas as pd

In [None]:
# Colab-only step: attach Google Drive to the runtime's file system so that
# files under the mount point can be read and written like local files.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Datasets

In [None]:
# Template for every file in the loan-eligibility folder on the mounted Drive;
# the "{}" placeholder is replaced with a file name.
DATASET_PATH_FMT = "/content/drive/MyDrive/Datasets/Loan_Eligibility/{}"

# Locations of the raw training and test CSV files, built from the template.
TRAIN_DATA_PATH, TEST_DATA_PATH = map(
  DATASET_PATH_FMT.format, ("loan-train.csv", "loan-test.csv")
)

In [None]:
# Load the raw training split (the rendered preview shows a Loan_Status column).
train = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
# Load the raw test split (no Loan_Status column in the rendered preview).
test = pd.read_csv(filepath_or_buffer=TEST_DATA_PATH)

# Visualise general format

In [None]:
# Preview the raw training frame; the rendered output elides the middle rows.
train

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [None]:
# Preview the raw test frame; the rendered output elides the middle rows.
test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


# Drop unneeded columns

In [None]:
# Loan_ID is a row identifier, not a feature, so remove it from both frames.
# errors="ignore" keeps this cell safe to re-run: dropping an already-removed
# column would otherwise raise KeyError. Rebinding the names (instead of
# inplace=True) leaves the frames returned by read_csv untouched.
train = train.drop(columns=["Loan_ID"], errors="ignore")
test = test.drop(columns=["Loan_ID"], errors="ignore")

In [None]:
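# NOTE: disabled exploratory check, kept for reference only. It collects the
# columns whose fraction of missing values is at most `tolerance` and prints
# the summed NaN count of those columns as a percentage of the training rows.
# tolerance = 0.03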
# tolerable_cols = [column for column, tolerable in (train.isna().mean() <= tolerance).items() if tolerable]

# print("Percent of rows to be dropped:", sum([train[col].isna().sum() for col in tolerable_cols]) / train.shape[0] * 100)

# Impute missing values and encode the columns shared with the test dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Impute missing values in both frames with statistics taken from the training
# frame only: object-typed columns use the mode, all other columns the median.
object_function = "mode"
numerical_function = "median"

for col in train:
  if train[col].dtypes == "object":
    # Series.mode() returns a Series (there can be ties); [0] takes its first entry.
    fill_value = getattr(train[col], object_function)()[0]
  else:
    fill_value = getattr(train[col], numerical_function)()

  # Assign the filled column back instead of train[col].fillna(..., inplace=True):
  # the chained in-place call acts on a temporary Series, which triggers a
  # FutureWarning on recent pandas releases and leaves the frame unchanged
  # once Copy-on-Write is enabled.
  train[col] = train[col].fillna(fill_value)

  # Columns that exist only in the training frame (the Loan_Status target)
  # have nothing to fill on the test side.
  if col in test.columns:
    test[col] = test[col].fillna(fill_value)

In [None]:
# Fill the columns below with training-set statistics. Results are assigned
# back rather than using fillna(inplace=True) on train[col] / test[col]: that
# chained form operates on a temporary Series, raises a FutureWarning on recent
# pandas releases and has no effect on the frame once Copy-on-Write is enabled.
mode_filled_cols = ["Gender", "Married", "Dependents", "Self_Employed", "Credit_History"]
for col in mode_filled_cols:
  # Series.mode() returns a Series; [0] picks its first entry.
  fill_value = train[col].mode()[0]
  train[col] = train[col].fillna(fill_value)
  test[col] = test[col].fillna(fill_value)

# These two columns are only filled in the test frame by this cell.
test["Loan_Amount_Term"] = test["Loan_Amount_Term"].fillna(train["Loan_Amount_Term"].mode()[0])
test["LoanAmount"] = test["LoanAmount"].fillna(train["LoanAmount"].median())

# Label-encode every object-typed column of the test frame. Each encoder is
# fitted on the training column and then applied to both frames, so train and
# test share the same label-to-integer mapping.
for col in test.columns[test.dtypes == "object"]:
  encoder = LabelEncoder()
  encoder.fit(train[col])
  train[col] = encoder.transform(train[col])
  test[col] = encoder.transform(test[col])

# Remaining Columns encoding

In [None]:
# Convert the target to integers: 1 where Loan_Status is "Y", 0 otherwise.
# The guard keeps the cell safe to re-run: once the column already holds
# integers, comparing it with "Y" again would overwrite every label with 0.
if not pd.api.types.is_integer_dtype(train["Loan_Status"]):
  # Vectorised cast of the boolean mask (replaces the per-element apply(int)).
  train["Loan_Status"] = (train["Loan_Status"] == "Y").astype("int64")

In [None]:
# Derived feature: applicant income plus co-applicant income, added to both frames.
for frame in (train, test):
  frame["Total_Income"] = frame["ApplicantIncome"].add(frame["CoapplicantIncome"])

In [None]:
# Spot-check the encoded Gender column of the test frame (the output shows int64 codes).
test["Gender"]

0      1
1      1
2      1
3      1
4      1
      ..
362    1
363    1
364    1
365    1
366    1
Name: Gender, Length: 367, dtype: int64

In [None]:
# Persist the processed frames next to the raw files; index=False keeps the
# row index out of the CSVs.
for frame, filename in ((train, "p_train.csv"), (test, "p_test.csv")):
  frame.to_csv(DATASET_PATH_FMT.format(filename), index=False)