In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import skim
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

## Step 2: Load Data

In [3]:
# Load data
df = pd.read_csv('data_set/train.csv')
df_copy = df.copy()

## Step 3: Data Cleaning and Preprocessing for Categorical Data

In [5]:
# Include the object type columns except the 'Loan_ID' column
df_cat = df.select_dtypes(include=['object']).drop('Loan_ID', axis=1)

# Replace missing values with the most frequent value
for col in df_cat.columns:
    df_cat[col].fillna(df_cat[col].mode()[0], inplace=True)

df_cat.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cat[col].fillna(df_cat[col].mode()[0], inplace=True)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y
3,Male,Yes,0,Not Graduate,No,Urban,Y
4,Male,No,0,Graduate,No,Urban,Y


## Step 4: Data Preprocessing for Numerical Data

In [6]:
numeric = df.select_dtypes(include=['int64', 'float64']).columns

for value in numeric:
    df[value] = df[value].fillna(df[value].median())

df_num = df[numeric]


## Step 5: Encode Categorical Variables

In [7]:
# Replace categorical values with numerical values
var_cat = df_cat.replace({
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'3+': 3, '0': 0, '1': 1, '2': 2},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Urban': 0, 'Rural': 1, 'Semiurban': 2},
    'Loan_Status': {'Y': 1, 'N': 0}
})

  var_cat = df_cat.replace({


## Step 6: Concatenate DataFrames

In [8]:
# Concatenate the two dataframes
df_concat = pd.concat([var_cat, df_num], axis=1)
df_concat.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,0,0,0,1,0,0,1,5849,0.0,128.0,360.0,1.0
1,0,1,1,1,0,1,0,4583,1508.0,128.0,360.0,1.0
2,0,1,0,1,1,0,1,3000,0.0,66.0,360.0,1.0
3,0,1,0,0,0,0,1,2583,2358.0,120.0,360.0,1.0
4,0,0,0,1,0,0,1,6000,0.0,141.0,360.0,1.0


## Step 7: Data Information

In [9]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int64  
 1   Married            614 non-null    int64  
 2   Dependents         614 non-null    int64  
 3   Education          614 non-null    int64  
 4   Self_Employed      614 non-null    int64  
 5   Property_Area      614 non-null    int64  
 6   Loan_Status        614 non-null    int64  
 7   ApplicantIncome    614 non-null    int64  
 8   CoapplicantIncome  614 non-null    float64
 9   LoanAmount         614 non-null    float64
 10  Loan_Amount_Term   614 non-null    float64
 11  Credit_History     614 non-null    float64
dtypes: float64(4), int64(8)
memory usage: 57.7 KB


## Step 8: Split Data into Training and Test Sets

In [None]:
# Split the data
X = df_concat.drop(columns=['Loan_Status', 'Loan_Amount_Term', 'Gender', 'Self_Employed'], axis=1)
y = df_concat['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)