In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Read the training, test and sample-submission CSVs from the data/ folder
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Training Dataset.csv',
        'data/Test Dataset.csv',
        'data/Sample_Submission.csv',
    )
)

In [5]:
# Preprocessing function
def preprocess_data(data):
    """Impute missing values and label-encode the categorical columns.

    The work is done on a copy, so the frame passed in by the caller is left
    untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns 'Gender', 'Married',
        'Dependents', 'Self_Employed', 'Credit_History', 'LoanAmount',
        'Loan_Amount_Term', 'Education' and 'Property_Area'.

    Returns
    -------
    tuple
        ``(processed, label_encoders)``: ``processed`` is the imputed and
        encoded copy of ``data``; ``label_encoders`` maps each encoded column
        name to the ``LabelEncoder`` that was fitted on it.
    """
    # Copy first so the column assignments below never write into the
    # caller's DataFrame.
    data = data.copy()

    # Fill missing values: most frequent value for the categorical-like columns
    mode_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
    imputer = SimpleImputer(strategy='most_frequent')
    data[mode_columns] = imputer.fit_transform(data[mode_columns])
    # The imputer is given text and numeric columns together and hands back a
    # single array, so Credit_History would otherwise be stored as generic
    # objects; cast it back to a numeric dtype.
    data['Credit_History'] = data['Credit_History'].astype(float)

    # Fill missing values: column mean for the continuous columns
    for column in ['LoanAmount', 'Loan_Amount_Term']:
        data[column] = data[column].fillna(data[column].mean())

    # Encode categorical variables, keeping each fitted encoder so the same
    # mapping can be applied to other data later
    label_encoders = {}
    for column in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

    return data, label_encoders

In [6]:
# Clean and encode the training frame; keep the fitted encoders for later reuse
train_data, label_encoders = preprocess_data(data=train_data)

In [7]:
# Build the binary target (Y -> 1, N -> 0) and the feature matrix
# (every column except the row identifier and the target itself)
y = train_data['Loan_Status'].map({'Y': 1, 'N': 0})
X = train_data.drop(['Loan_ID', 'Loan_Status'], axis=1)

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Fit a random forest (default hyperparameters, fixed seed) on the training split
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [10]:
# Score the fitted model on the held-out validation rows
y_pred = model.predict(X=X_val)
print("Validation Accuracy:", accuracy_score(y_true=y_val, y_pred=y_pred))

Validation Accuracy: 0.7723577235772358


In [11]:
# Preprocess the test data with the encoders AND the fill values learned from
# the training data, so test rows are transformed the same way as the rows the
# model was trained on (no statistics are taken from the test set itself).
# NOTE: this cell rewrites test_data in place; reload the test CSV before
# running it a second time, because the encoders expect the raw labels.
for column, encoder in label_encoders.items():
    # train_data[column] already holds encoded integers, so translate its most
    # frequent code back to the raw label before filling the raw test column.
    train_mode_label = encoder.inverse_transform([train_data[column].mode()[0]])[0]
    test_data[column] = test_data[column].fillna(train_mode_label)
    test_data[column] = encoder.transform(test_data[column])
# Continuous columns: fill with the training mean
for column in ['LoanAmount', 'Loan_Amount_Term']:
    test_data[column] = test_data[column].fillna(train_data[column].mean())
# Credit_History: fill with the most frequent training value
test_data['Credit_History'] = test_data['Credit_History'].fillna(train_data['Credit_History'].mode()[0])

In [12]:
# Drop the identifier column and let the trained model label the test rows
X_test = test_data.drop('Loan_ID', axis=1)
test_predictions = model.predict(X=X_test)

In [14]:
# Store the 0/1 predictions in the submission frame, translate them to the
# 'Y'/'N' labels, and write the result to disk
sample_submission['Loan_Status'] = test_predictions
status_column = sample_submission['Loan_Status']
sample_submission['Loan_Status'] = status_column.map({1: 'Y', 0: 'N'})
sample_submission.to_csv(path_or_buf='data/Submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
