In [85]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load datasets
train_df = pd.read_csv('Training Dataset.csv')
test_df = pd.read_csv('Test Dataset.csv')

# Display the first few rows of the datasets
print(train_df.head())
print(test_df.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [87]:
# Separate features and target variable from training data
X = train_df.drop('Loan_Status', axis=1)
y = train_df['Loan_Status']

# Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess the data
X_processed = preprocessor.fit_transform(X)
test_processed = preprocessor.transform(test_df)

In [89]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Train and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    print(f'{model_name}:')
    print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
    print(f'Precision: {precision_score(y_val, y_pred, pos_label="Y" or "N", average="binary")}')
    print(f'Recall: {recall_score(y_val, y_pred, pos_label="Y" or "N", average="binary")}')
    print(f'F1-score: {f1_score(y_val, y_pred, pos_label="Y" or "N", average="binary")}\n')

Logistic Regression:
Accuracy: 0.7886178861788617
Precision: 0.7596153846153846
Recall: 0.9875
F1-score: 0.8586956521739131

Decision Tree:
Accuracy: 0.6910569105691057
Precision: 0.7282608695652174
Recall: 0.8375
F1-score: 0.7790697674418605

Random Forest:
Accuracy: 0.7886178861788617
Precision: 0.7596153846153846
Recall: 0.9875
F1-score: 0.8586956521739131

SVM:
Accuracy: 0.7967479674796748
Precision: 0.7619047619047619
Recall: 1.0
F1-score: 0.8648648648648649



In [91]:
# Support Vector Machine was the best-performing model based on evaluation
best_model = SVC()
best_model.fit(X_processed, y)

# Generate predictions for the test dataset
test_predictions = best_model.predict(test_processed)

# Create a submission file
submission = pd.DataFrame({
    'Loan_Id': test_df['Loan_ID'],
    'Loan_Status': test_predictions
})

submission.to_csv('submission.csv', index=False)

In [93]:
# Load submission.csv file
sub_df = pd.read_csv('submission.csv')

# Display the first few rows of the submission file
print(sub_df.head())

    Loan_Id Loan_Status
0  LP001015           Y
1  LP001022           Y
2  LP001031           Y
3  LP001035           Y
4  LP001051           Y
