In [133]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

In [134]:
# Step 2: Load Dataset
def load_dataset(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    return pd.read_csv(file_path)

In [135]:
# Step 3: Handle Missing Values
def handle_missing_values(df, emp_length_fill=4, int_rate_fill=10.99):
    """Fill the missing entries of ``person_emp_length`` and ``loan_int_rate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``person_emp_length`` and ``loan_int_rate`` columns.
    emp_length_fill : scalar, default 4
        Value written wherever ``person_emp_length`` is missing.
    int_rate_fill : scalar, default 10.99
        Value written wherever ``loan_int_rate`` is missing.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns filled. The frame passed in is not
        modified, so re-running the calling cell always starts from the same data.

    Raises
    ------
    KeyError
        If either of the two columns is absent from ``df``.
    """
    # assign() returns a fresh frame and keeps existing columns in their
    # original positions, so the caller's object is never written to.
    return df.assign(
        person_emp_length=df['person_emp_length'].fillna(value=emp_length_fill),
        loan_int_rate=df['loan_int_rate'].fillna(value=int_rate_fill),
    )

In [136]:
# Step 4: Filter DataFrame
def filter_dataframe(df, min_age, max_age, min_income, max_income, min_loan_amount, home_ownership='OWN'):
    """Keep only the rows that satisfy every age, income, loan and ownership bound.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``person_age``, ``person_income``, ``loan_amnt`` and
        ``person_home_ownership`` columns.
    min_age, max_age : number
        Inclusive bounds on ``person_age``.
    min_income, max_income : number
        Inclusive bounds on ``person_income``.
    min_loan_amount : number
        Inclusive lower bound on ``loan_amnt``.
    home_ownership : str or None, default 'OWN'
        Required value of ``person_home_ownership``. Pass ``None`` to keep
        every ownership category.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with their original index labels.

    Raises
    ------
    KeyError
        If a column needed by an active condition is missing.
    """
    keep = (
        df['person_age'].between(min_age, max_age)
        & df['person_income'].between(min_income, max_income)
        & (df['loan_amnt'] >= min_loan_amount)
    )
    if home_ownership is not None:
        keep = keep & (df['person_home_ownership'] == home_ownership)

    # Copy so that later column assignments on the result cannot trigger
    # SettingWithCopyWarning or write through to the unfiltered frame.
    return df[keep].copy()


In [137]:
# Step 5: Identifying the target variable
def extract_features_and_target(df):
    """Split ``df`` into the model inputs and the ``loan_status`` label.

    Returns a ``(x, y)`` pair: ``x`` holds the eleven predictor columns in a
    fixed order and ``y`` is the ``loan_status`` column.
    """
    y = df['loan_status']
    feature_columns = [
        'person_age',
        'person_income',
        'person_home_ownership',
        'person_emp_length',
        'loan_intent',
        'loan_grade',
        'loan_amnt',
        'loan_int_rate',
        'loan_percent_income',
        'cb_person_default_on_file',
        'cb_person_cred_hist_length',
    ]
    return df[feature_columns], y

In [138]:
# Step 6: Splitting the data into train and test
def split_train_test_data(x, y, random_state=4, train_size=0.75):
    """Partition features and labels into training and test portions.

    Delegates to ``train_test_split`` with the given seed and training
    fraction and returns ``(x_train, x_test, y_train, y_test)``.
    """
    features_fit, features_holdout, labels_fit, labels_holdout = train_test_split(
        x,
        y,
        train_size=train_size,
        random_state=random_state,
    )
    return features_fit, features_holdout, labels_fit, labels_holdout

In [139]:
# Step 7: Preprocess Train Data
def preprocess_train_data(x_train, scaler=None, encoder=None):
    """Scale the numeric columns and one-hot encode the text columns of the training set.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features. Columns of dtype int64/float64 are scaled and
        columns of dtype object are encoded; columns of any other dtype do
        not appear in the result.
    scaler : object, optional
        Unfitted transformer exposing ``fit_transform``. Defaults to a new
        ``StandardScaler()``. Pass your own instance to keep the fitted object
        afterwards (e.g. to transform the test set with it).
    encoder : object, optional
        Unfitted transformer exposing ``fit_transform`` (whose result has
        ``toarray``) and ``get_feature_names_out``. Defaults to a new
        ``OneHotEncoder(drop='first')``.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the encoded columns, carrying the
        same row index as ``x_train``.
    """
    if scaler is None:
        scaler = StandardScaler()
    if encoder is None:
        encoder = OneHotEncoder(drop='first')

    numeric_part = x_train.select_dtypes(include=['int64', 'float64'])
    text_part = x_train.select_dtypes(include=['object'])

    scaled = pd.DataFrame(scaler.fit_transform(numeric_part), columns=numeric_part.columns)
    encoded = pd.DataFrame(
        encoder.fit_transform(text_part).toarray(),
        columns=encoder.get_feature_names_out(text_part.columns),
    )

    # Both frames are built from bare arrays, so they share the same default
    # positional index and can be joined side by side without any realignment.
    x_train_transformed = pd.concat([scaled, encoded], axis=1)

    # Restore the caller's row labels so the result stays aligned with y_train.
    x_train_transformed.index = x_train.index
    return x_train_transformed

In [140]:
# Step 8: Preprocess Test Data
def preprocess_test_data(x_test, scaler, encoder):
    """Apply already-fitted transformers to the test features.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Test features. Columns of dtype int64/float64 go through ``scaler``
        and columns of dtype object go through ``encoder``; columns of any
        other dtype do not appear in the result.
    scaler : object
        Fitted transformer exposing ``transform``.
    encoder : object
        Fitted transformer exposing ``transform`` (whose result has
        ``toarray``) and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the encoded columns, carrying the
        same row index as ``x_test``.
    """
    numeric_part = x_test.select_dtypes(include=['int64', 'float64'])
    text_part = x_test.select_dtypes(include=['object'])

    # transform() only: nothing is re-fitted here, so no test-set statistics
    # leak into the preprocessing.
    scaled = pd.DataFrame(scaler.transform(numeric_part), columns=numeric_part.columns)
    encoded = pd.DataFrame(
        encoder.transform(text_part).toarray(),
        columns=encoder.get_feature_names_out(text_part.columns),
    )

    # Both frames are built from bare arrays, so they share the same default
    # positional index and can be joined side by side without any realignment.
    x_test_transformed = pd.concat([scaled, encoded], axis=1)

    # Restore the caller's row labels so the result stays aligned with y_test.
    x_test_transformed.index = x_test.index
    return x_test_transformed

In [141]:
# Step 9: Model Building - K-Nearest Neighbors (KNN)
def build_knn_model(x_train_transformed, y_train):
    """Fit a default-configured k-nearest-neighbours classifier and return it."""
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return KNeighborsClassifier().fit(x_train_transformed, y_train)

In [142]:
# Step 10: Model Building - Logistic Regression
def build_logistic_regression_model(x_train_transformed, y_train):
    """Fit a default-configured logistic regression classifier and return it."""
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LogisticRegression().fit(x_train_transformed, y_train)

In [143]:
# Step 11: Model Building - Support Vector Machine (SVM)
def build_svm_model(x_train_transformed, y_train):
    """Fit a default-configured support vector classifier and return it."""
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return SVC().fit(x_train_transformed, y_train)

In [144]:
# Step 12: Model Building - Decision Tree
def build_decision_tree_model(x_train_transformed, y_train, random_state=4):
    """Fit a decision tree classifier and return it.

    Parameters
    ----------
    x_train_transformed : array-like
        Preprocessed training features.
    y_train : array-like
        Training labels.
    random_state : int or None, default 4
        Seed handed to ``DecisionTreeClassifier`` so that repeated runs grow
        the same tree. The default matches the seed used by
        ``split_train_test_data``; pass ``None`` for an unseeded tree.

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator.
    """
    classifier = DecisionTreeClassifier(random_state=random_state)
    classifier.fit(x_train_transformed, y_train)
    return classifier

In [145]:
# Step 13: Model Building - Random Forest
def build_random_forest_model(x_train_transformed, y_train, random_state=4):
    """Fit a random forest classifier and return it.

    Parameters
    ----------
    x_train_transformed : array-like
        Preprocessed training features.
    y_train : array-like
        Training labels.
    random_state : int or None, default 4
        Seed handed to ``RandomForestClassifier`` so that the bootstrap samples
        and feature draws -- and therefore the reported metrics -- are the same
        on every run. The default matches the seed used by
        ``split_train_test_data``; pass ``None`` for an unseeded forest.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(x_train_transformed, y_train)
    return classifier

In [146]:
# Step 14: Model Evaluation
def evaluate_model(classifier, x_test_transformed, y_test):
    """Score a fitted classifier on the test set.

    Returns ``(accuracy, confusion)``: the accuracy score and the confusion
    matrix of the classifier's predictions against ``y_test``.
    """
    predictions = classifier.predict(x_test_transformed)
    return accuracy_score(y_test, predictions), confusion_matrix(y_test, predictions)

In [148]:
# Step 15: Main Function
def main(file_path='/Users/tarunreddy/Downloads/credit_risk_dataset.csv',
         min_age=23, max_age=60, min_income=66074, max_income=6000000, min_loan_amount=5000):
    """Run the full pipeline: load, clean, filter, split, preprocess, fit and evaluate.

    Parameters
    ----------
    file_path : str, default '/Users/tarunreddy/Downloads/credit_risk_dataset.csv'
        Location of the credit-risk CSV. The default is an absolute path on
        one machine; pass your own path when running anywhere else.
    min_age, max_age, min_income, max_income, min_loan_amount : number
        Thresholds forwarded to ``filter_dataframe``.

    Returns
    -------
    None
        The accuracy and confusion matrix of each model are printed.
    """
    df = load_dataset(file_path)
    df = handle_missing_values(df)
    df = filter_dataframe(df, min_age, max_age, min_income, max_income, min_loan_amount)
    x, y = extract_features_and_target(df)
    x_train, x_test, y_train, y_test = split_train_test_data(x, y, random_state=4, train_size=0.75)
    x_train_transformed = preprocess_train_data(x_train)

    # preprocess_train_data does not hand back the transformers it fitted, so an
    # equivalent pair is fitted here -- on the training rows only -- and used to
    # transform the test set without leaking test statistics.
    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first')
    encoder.fit(x_train.select_dtypes(include=['object']))
    scaler.fit(x_train.select_dtypes(include=['int64', 'float64']))

    x_test_transformed = preprocess_test_data(x_test, scaler, encoder)

    # Build every model first, then evaluate them in the same order.
    builders = {
        'knn model': build_knn_model,
        'Logistic Regression': build_logistic_regression_model,
        'SVM': build_svm_model,
        'Decision Tree': build_decision_tree_model,
        'Random Forest': build_random_forest_model,
    }
    models = {
        model_name: build(x_train_transformed, y_train)
        for model_name, build in builders.items()
    }

    for model_name, model in models.items():
        print(f"Evaluating {model_name}:")
        accuracy, confusion = evaluate_model(model, x_test_transformed, y_test)
        print('Accuracy:', accuracy)
        print('confusion', confusion)
   # Call the main function
if __name__ == "__main__":
    main()   

Evaluating knn model:
Accuracy: 0.9722222222222222
confusion [[140   0]
 [  4   0]]
Evaluating Logistic Regression:
Accuracy: 0.9861111111111112
confusion [[140   0]
 [  2   2]]
Evaluating SVM:
Accuracy: 0.9722222222222222
confusion [[140   0]
 [  4   0]]
Evaluating Decision Tree:
Accuracy: 0.9722222222222222
confusion [[139   1]
 [  3   1]]
Evaluating Random Forest:
Accuracy: 0.9791666666666666
confusion [[140   0]
 [  3   1]]
