In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
loan_data = pd.read_csv('/content/loan.csv')

# Encode categorical variables
loan_data = pd.get_dummies(loan_data, drop_first=True)

# Define features and target
X = loan_data.drop('high_risk_applicant', axis=1)
y = loan_data['high_risk_applicant']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

# Create the RFE model and select the top 10 features
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)


In [4]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_rfe, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_




In [5]:
# Train the logistic regression model with the best hyperparameters
best_model.fit(X_rfe, y_train)


In [7]:
# Make predictions on the test set
y_pred = best_model.predict(X_test_rfe)


In [8]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Confusion Matrix:
[[209   0]
 [ 91   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      1.00      0.82       209
           1       0.00      0.00      0.00        91

    accuracy                           0.70       300
   macro avg       0.35      0.50      0.41       300
weighted avg       0.49      0.70      0.57       300


Accuracy Score:
0.6966666666666667


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2nd Model

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
loan_data = pd.read_csv('/content/loan.csv')

# Define features and target, EXCLUDING non-numerical columns
X = loan_data.drop(['high_risk_applicant', 'loan_application_id'], axis=1)  # Assuming 'loan_application_id' is the culprit
y = loan_data['high_risk_applicant']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
loan_data = pd.read_csv('/content/loan.csv')

# Define features and target, EXCLUDING non-numerical columns
X = loan_data.drop(['high_risk_applicant', 'loan_application_id'], axis=1)
y = loan_data['high_risk_applicant']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify numerical and categorical features
numerical_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline that includes preprocessing and RFE
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Transform the test data using the fitted preprocessor in the pipeline
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)

# Extract the selected features from the transformed test data
X_test_rfe = pipeline.named_steps['rfe'].transform(X_test_transformed)

# Now you can make predictions using the transformed test data
y_pred = pipeline.predict(X_test)

In [15]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_rfe, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_


In [16]:
# Train the random forest model with the best hyperparameters
best_model.fit(X_rfe, y_train)


In [17]:
# Make predictions on the test set
y_pred = best_model.predict(X_test_rfe)


In [18]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Confusion Matrix:
[[209   0]
 [ 91   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      1.00      0.82       209
           1       0.00      0.00      0.00        91

    accuracy                           0.70       300
   macro avg       0.35      0.50      0.41       300
weighted avg       0.49      0.70      0.57       300


Accuracy Score:
0.6966666666666667


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
