In [29]:
import pandas as pd

try:
    df = pd.read_excel('/content/sample_data/credit_data.xlsx')
    print(df.head())
    print(df.info())
    print(df.describe())
except FileNotFoundError:
    print("Make sure 'credit_data.xlsx' is in the correct location.")

                  Cbal  Cdur                   Chist         Cpur    Camt  \
0      0 <= Rs. < 2000     9    all settled till now     Business   13790   
1      0 <= Rs. < 2000    15   dues not paid earlier  electronics   15250   
2      0 <= Rs. < 2000    36  none taken/all settled     Business   19410   
3      0 <= Rs. < 2000    48  none taken/all settled     Business  144090   
4  no checking account    24    all settled till now  electronics   31690   

                 Sbal               Edur  InRate  \
0          Rs. < 1000       1 to 4 years       2   
1  no savings account  more than 7 years       4   
2          Rs. < 1000  more than 7 years       4   
3          Rs. < 1000       1 to 4 years       2   
4          Rs. < 1000   less than 1 year       4   

                                       MSG        Oparties  ...  \
0                  married or widowed male          no one  ...   
1                              single male  yes, guarantor  ...   
2                      

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

categorical_features = ['Cbal', 'Chist', 'Cpur', 'Sbal', 'Edur', 'MSG', 'Oparties', 'Rdur', 'Prop', 'inPlans', 'Htype', 'JobType', 'telephone', 'foreign']
numerical_features = ['Cdur', 'Camt', 'InRate', 'age', 'NumCred', 'Ndepend']

In [31]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [32]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [33]:
X = df.drop('creditScore', axis=1)
y = df['creditScore']

In [34]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#Train a Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_preprocessed, y_train)

In [37]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

# Make predictions on the test set
y_pred = model.predict(X_test_preprocessed)
y_prob = model.predict_proba(X_test_preprocessed)[:, 1] # Probability of the positive class

# Calculate and print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.775
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.48      0.55        56
           1       0.82      0.89      0.85       144

    accuracy                           0.78       200
   macro avg       0.72      0.69      0.70       200
weighted avg       0.76      0.78      0.77       200

ROC AUC Score: 0.8198164682539683
Confusion Matrix:
 [[ 27  29]
 [ 16 128]]


In [38]:
# Train a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

# Make predictions and evaluate the Random Forest model
y_pred_rf = rf_model.predict(X_test_preprocessed)
y_prob_rf = rf_model.predict_proba(X_test_preprocessed)[:, 1]

print("\nRandom Forest Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Model Evaluation:
Accuracy: 0.8
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.48      0.57        56
           1       0.82      0.92      0.87       144

    accuracy                           0.80       200
   macro avg       0.77      0.70      0.72       200
weighted avg       0.79      0.80      0.79       200

ROC AUC Score: 0.8365575396825397
Confusion Matrix:
 [[ 27  29]
 [ 11 133]]


In [39]:
from sklearn.svm import SVC

svc_model = SVC(probability=True, random_state=42) # Set probability=True to get probabilities
svc_model.fit(X_train_preprocessed, y_train)

# Make predictions and evaluate the SVC model
y_pred_svc = svc_model.predict(X_test_preprocessed)
y_prob_svc = svc_model.predict_proba(X_test_preprocessed)[:, 1]

print("\nSupport Vector Machine (SVC) Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Classification Report:\n", classification_report(y_test, y_pred_svc))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))


Support Vector Machine (SVC) Model Evaluation:
Accuracy: 0.785
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.45      0.54        56
           1       0.81      0.92      0.86       144

    accuracy                           0.79       200
   macro avg       0.74      0.68      0.70       200
weighted avg       0.77      0.79      0.77       200

ROC AUC Score: 0.814236111111111
Confusion Matrix:
 [[ 25  31]
 [ 12 132]]


In [40]:
print("\n--- Model Comparison ---")

# Store the ROC AUC scores for comparison
model_performance = {
    "Logistic Regression": roc_auc_score(y_test, y_prob),
    "Random Forest": roc_auc_score(y_test, y_prob_rf),
    "SVC": roc_auc_score(y_test, y_prob_svc)
}

# Find the best performing model based on ROC AUC
best_model_name = max(model_performance, key=model_performance.get)
print(f"Best performing model based on ROC AUC: {best_model_name}")


if best_model_name == "Random Forest":
    from sklearn.model_selection import GridSearchCV

    # Define the parameter grid to search
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                               param_grid=param_grid,
                               scoring='roc_auc',
                               cv=5,
                               n_jobs=-1)

    grid_search.fit(X_train_preprocessed, y_train)

    print("\n--- Hyperparameter Tuning Results for Random Forest ---")
    print("Best parameters:", grid_search.best_params_)
    print("Best ROC AUC score:", grid_search.best_score_)

    tuned_rf_model = grid_search.best_estimator_

    y_pred_tuned_rf = tuned_rf_model.predict(X_test_preprocessed)
    y_prob_tuned_rf = tuned_rf_model.predict_proba(X_test_preprocessed)[:, 1]

    print("\n--- Tuned Random Forest Model Evaluation ---")
    print("Accuracy:", accuracy_score(y_test, y_pred_tuned_rf))
    print("Classification Report:\n", classification_report(y_test, y_pred_tuned_rf))
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob_tuned_rf))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tuned_rf))


--- Model Comparison ---
Best performing model based on ROC AUC: Random Forest

--- Hyperparameter Tuning Results for Random Forest ---
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best ROC AUC score: 0.78730407703622

--- Tuned Random Forest Model Evaluation ---
Accuracy: 0.755
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.27      0.38        56
           1       0.77      0.94      0.85       144

    accuracy                           0.76       200
   macro avg       0.71      0.61      0.61       200
weighted avg       0.74      0.76      0.72       200

ROC AUC Score: 0.8209325396825397
Confusion Matrix:
 [[ 15  41]
 [  8 136]]


In [41]:
import joblib

model_to_save = None
if best_model_name == "Random Forest" and 'tuned_rf_model' in locals():
    model_to_save = tuned_rf_model
elif best_model_name == "Logistic Regression":
    model_to_save = model
elif best_model_name == "SVC":
    model_to_save = svc_model
else:
    # Handle cases where the best model wasn't tuned or is not in the list
    print("Cannot save model: Best model not found or not supported for saving in this example.")

if model_to_save:
    filename = f'{best_model_name.lower().replace(" ", "_")}_credit_scoring_model.joblib'
    joblib.dump(model_to_save, filename)
    print(f"\nModel saved to {filename}")


Model saved to random_forest_credit_scoring_model.joblib


In [28]:
# Continue from your existing notebook
# Make sure you have run all the previous cells including:
# - Data loading and exploration
# - Data preprocessing (including fitting the preprocessor and label_encoder)
# - Model training (Logistic Regression, Random Forest, SVC)
# - Model selection and potential hyperparameter tuning (to determine best_model_name and model_to_evaluate)

# --- Predict Credit Score for New Data ---

# Define some sample new data for which you want to predict the credit score
# Replace with the actual feature values for the individuals you want to predict on
sample_new_data = pd.DataFrame({
    'Cbal': ['A11', 'A14'],
    'Cdur': [6, 18],
    'Chist': ['A34', 'A32'],
    'Cpur': ['A43', 'A40'],
    'Camt': [1169, 3500],
    'Sbal': ['A65', 'A61'],
    'Edur': ['A73', 'A71'],
    'InRate': [4, 2],
    'MSG': ['A93', 'A92'],
    'Oparties': ['A101', 'A103'],
    'Rdur': ['A121', 'A123'],
    'Prop': ['A131', 'A132'],
    'age': [67, 35],
    'inPlans': ['A143', 'A141'],
    'Htype': ['A152', 'A151'],
    'NumCred': [2, 1],
    'JobType': ['A173', 'A172'],
    'Ndepend': [1, 2],
    'telephone': ['A192', 'A191'],
    'foreign': ['A201', 'A202']
})

print("\n--- Sample New Data for Prediction ---")
print(sample_new_data)


--- Sample New Data for Prediction ---
  Cbal  Cdur Chist Cpur  Camt Sbal Edur  InRate  MSG Oparties  Rdur  Prop  \
0  A11     6   A34  A43  1169  A65  A73       4  A93     A101  A121  A131   
1  A14    18   A32  A40  3500  A61  A71       2  A92     A103  A123  A132   

   age inPlans Htype  NumCred JobType  Ndepend telephone foreign  
0   67    A143  A152        2    A173        1      A192    A201  
1   35    A141  A151        1    A172        2      A191    A202  

Error: No trained model found to make predictions. Please run the model training steps.


In [42]:
# Check if the Logistic Regression model ('model') exists and is trained
if 'model' in locals() and model is not None and hasattr(model, 'predict'):

    # Apply the same preprocessing steps to the new data
    # Use the preprocessor object that was already fitted on the training data
    new_data_preprocessed = preprocessor.transform(sample_new_data)

    # Make predictions using the trained Logistic Regression model
    predictions = model.predict(new_data_preprocessed)

    # If the model supports probabilities, get the predicted probabilities
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(new_data_preprocessed)
        print("\nPredicted Probabilities (Class 0, Class 1):\n", probabilities)
        # Print the probability of the positive class (assuming it's the second column)
        print("Predicted Probability of Positive Class:\n", probabilities[:, 1])

    # Convert the predicted numerical label back to the original creditScore value
    # Use the label_encoder fitted earlier
    predicted_credit_score_original = label_encoder.inverse_transform(predictions)

    print("\nPredicted Credit Scores (Numerical Label):\n", predictions)
    print("Predicted Credit Scores (Original Label):\n", predicted_credit_score_original)

else:
    print("\nError: Logistic Regression model ('model') not found or not trained. Please run the data preprocessing and model training steps.")


Predicted Probabilities (Class 0, Class 1):
 [[0.0661422 0.9338578]
 [0.1138597 0.8861403]]
Predicted Probability of Positive Class:
 [0.9338578 0.8861403]

Predicted Credit Scores (Numerical Label):
 [1 1]
Predicted Credit Scores (Original Label):
 ['good' 'good']
