In [5]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [9]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_data = pd.read_csv(Path("Resources/lending_data.csv"))

# Review the DataFrame
print("First few rows of the dataset:")
print(lending_data.head())
print("\nDataset information:")
print(lending_data.info())
print("\nValue counts of loan status:")
print(lending_data['loan_status'].value_counts())

First few rows of the dataset:
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  loan_status  
0                 1       22800            0  
1                 0       13600            0  
2                 0       16100            0  
3                 1       22700            0  
4                 1       23000            0  

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [17]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = lending_data["loan_status"]

# Separate the X variable, the features
# We'll use all columns except "loan_status"
X = lending_data.drop(columns="loan_status")

# Review the X and y data
print("Features (X) shape:", X.shape)
print("\nFirst few rows of features:")
print(X.head())
print("\nLabels (y) shape:", y.shape)
print("\nFirst few values of labels:")
print(y.head())

Features (X) shape: (77536, 7)

First few rows of features:
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  

Labels (y) shape: (77536,)

First few values of labels:
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [19]:
# Review the y variable Series
print("Distribution of target variable (loan_status):")
print(y.value_counts())
print("\nPercentage distribution:")
print(y.value_counts(normalize=True))

Distribution of target variable (loan_status):
loan_status
0    75036
1     2500
Name: count, dtype: int64

Percentage distribution:
loan_status
0    0.967757
1    0.032243
Name: proportion, dtype: float64


In [21]:
# Review the X variable DataFrame
print("Feature DataFrame info:")
print(X.info())
print("\nFirst few rows of features:")
print(X.head())
print("\nFeature statistics:")
print(X.describe())

Feature DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 4.1 MB
None

First few rows of features:
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664          

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [23]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

# Print the shape of the training and testing sets
print("Training Data Shape:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print("\nTesting Data Shape:")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

Training Data Shape:
X_train: (58152, 7)
y_train: (58152,)

Testing Data Shape:
X_test: (19384, 7)
y_test: (19384,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [25]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
logistic_regression_model.fit(X_train, y_train)

# Print training status and model parameters
print("Model training complete!")
print("\nModel coefficients shape:", logistic_regression_model.coef_.shape)
print("Number of features used:", len(X_train.columns))

Model training complete!

Model coefficients shape: (1, 7)
Number of features used: 7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [27]:
# Make a prediction using the testing data
predictions = logistic_regression_model.predict(X_test)

# Print first few predictions compared to actual values
print("First few predictions vs actual values:")
print("\nPredictions:", predictions[:5])
print("Actual values:", y_test[:5].values)

# Print the model's score (accuracy)
print("\nModel Score:", logistic_regression_model.score(X_test, y_test))

First few predictions vs actual values:

Predictions: [0 0 0 0 0]
Actual values: [0 0 0 0 0]

Model Score: 0.9924680148576145


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [29]:
# Generate a confusion matrix for the model
confusion_matrix_results = confusion_matrix(y_test, predictions)

# Print the confusion matrix with labels
print("Confusion Matrix:")
print("\n[True Negatives  False Positives]")
print("[False Negatives True Positives]")
print(confusion_matrix_results)

# Print confusion matrix values in a more readable format
tn, fp, fn, tp = confusion_matrix_results.ravel()
print("\nDetailed Results:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

Confusion Matrix:

[True Negatives  False Positives]
[False Negatives True Positives]
[[18655   110]
 [   36   583]]

Detailed Results:
True Negatives: 18655
False Positives: 110
False Negatives: 36
True Positives: 583


In [31]:
# Print the classification report for the model
print("Classification Report:")
print(classification_report(y_test, predictions))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.94      0.89       619

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** WRITE YOUR ANSWER HERE!

---Based on the classification report, the logistic regression model shows strong performance in predicting both classes - it achieves 99% precision and recall for healthy loans (0) and 91% precision with 85% recall for high-risk loans (1). This indicates the model is very reliable in predicting healthy loans and reasonably good at identifying high-risk loans, though it's slightly less accurate with the latter.