In [44]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [35]:
# Path to the CSV file, expressed relative to the notebook's working directory
# so the notebook is not tied to one user's machine.
# NOTE(review): assumes the notebook is run from the folder that contains
# `Resources/` (the original absolute path ended in
# Credit_Risk\Resources\lending_data.csv) -- confirm.
file_path = Path("Resources") / "lending_data.csv"

# Read the CSV file into a DataFrame
lending_data_df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(lending_data_df.head())


   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  derogatory_marks  total_debt  loan_status
0    10700.0          7.672            52800        0.431818                5                 1       22800            0
1     8400.0          6.692            43600        0.311927                3                 0       13600            0
2     9000.0          6.963            46100        0.349241                3                 0       16100            0
3    10700.0          7.664            52700        0.430740                5                 1       22700            0
4    10800.0          7.698            53000        0.433962                5                 1       23000            0


### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [36]:
# Labels (y): the loan_status column on its own
y = lending_data_df.loc[:, 'loan_status']

# Features (X): every column except loan_status
X = lending_data_df.drop('loan_status', axis=1)

# Preview the features first, then the labels, to confirm the separation
print(X.head(), y.head(), sep='\n')


   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  derogatory_marks  total_debt
0    10700.0          7.672            52800        0.431818                5                 1       22800
1     8400.0          6.692            43600        0.311927                3                 0       13600
2     9000.0          6.963            46100        0.349241                3                 0       16100
3    10700.0          7.664            52700        0.430740                5                 1       22700
4    10800.0          7.698            53000        0.433962                5                 1       23000
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [43]:
# Inspect the label Series: show its first five entries
print(y.head(n=5))

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [41]:
# Summary statistics (count, mean, std, quartiles) of the label Series
y_summary = y.describe()
print(y_summary)

count    77536.000000
mean         0.032243
std          0.176646
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: loan_status, dtype: float64


In [42]:
# How many rows carry each label value
label_counts = y.value_counts()
print(label_counts)


loan_status
0    75036
1     2500
Name: count, dtype: int64


In [38]:
# Inspect the feature DataFrame: show its first five rows
print(X.head(n=5))

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  derogatory_marks  total_debt
0    10700.0          7.672            52800        0.431818                5                 1       22800
1     8400.0          6.692            43600        0.311927                3                 0       13600
2     9000.0          6.963            46100        0.349241                3                 0       16100
3    10700.0          7.664            52700        0.430740                5                 1       22700
4    10800.0          7.698            53000        0.433962                5                 1       23000


In [39]:
# Summary statistics for every feature column
feature_summary = X.describe()
print(feature_summary)

          loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  derogatory_marks    total_debt
count  77536.000000   77536.000000     77536.000000    77536.000000     77536.000000      77536.000000  77536.000000
mean    9805.562577       7.292333     49221.949804        0.377318         3.826610          0.392308  19221.949804
std     2093.223153       0.889495      8371.635077        0.081519         1.904426          0.582086   8371.635077
min     5000.000000       5.250000     30000.000000        0.000000         0.000000          0.000000      0.000000
25%     8700.000000       6.825000     44800.000000        0.330357         3.000000          0.000000  14800.000000
50%     9500.000000       7.172000     48100.000000        0.376299         4.000000          0.000000  18100.000000
75%    10400.000000       7.528000     51400.000000        0.416342         4.000000          1.000000  21400.000000
max    23800.000000      13.235000    105200.000000        0.714

In [40]:
# Display information about the X DataFrame (e.g., column names, non-null counts, data types).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 4.1 MB
None


In [30]:
# List the feature column labels
feature_columns = X.columns
print(feature_columns)

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income', 'num_of_accounts', 'derogatory_marks', 'total_debt'], dtype='object')


In [31]:
# Report the dimensions of X as a (rows, columns) tuple
n_rows, n_cols = X.shape
print((n_rows, n_cols))

(77536, 7)


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [45]:
# Split the data into training and testing sets with random_state=1.
# `train_test_split` lives in sklearn.model_selection and must be imported in
# the notebook's import cell; without that import this cell raises NameError
# on a fresh kernel.
# NOTE(review): the labels are heavily imbalanced (2,500 of 77,536 rows are
# class 1); passing stratify=y would keep that ratio in both partitions, but it
# would change the recorded results below -- decide before re-running.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Sanity checks: the two partitions cover every row exactly once, and features
# and labels stay aligned within each partition.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# Display the shapes of the resulting datasets
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')


X_train shape: (62028, 7)
X_test shape: (15508, 7)
y_train shape: (62028,)
y_test shape: (15508,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [32]:
# Build the logistic regression classifier (random_state=1) and train it on the
# training partition in a single expression; fit() hands back the fitted estimator.
logistic_model = LogisticRegression(random_state=1).fit(X_train, y_train)

# Report the learned parameters: per-feature coefficients, then the intercept
print(f'Model coefficients: {logistic_model.coef_}')
print(f'Model intercept: {logistic_model.intercept_}')



Model coefficients: [[-1.07343332e-05 -1.11821247e-07 -3.86442644e-04 -2.57250652e-09
   1.61411871e-07  5.41492664e-08  6.42898333e-04]]
Model intercept: [-3.43113659e-08]


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [46]:
# Predict a label for every row of the held-out feature set
y_pred = logistic_model.predict(X_test)

# Show the first ten predicted labels
print(y_pred[0:10])


[0 0 0 0 0 0 0 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [52]:
# Axis labels for the table: rows are the true classes, columns the predicted ones
row_labels = ['Actual Negative', 'Actual Positive']
col_labels = ['Predicted Negative', 'Predicted Positive']

# Compare the true test labels with the model's predictions
cm = confusion_matrix(y_test, y_pred)

# Wrap the raw matrix in a labelled DataFrame so it is readable
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

# Print the labelled confusion matrix under a heading
print(f'Confusion Matrix:\n{cm_df}')


Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               14926                  75
Actual Positive                  46                 461


In [53]:
# Print the classification report for the model.
# `classification_report` is already imported in the notebook's import cell, so
# it is not re-imported here (imports belong in one place at the top).

# Generate the per-class precision / recall / f1-score report
report = classification_report(y_test, y_pred)

# Display the classification report
print(f'Classification Report:\n{report}')


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

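**Answer:** The model predicts healthy loans (`0`) almost perfectly: precision and recall both round to 1.00, with only 75 of the 15,001 healthy loans in the test set flagged as high-risk. It is noticeably weaker on high-risk loans (`1`): precision is 0.86 (75 of the 536 loans it flagged were actually healthy) and recall is 0.91 (it missed 46 of the 507 high-risk loans). The overall accuracy of 99% is dominated by the healthy class, which makes up about 97% of the data, so the class-`1` precision and recall are the more meaningful measures. By those measures the model is effective at predicting both labels, particularly healthy loans, but it does not catch every high-risk loan.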

---