# Student Loan Risk with Deep Learning

In [1]:
# Imports
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from pathlib import Path

  from pandas.core import (


---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student-loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [2]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m18/lms/datasets/student-loans.csv"
loans_df = pd.read_csv(file_path)

# Review the DataFrame
loans_df.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [3]:
# Review the data types associated with the columns
loans_df.dtypes

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object

In [4]:
# Check the credit_ranking value counts
loans_df["credit_ranking"].value_counts()

credit_ranking
1    855
0    744
Name: count, dtype: int64

### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [6]:
# Define the target set y using the credit_ranking column
y = loans_df["credit_ranking"]

# Display a sample of y
y

0       0
1       0
2       0
3       1
4       0
       ..
1594    0
1595    1
1596    1
1597    0
1598    1
Name: credit_ranking, Length: 1599, dtype: int64

In [7]:
# Define features set X by selecting all columns but credit_ranking
X = loans_df.drop("credit_ranking", axis=1)

# Review the features DataFrame
X

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


### Step 3: Split the features and target sets into training and testing datasets.


In [8]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
1144,7.3,0.320,0.23,2.3,0.066,35.0,70.0,0.99588,3.43,0.62,10.1
73,8.3,0.675,0.26,2.1,0.084,11.0,43.0,0.99760,3.31,0.53,9.2
446,12.5,0.380,0.60,2.6,0.081,31.0,72.0,0.99960,3.10,0.73,10.5
399,8.7,0.765,0.22,2.3,0.064,9.0,42.0,0.99630,3.10,0.55,9.4
647,8.3,0.845,0.01,2.2,0.070,5.0,14.0,0.99670,3.32,0.58,11.0
...,...,...,...,...,...,...,...,...,...,...,...
715,7.2,0.490,0.18,2.7,0.069,13.0,34.0,0.99670,3.29,0.48,9.2
905,9.2,0.580,0.20,3.0,0.081,15.0,115.0,0.99800,3.23,0.59,9.5
1096,6.6,0.725,0.09,5.5,0.117,9.0,17.0,0.99655,3.35,0.49,10.8
235,7.2,0.630,0.00,1.9,0.097,14.0,38.0,0.99675,3.37,0.58,9.0


### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [9]:
# Create a StandardScaler instance

scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset 
## comment supplied is incorrect, Scale the data,both training and test

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
len(X_train_scaled[0])

11

---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [22]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train_scaled[0])

# Review the number of features
print(number_input_features)


11


In [27]:
# Define the number of hidden nodes for the first hidden layer
#Cominup up with a possible configuration for 11 input features


hidden_nodes_layer1 =  6

# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 = 3

# Define the number of neurons in the output layer

output_nodes_layer = 1



In [28]:
# Create the Sequential model instance

nn = tf.keras.models.Sequential()
# Add the first hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Add the second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Add the output layer to the model specifying the number of output neurons and activation function

nn.add(tf.keras.layers.Dense(units=output_nodes_layer, activation="sigmoid"))


In [29]:
# Display the Sequential model summary

nn.summary()

### Step 2: Compile and fit the model using the `binary_crossentropy` loss function, the `adam` optimizer, and the `accuracy` evaluation metric.


In [30]:
# Compile the Sequential model

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [34]:
# Fit the model using 50 epochs and the training data

fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step - accuracy: 0.7796 - loss: 0.4849
Epoch 2/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 402us/step - accuracy: 0.7738 - loss: 0.4896
Epoch 3/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371us/step - accuracy: 0.7814 - loss: 0.4680
Epoch 4/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 427us/step - accuracy: 0.7615 - loss: 0.4995
Epoch 5/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 412us/step - accuracy: 0.7637 - loss: 0.4894
Epoch 6/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step - accuracy: 0.7784 - loss: 0.4766
Epoch 7/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381us/step - accuracy: 0.7693 - loss: 0.4818
Epoch 8/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step - accuracy: 0.7688 - loss: 0.4687
Epoch 9/100
[1m38/38[0m [32m━

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407us/step - accuracy: 0.7719 - loss: 0.4759
Epoch 69/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352us/step - accuracy: 0.7619 - loss: 0.4879
Epoch 70/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356us/step - accuracy: 0.7797 - loss: 0.4743
Epoch 71/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371us/step - accuracy: 0.7884 - loss: 0.4600
Epoch 72/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step - accuracy: 0.7665 - loss: 0.4721
Epoch 73/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 414us/step - accuracy: 0.7566 - loss: 0.4966
Epoch 74/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395us/step - accuracy: 0.7845 - loss: 0.4616
Epoch 75/100
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404us/step - accuracy: 0.7920 - loss: 0.4456
Epoch 76/100
[1m38/38[0m [32m━━━━━

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [35]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

13/13 - 0s - 2ms/step - accuracy: 0.7525 - loss: 0.5556
Loss: 0.5555670261383057, Accuracy: 0.7524999976158142


### Step 4: Save and export your model to a keras file, and name the file `student_loans.keras`.


In [36]:
# Set the model's file path
file_path = Path("./student_loans.keras")

# Export your model to a keras file
nn.save(file_path)

---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [37]:
# Set the model's file path

file_path = Path("./student_loans.keras")

# Load the model to a new object

nn_imported = tf.keras.models.load_model(file_path)

### Step 2: Make predictions on the testing data and save the predictions to a DataFrame.

In [41]:
# Use the model you saved in the previous section to make predictions on your reserved testing data

# Make predictions with the test data
predictions = nn_imported.predict(X_test_scaled,verbose=2)

# Display a sample of the predictions
predictions[0:5,:]

13/13 - 0s - 2ms/step


array([[0.28138036],
       [0.28321266],
       [0.8416605 ],
       [0.71815383],
       [0.96722263]], dtype=float32)

In [42]:
# Save the predictions to a DataFrame and round the predictions to binary results


predictions_df = pd.DataFrame(columns=["predictions"], data=predictions)
predictions_df["predictions"] = round(predictions_df["predictions"],0)
predictions_df

Unnamed: 0,predictions
0,0.0
1,0.0
2,1.0
3,1.0
4,1.0
...,...
395,1.0
396,0.0
397,1.0
398,0.0


### Step 4: Display a classification report with the y test data and predictions

In [45]:
# Print the classification report with the y test data and predictions

print(classification_report(y_test, predictions_df["predictions"].values))


              precision    recall  f1-score   support

           0       0.73      0.75      0.74       188
           1       0.77      0.75      0.76       212

    accuracy                           0.75       400
   macro avg       0.75      0.75      0.75       400
weighted avg       0.75      0.75      0.75       400



---
## Discuss creating a recommendation system for student loans

Briefly answer the following questions in the space provided:

1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.

2. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.

3. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.

## Question 1: Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.

### Answer:
Building a recommendation system to suggest student loan options for students requires collecting and analyzing data that helps match students' needs with suitable loan products. Below is a breakdown of the data required, categorized by relevance and appropriateness:

1. Student Information
- Educational Details:
    - Degree type (e.g., undergraduate, graduate, professional).
    - Field of study.
    - Enrollment status (full-time or part-time).
- Loan Amount Needed:
    - Estimated tuition fees, living expenses, and other education-related costs.
    - Existing financial aid or scholarships received.
Relevance: Helps estimate the funding gap and tailor loan options that cover the student’s needs. For example, federal loans often have borrowing limits that may not cover full costs for certain degree types.

2. Financial Information
- Student's Financial Background:
    - Current income (if any).
    - Savings or personal contribution.
    - Family Financial Details (if dependent):
    - Parental income and assets.
    - Expected Family Contribution (EFC) from FAFSA or similar forms.
- Credit History:
    - Credit score (for private loans, where applicable).
    - Debt-to-income ratio (if the student or co-signer has existing loans).
Relevance: Many loan options depend on financial need (for federal loans) or creditworthiness (for private loans). Understanding financial details ensures suitable loan recommendations.

3. Loan Preferences and Goals
    - Loan repayment term preferences (e.g., short-term, long-term).
    - Willingness to consider co-signers for private loans.
    - Interest rate preferences (fixed or variable).
    - Post-graduation plans (e.g., career field, expected income).
Relevance: Tailors recommendations based on what students prioritize, such as lower monthly payments or minimizing overall interest.


4. Eligibility Criteria
- Citizenship status (e.g., U.S. citizen, international student).
- Eligibility for federal loans (e.g., FAFSA completion, satisfactory academic progress).
- State of residence (affects eligibility for state-sponsored loans).
Relevance: Ensures the recommended options are feasible and comply with eligibility requirements.

5. Loan Product Data
- Federal loan types and limits (e.g., Direct Subsidized, Unsubsidized).
- Private loan offerings from banks, credit unions, or online lenders:
    - Interest rates, fees, and terms.
    - Benefits like deferment, income-driven repayment, or forgiveness programs.
    - State-sponsored loan options.
- Details of existing loans: 
    - Loan balance, interest rate, loan type (federal/private), repayment plan, and remaining term.
Relevance: Comprehensive information about loan products is essential to recommend the best fit for each student's situation.

- Ethical and Privacy Considerations
Data Privacy: Ensure compliance with data protection laws (e.g., FERPA, GDPR) and secure storage of sensitive financial and personal information.
Consent: Collect data with explicit consent, informing users of its purpose.
Fairness: Avoid biases that may disproportionately disadvantage certain student groups.
By collecting and analyzing this data, the recommendation system can provide personalized, relevant, and practical loan suggestions while respecting user privacy and ethical considerations.

## Question 2: Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.
    
### Answer:
The recommendation system for student loan refinancing would primarily use context-based filtering, with elements of content-based filtering. Here's why:

### 1. Context-Based Filtering (Primary)
Definition: This approach leverages the specific attributes and circumstances of the user (e.g., financial profile, loan details, and preferences) to generate personalized recommendations.

Why Suitable:

The data selected (e.g., income, debt-to-income ratio, loan balance, repayment goals) provides a rich context about each user.
Recommendations are tailored based on the individual’s financial situation and preferences, rather than relying on patterns from other users.
Real-time external data (e.g., current interest rates, lender terms) adds an additional layer of contextual relevance.
Example in Use: A student with a high debt-to-income ratio and a preference for lower monthly payments would be matched with a refinancing option that extends the repayment term and lowers the monthly payment.

### 2. Content-Based Filtering (Secondary)
Definition: This approach uses the attributes of the items (loan products) to match them with user profiles.

Why Suitable:

The system can analyze the attributes of loan options (e.g., interest rates, repayment terms, fees) and match them to a user’s specific financial characteristics and preferences.
By leveraging data like current loan details and repayment goals, the system makes recommendations based on the "content" of the loans rather than drawing from other users' behaviors.
Example in Use: If a user prioritizes a lower interest rate, the system identifies and recommends loans with competitive rates and terms that align with their creditworthiness.

### Why Not Collaborative Filtering?
- Collaborative filtering relies on patterns from user behavior, such as preferences or actions of similar users.While this approach can be useful in certain domains (e.g., e-commerce or media recommendations), it is less appropriate here because:
- Loan refinancing decisions are highly individualized and dependent on unique financial circumstances.
- Over-relying on aggregated user data could lead to inaccurate or irrelevant recommendations, as loan products must meet strict eligibility and regulatory requirements.

### Conclusion
The recommendation system would predominantly use context-based filtering, with some elements of content-based filtering, to ensure that recommendations are grounded in the user's financial context and the detailed attributes of the loan products. This approach ensures personalized and relevant suggestions, aligning closely with the nature of student loan refinancing.


## Question 3: Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.

### Answer:

Building a recommendation system for student loans involves addressing several real-world challenges to ensure accuracy, fairness, and usability. Two key challenges are:

1. Data Privacy and Security
Concern: The system requires sensitive personal and financial data (e.g., income, credit scores, and family contribution) to make accurate recommendations. Handling this data improperly could lead to breaches of privacy, compliance violations, or loss of user trust.
Why This is a Concern:
Legal Compliance: Laws like FERPA (for students in the U.S.) and GDPR (if serving international students) mandate strict data handling and transparency.
Trust: Students and families may hesitate to share sensitive information without robust assurances about data protection.
Ethical Responsibility: Misuse or exposure of sensitive data could have severe financial and reputational consequences for users and the organization.
Solution: Implement end-to-end encryption, anonymize sensitive data when possible, and ensure compliance with relevant privacy laws. Provide clear and transparent data usage policies to users.

2. Addressing Bias and Ensuring Fairness
Concern: Bias can emerge from data collection, algorithm design, or existing systemic inequities in student loan systems (e.g., access to federal loans vs. private loans for low-income students or international students).
Why This is a Concern:
Unequal Access: Students from marginalized groups might have fewer options due to systemic barriers, such as lower credit scores or lack of access to co-signers.
Algorithmic Bias: If historical data reflects existing inequities (e.g., fewer approvals for certain demographics), the system might perpetuate these biases.
Regulatory Compliance: Discrimination in financial services is illegal under laws like the Equal Credit Opportunity Act (ECOA) in the U.S.
Reputation Risks: Perceptions of unfairness could damage the credibility and adoption of the recommendation system.
Solution: Use bias mitigation techniques, such as: 

Ensuring diverse and representative training data.
Regularly auditing the algorithm for disparate impact on different demographic groups.
Offering alternative options for students facing systemic barriers, such as co-signer-free loans or income-share agreements.
Addressing These Challenges
These challenges are critical because:

Trust and Adoption: Users must feel confident that their personal information is safe and that the recommendations are unbiased and fair.
Regulatory and Ethical Responsibility: Financial recommendation systems face stringent oversight to prevent misuse or harm to vulnerable populations.
Practical Usability: If the system cannot address privacy concerns or provide equitable recommendations, it risks alienating key user groups and failing its purpose.
    