# Student Loan Risk with Deep Learning

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report

---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student-loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [9]:
file_path = '/content/student-loans.csv'
data = pd.read_csv(file_path)
data.head()



Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [10]:
data.info()
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   payment_history         1599 non-null   float64
 1   location_parameter      1599 non-null   float64
 2   stem_degree_score       1599 non-null   float64
 3   gpa_ranking             1599 non-null   float64
 4   alumni_success          1599 non-null   float64
 5   study_major_code        1599 non-null   float64
 6   time_to_completion      1599 non-null   float64
 7   finance_workshop_score  1599 non-null   float64
 8   cohort_ranking          1599 non-null   float64
 9   total_loan_score        1599 non-null   float64
 10  financial_aid_score     1599 non-null   float64
 11  credit_ranking          1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,0.534709
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.49895
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,0.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,0.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,1.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,1.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,1.0


In [7]:
X = data.drop(columns=['credit_ranking'])
y = data['credit_ranking']


### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [12]:
X = data.drop(columns=['credit_ranking'])
y = data['credit_ranking']


### Step 3: Split the features and target sets into training and testing datasets.


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [16]:
model = Sequential([
    Dense(16, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Step 2: Compile and fit the model using the `binary_crossentropy` loss function, the `adam` optimizer, and the `accuracy` evaluation metric.


In [18]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [19]:
history = model.fit(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test))

Epoch 1/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.4754 - loss: 0.7070 - val_accuracy: 0.6281 - val_loss: 0.6460
Epoch 2/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5734 - loss: 0.6444 - val_accuracy: 0.6687 - val_loss: 0.6121
Epoch 3/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6626 - loss: 0.6132 - val_accuracy: 0.6969 - val_loss: 0.5885
Epoch 4/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7009 - loss: 0.5883 - val_accuracy: 0.7250 - val_loss: 0.5696
Epoch 5/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7115 - loss: 0.5656 - val_accuracy: 0.7250 - val_loss: 0.5551
Epoch 6/50
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7517 - loss: 0.5345 - val_accuracy: 0.7344 - val_loss: 0.5434
Epoch 7/50
[1m40/40[0m [32m━━━━━━━━━

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [20]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7499 - loss: 0.5027 
Loss: 0.5131015777587891, Accuracy: 0.734375


### Step 4: Save and export your model to a keras file, and name the file `student_loans.keras`.


In [21]:
model.save('student_loans.keras')


---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [22]:
loaded_model = tf.keras.models.load_model('student_loans.keras')

### Step 2: Make predictions on the testing data and save the predictions to a DataFrame.

In [23]:
predictions = (loaded_model.predict(X_test_scaled) > 0.5).astype(int)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [24]:
# Save the predictions to a DataFrame and round the predictions to binary results
predictions_df = pd.DataFrame(predictions, columns=['predictions'])

# Convert the predictions to binary (0 or 1) if not already binary
predictions_df['predictions'] = predictions_df['predictions'].round().astype(int)

# Display the first few rows of the DataFrame
print(predictions_df.head())


   predictions
0            0
1            0
2            1
3            0
4            1


### Step 4: Display a classification report with the y test data and predictions

In [25]:
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.68      0.74      0.71       141
           1       0.78      0.73      0.75       179

    accuracy                           0.73       320
   macro avg       0.73      0.74      0.73       320
weighted avg       0.74      0.73      0.74       320



---
## Discuss creating a recommendation system for student loans

Briefly answer the following questions in the space provided:

1. **Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.**

***Answer***: To build a recommendation system for student loans, the following data would be required:

**Student Demographics**: Age, gender, location, and marital status.

**Relevance**: This data helps identify specific groups of students and tailor loan options based on regional loan terms or personal circumstances.

**Educational Information**: University name, degree type (e.g., bachelor’s, master’s), field of study, GPA, and graduation year.

**Relevance**: Loan amounts and repayment terms often depend on the student’s education level and the potential future income associated with their degree.

**Financial Information**: Annual income (if working), parental income (if applicable), total student loan amount requested, credit score, and financial dependents.

**Relevance**: Lenders assess repayment ability based on financial health and creditworthiness.

**Loan History**: Any previous loans, repayment history, and current debt levels.

**Relevance**: Understanding prior borrowing behavior can help recommend loans with suitable repayment terms.

**Career Data**: Employment status, job title, and industry.

**Relevance**: Loan repayment ability depends on the stability and expected income of the student’s career.

**Why this data is relevant:** These features are key indicators of the student’s ability to repay loans and their preferences, enabling the recommendation system to suggest loans that align with their needs and financial capabilities

2. **Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.**

***Answer***: Filtering Method - Content-Based Filtering

**Justification**:Content-based filtering uses the attributes of the user (in this case, the student demographics, education, and financial profile) to recommend items (student loans).

The data collected includes detailed information about the students (e.g., education level, credit score, loan history), which is directly used to recommend loans with appropriate interest rates and repayment terms.

Collaborative filtering (based on user similarity) is less suitable here because student loans are highly individualized products, and recommendations should focus on the student’s specific needs rather than patterns from similar users.

3. **Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.**

***Answer:***

**Challenge 1**: Data Privacy and Security

**Explanation:**
Student loan applications involve highly sensitive personal data, including financial details, credit scores, and employment history. Ensuring the privacy and security of this data is critical to prevent breaches and misuse.
Failure to secure this data could lead to legal penalties, loss of trust, and reputational damage to the company.

**Challenge 2**: Bias and Fairness

**Explanation:**
A recommendation system could inadvertently introduce bias based on gender, ethnicity, or socioeconomic status. For example, students from lower-income backgrounds might unfairly receive higher interest rates or less favorable loan options.
Addressing bias is critical to ensure the system is equitable and does not perpetuate discriminatory lending practices.