In [103]:
import requests
import os
import pandas as pd
from io import StringIO
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Replace these values with the owner, repository, and file path
owner = "Summeryuqing01"
repo = "German_credit_risk"
file_path_train = "data/train.csv"
file_path_test = "data/test.csv"

# Crea
raw_url_train= f"https://raw.githubusercontent.com/{owner}/{repo}/main/{file_path_train}"
raw_url_test= f"https://raw.githubusercontent.com/{owner}/{repo}/main/{file_path_test}"

# Make a GET request to the raw URL
response_train = requests.get(raw_url_train)
response_test = requests.get(raw_url_test)

content_train = response_train.text
content_test = response_test.text

# Convert the CSV content to a DataFrame
train = pd.read_csv(StringIO(content_train))
test = pd.read_csv(StringIO(content_test))

In [104]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               752 non-null    int64 
 1   Sex               752 non-null    object
 2   Job               752 non-null    int64 
 3   Housing           752 non-null    object
 4   Saving.accounts   752 non-null    object
 5   Checking.account  752 non-null    object
 6   Duration          752 non-null    int64 
 7   Purpose           752 non-null    object
 8   Credit.amount     752 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 53.0+ KB


In [105]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               248 non-null    int64 
 1   Sex               248 non-null    object
 2   Job               248 non-null    int64 
 3   Housing           248 non-null    object
 4   Saving.accounts   248 non-null    object
 5   Checking.account  248 non-null    object
 6   Duration          248 non-null    int64 
 7   Purpose           248 non-null    object
 8   Credit.amount     248 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 17.6+ KB


In [106]:
#Mapping male to 1 and female to 0
train['Sex'] = train['Sex'].map({'male': 1, 'female': 0})
test['Sex'] = test['Sex'].map({'male': 1, 'female': 0})

In [107]:
train_encoded = pd.get_dummies(train, columns=['Housing',"Saving.accounts","Checking.account","Purpose"])
test_encoded = pd.get_dummies(test, columns=['Housing',"Saving.accounts","Checking.account","Purpose"])

In [108]:
train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Age                          752 non-null    int64
 1   Sex                          752 non-null    int64
 2   Job                          752 non-null    int64
 3   Duration                     752 non-null    int64
 4   Credit.amount                752 non-null    int64
 5   Housing_free                 752 non-null    uint8
 6   Housing_own                  752 non-null    uint8
 7   Housing_rent                 752 non-null    uint8
 8   Saving.accounts_little       752 non-null    uint8
 9   Saving.accounts_moderate     752 non-null    uint8
 10  Saving.accounts_quite rich   752 non-null    uint8
 11  Saving.accounts_rich         752 non-null    uint8
 12  Checking.account_little      752 non-null    uint8
 13  Checking.account_moderate    752 non-null    uint8

In [109]:
#Mapping male to 1 and female to 0
train_ordinal = train
test_ordinal = test

train_ordinal['Housing'] = train_ordinal['Housing'].map({'free': 0, 'rent': 1, 'own': 2})
test_ordinal['Housing'] = test_ordinal['Housing'].map({'free': 0, 'rent': 1, 'own': 2})
train_ordinal['Saving.accounts'] = train_ordinal['Saving.accounts'].map({'little': 0, 'moderate': 1, 'rich': 2, 'quite rich': 3})
test_ordinal['Saving.accounts'] = test_ordinal['Saving.accounts'].map({'little': 0, 'moderate': 1, 'rich': 2, 'quite rich': 3})
train_ordinal['Checking.account'] = train_ordinal['Checking.account'].map({'little': 0, 'moderate': 1, 'rich': 2})
test_ordinal['Checking.account'] = test_ordinal['Checking.account'].map({'little': 0, 'moderate': 1, 'rich': 2})

In [192]:
X_train = train_encoded.drop("Credit.amount", axis=1)
y_train = train_encoded["Credit.amount"]
X_test = test_encoded.drop("Credit.amount", axis=1)
y_test = test_encoded["Credit.amount"]

In [203]:
X_train_2 = train_ordinal.drop("Credit.amount", axis=1)
X_train_2 = X_train_2.drop("Purpose", axis=1)
y_train_2 = train_ordinal["Credit.amount"]
X_test_2 = test_ordinal.drop("Credit.amount", axis=1)
X_test_2 = X_test_2.drop("Purpose", axis=1)
y_test_2 = test_ordinal["Credit.amount"]

In [193]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Age                          752 non-null    int64
 1   Sex                          752 non-null    int64
 2   Job                          752 non-null    int64
 3   Duration                     752 non-null    int64
 4   Housing_free                 752 non-null    uint8
 5   Housing_own                  752 non-null    uint8
 6   Housing_rent                 752 non-null    uint8
 7   Saving.accounts_little       752 non-null    uint8
 8   Saving.accounts_moderate     752 non-null    uint8
 9   Saving.accounts_quite rich   752 non-null    uint8
 10  Saving.accounts_rich         752 non-null    uint8
 11  Checking.account_little      752 non-null    uint8
 12  Checking.account_moderate    752 non-null    uint8
 13  Checking.account_rich        752 non-null    uint8

In [194]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Age                          248 non-null    int64
 1   Sex                          248 non-null    int64
 2   Job                          248 non-null    int64
 3   Duration                     248 non-null    int64
 4   Housing_free                 248 non-null    uint8
 5   Housing_own                  248 non-null    uint8
 6   Housing_rent                 248 non-null    uint8
 7   Saving.accounts_little       248 non-null    uint8
 8   Saving.accounts_moderate     248 non-null    uint8
 9   Saving.accounts_quite rich   248 non-null    uint8
 10  Saving.accounts_rich         248 non-null    uint8
 11  Checking.account_little      248 non-null    uint8
 12  Checking.account_moderate    248 non-null    uint8
 13  Checking.account_rich        248 non-null    uint8

In [113]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               752 non-null    int64 
 1   Sex               752 non-null    int64 
 2   Job               752 non-null    int64 
 3   Housing           752 non-null    int64 
 4   Saving.accounts   752 non-null    int64 
 5   Checking.account  752 non-null    int64 
 6   Duration          752 non-null    int64 
 7   Purpose           752 non-null    object
 8   Credit.amount     752 non-null    int64 
dtypes: int64(8), object(1)
memory usage: 53.0+ KB


In [197]:
#Training Hyperparameters for RF regressor for Random Forest model on original categorical variables.

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the RandomForestRegressor
rf_model = RandomForestRegressor()

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model with the best parameters
best_rf_model = RandomForestRegressor(**best_params)
best_rf_model.fit(X_train, y_train)

810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

Best Hyperparameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [198]:
#Random Forest Regressor on original categorical variables(X_train and X_test)

train_predictions = best_rf_model.predict(X_train)
mse = mean_squared_error(y_train, train_predictions)
r2 = r2_score(y_train, train_predictions)

print(f"Mean Squared Error Train: {mse}")
print(f"R^2 Score Train: {r2}")

# Make predictions on the test set
predictions = best_rf_model.predict(X_test)
mse_test = mean_squared_error(y_test, predictions)
r2_test = r2_score(y_test, predictions)
print(f"Mean Squared Error Test: {mse_test}")
print(f"R^2 Score Test: {r2_test}")


Mean Squared Error Train: 2645012.6532986835
R^2 Score Train: 0.6419389139003293
Mean Squared Error Test: 5298809.50007267
R^2 Score Test: 0.44878579044028155


In [204]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming X is your design matrix
vif_data = pd.DataFrame()
vif_data["Variable"] = X_train_2.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_2.values, i) for i in range(X_train_2.shape[1])]
vif_data.head(10)

Unnamed: 0,Variable,VIF
0,Age,7.534788
1,Sex,3.38718
2,Job,7.805154
3,Housing,4.908154
4,Saving.accounts,1.402557
5,Checking.account,2.186486
6,Duration,4.055789


In [205]:
X_train_2 = X_train_2.drop("Age", axis=1)
X_test_2 = X_test_2.drop("Age", axis=1)

In [206]:
X_train_2 = X_train_2.drop("Job", axis=1)
X_test_2 = X_test_2.drop("Job", axis=1)

In [207]:
vif_data = pd.DataFrame()
vif_data["Variable"] = X_train_2.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_2.values, i) for i in range(X_train_2.shape[1])]
vif_data.head(10)

Unnamed: 0,Variable,VIF
0,Sex,2.912054
1,Housing,3.698719
2,Saving.accounts,1.396378
3,Checking.account,2.109575
4,Duration,3.049116


In [215]:
#Polynomial regression

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

degree = 2  # Set the degree of the polynomial
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Fit the model to the training data
model.fit(X_train_2, y_train_2)

# Make predictions on the test set
y_pred_train = model.predict(X_train_2)
mse = mean_squared_error(y_train_2, y_pred_train)
print(f"Mean Squared Error train: {mse}")
r2_train = r2_score(y_train_2, y_pred_train)
print(f"R^2 Score Train: {r2_train}")

# Make predictions on the test set with the best model
y_pred = model.predict(X_test_2)
# Calculate and print the mean squared error
mse = mean_squared_error(y_test_2, y_pred)
print(f"Mean Squared Error test: {mse}")
r2_test = r2_score(y_test_2, y_pred)
print(f"R^2 Score Test: {r2_test}")

Mean Squared Error train: 4592136.836674041
R^2 Score Train: 0.37835250005054033
Mean Squared Error test: 4757599.864212099
R^2 Score Test: 0.5050856898106773


In [216]:
#Random Forest Regressor on initial data with ordinal variables (X_train_2 and X_test_2)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the RandomForestRegressor
rf_model = RandomForestRegressor()

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=10)
grid_search.fit(X_train_2, y_train_2)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model with the best parameters
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(X_train_2, y_train_2)

810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

Best Hyperparameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}


In [217]:
#Random Forest Regressor on initial data with ordinal variables (X_train_2 and X_test_2)
train_predictions = best_rf_model.predict(X_train_2)
mse = mean_squared_error(y_train_2, train_predictions)
r2 = r2_score(y_train_2, train_predictions)

print(f"Mean Squared Error Train: {mse}")
print(f"R^2 Score Train: {r2}")

# Make predictions on the test set
predictions = best_rf_model.predict(X_test_2)
mse_test = mean_squared_error(y_test_2, predictions)
r2_test = r2_score(y_test_2, predictions)
print(f"Mean Squared Error Test: {mse_test}")
print(f"R^2 Score Test: {r2_test}")

Mean Squared Error Train: 4061119.5292571085
R^2 Score Train: 0.45023746195962755
Mean Squared Error Test: 4821506.898079757
R^2 Score Test: 0.4984376936601864


In [212]:
#Random Forest Regressor on initial data with ordinal variables and log(response) (X_train_2 and X_test_2)
y_train_log_2 = np.log(y_train_2)
y_test_log_2 = np.log(y_test_2)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the RandomForestRegressor
rf_model = RandomForestRegressor()

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=10)
grid_search.fit(X_train_2, y_train_log_2)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model with the best parameters
best_rf_model_2 = RandomForestRegressor(**best_params, random_state=42)
best_rf_model_2.fit(X_train_2, y_train_log_2)

810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ianre\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}


In [213]:
train_predictions = best_rf_model_2.predict(X_train_2)
mse = mean_squared_error(y_train_log_2, train_predictions)
r2 = r2_score(y_train_log_2, train_predictions)
print(f"Mean Squared Error Train: {mse}")
print(f"R^2 Score Train: {r2}")

# Make predictions on the test set
predictions = best_rf_model_2.predict(X_test_2)
mse_test = mean_squared_error(y_test_log_2, predictions)
r2_test = r2_score(y_test_log_2, predictions)
print(f"Mean Squared Error Test: {mse_test}")
print(f"R^2 Score Test: {r2_test}")

Mean Squared Error Train: 0.3071259732020692
R^2 Score Train: 0.48142326908780264
Mean Squared Error Test: 0.30382448864586303
R^2 Score Test: 0.5156244846416201


This gives a slightly better R^2 and a far better MSE. However, this very low MSE is likely just attributed to the log transfromation of the response variable.

In [176]:
## Linear regression for comparison to original categorical variables

model = LinearRegression()
model.fit(X_train_2, y_train_2)

# Make predictions on the test set
y_pred_train = model.predict(X_train_2)
mse = mean_squared_error(y_train_2, y_pred_train)
print(f"Mean Squared Error train: {mse}")
r2_train = r2_score(y_train_2, y_pred_train)
print(f"R^2 Score Train: {r2_train}")

# Make predictions on the test set with the best model
y_pred = model.predict(X_test_2)
# Calculate and print the mean squared error
mse = mean_squared_error(y_test_2, y_pred)
print(f"Mean Squared Error test: {mse}")
r2_test = r2_score(y_test_2, y_pred)
print(f"R^2 Score Test: {r2_test}")

Mean Squared Error train: 4711862.063974101
R^2 Score Train: 0.36214503697200784
Mean Squared Error test: 4993798.488754236
R^2 Score Test: 0.4805148804384397
