In [5]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Read both CSV splits; the paths are relative, so they are resolved against
# the kernel's current working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('German train.csv', 'German test.csv')
)

# Preview the first five rows of each split as a (train, test) tuple.
(train_df.head(), test_df.head())


(   Age   Sex  Job Housing Saving.accounts Checking.account  Duration  \
 0   67  male    2     own      quite rich           little         6   
 1   49  male    1     own          little           little        12   
 2   45  male    2    free          little           little        42   
 3   53  male    2    free          little           little        24   
 4   35  male    1    free        moderate         moderate        36   
 
                Purpose  Credit.amount  
 0             radio/TV           1169  
 1            education           2096  
 2  furniture/equipment           7882  
 3                  car           4870  
 4            education           9055  ,
    Age     Sex  Job Housing Saving.accounts Checking.account  Duration  \
 0   22  female    2     own          little         moderate        48   
 1   53    male    2     own      quite rich         moderate        24   
 2   24  female    2    rent          little           little        48   
 3   60    ma

In [12]:
# Build the one-hot encoded training frame inside this cell instead of reading
# `train_encoded`: that name is only assigned by a cell further down the
# notebook, so on a fresh kernel run top to bottom it does not exist yet here.
corr_train_encoded = pd.get_dummies(
    train_df,
    columns=['Sex', 'Housing', 'Saving.accounts', 'Checking.account', 'Purpose'],
)
corr_test_encoded = pd.get_dummies(
    test_df,
    columns=['Sex', 'Housing', 'Saving.accounts', 'Checking.account', 'Purpose'],
)

# Keep only the columns present in both splits, mirroring the alignment done
# by the regression cell, so the correlations cover the same columns.
corr_test_encoded, corr_train_encoded = corr_test_encoded.align(
    corr_train_encoded, join='inner', axis=1
)

# Pairwise correlation (pandas default method) of every encoded column, then
# the 'Credit.amount' column sorted from strongest positive to strongest negative.
correlation_matrix_full = corr_train_encoded.corr()
credit_amount_correlation_full = correlation_matrix_full['Credit.amount'].sort_values(ascending=False)
credit_amount_correlation_full



Credit.amount                  1.000000
Duration                       0.589370
Job                            0.296344
Housing_free                   0.192472
Purpose_car                    0.148046
Sex_male                       0.130422
Checking.account_moderate      0.123932
Purpose_business               0.101761
Purpose_vacation/others        0.072180
Age                            0.055432
Saving.accounts_moderate       0.041984
Saving.accounts_little         0.022296
Checking.account_little       -0.011347
Purpose_furniture/equipment   -0.016070
Saving.accounts_rich          -0.029446
Purpose_education             -0.030894
Purpose_repairs               -0.040364
Housing_rent                  -0.053247
Saving.accounts_quite rich    -0.057893
Purpose_domestic appliances   -0.070935
Housing_own                   -0.087474
Sex_female                    -0.130422
Checking.account_rich         -0.166427
Purpose_radio/TV              -0.175943
Name: Credit.amount, dtype: float64

In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# One-hot encode the categorical columns of each split independently.
train_encoded = pd.get_dummies(train_df, columns=['Sex', 'Housing', 'Saving.accounts', 'Checking.account', 'Purpose'])
test_encoded = pd.get_dummies(test_df, columns=['Sex', 'Housing', 'Saving.accounts', 'Checking.account', 'Purpose'])

# Keep only the columns both splits share, so a level that occurs in just one
# split cannot produce mismatched feature matrices.
test_encoded, train_encoded = test_encoded.align(train_encoded, join='inner', axis=1)

# Correlation of every encoded column with the target, strongest positive first.
correlation_matrix = train_encoded.corr()
credit_amount_correlations = correlation_matrix['Credit.amount'].sort_values(ascending=False)

# Hand-picked predictors for the reduced model.
selected_features = ['Duration', 'Job', 'Housing_free']

X_train_selected = train_encoded[selected_features]
y_train = train_encoded['Credit.amount']
X_test_selected = test_encoded[selected_features]
y_test = test_encoded['Credit.amount']

# Fit on the training split, predict on the test split.
lr_selected = LinearRegression()
lr_selected.fit(X_train_selected, y_train)

y_pred_lr_selected = lr_selected.predict(X_test_selected)

# Compute the MSE once and derive the RMSE from it rather than scoring twice.
mse_lr_selected = mean_squared_error(y_test, y_pred_lr_selected)
rmse_lr_selected = np.sqrt(mse_lr_selected)
mae_lr_selected = mean_absolute_error(y_test, y_pred_lr_selected)
r2_lr_selected = r2_score(y_test, y_pred_lr_selected)

# Displayed in the order (MSE, MAE, R2, RMSE).
mse_lr_selected, mae_lr_selected, r2_lr_selected, rmse_lr_selected



(4898372.044991403,
 1483.4248005685452,
 0.49044171622468635,
 2213.2266140166043)

In [15]:
# Target vectors: the loan amount in each split.
y_train_all = train_encoded['Credit.amount']
y_test_all = test_encoded['Credit.amount']

# Predictors: every encoded column except the target.
X_train_all = train_encoded.drop(columns=['Credit.amount'])
X_test_all = test_encoded.drop(columns=['Credit.amount'])

# Fit on the training split (fit returns the estimator itself), then predict
# on the test split.
lr_all = LinearRegression().fit(X_train_all, y_train_all)
y_pred_lr_all = lr_all.predict(X_test_all)

# Test-set error metrics; RMSE is the square root of the MSE.
mse_lr_all = mean_squared_error(y_test_all, y_pred_lr_all)
rmse_lr_all = np.sqrt(mse_lr_all)
r2_lr_all = r2_score(y_test_all, y_pred_lr_all)

(mse_lr_all, rmse_lr_all, r2_lr_all)


(4630624.437076734, 2151.888574503042, 0.5182944416446444)

In [21]:
# Dummy-column names for the levels of 'Purpose' that are removed one at a time.
# NOTE(review): the original comment says these were assumed from a screenshot;
# confirm them against the values actually present in the data.
purpose_levels = [
    'Purpose_car', 'Purpose_business', 'Purpose_education',
    'Purpose_furniture/equipment', 'Purpose_radio/TV', 'Purpose_domestic appliances',
    'Purpose_repairs', 'Purpose_vacation/others'
]


def regression_excluding_purpose_level(train_data, test_data, level_to_exclude):
    """Fit a linear regression for 'Credit.amount' with one 'Purpose' dummy left out.

    Both frames are one-hot encoded with ``drop_first=True``, restricted to the
    columns they share, and ``level_to_exclude`` is removed from the predictors
    before the model is fitted on the training split and scored on the test
    split.

    Because of ``drop_first=True`` the first level of every categorical column
    has no dummy column of its own. Requesting such a level (or any other name
    that is not a column after encoding) cannot change the model, so a
    ``UserWarning`` is issued and the model is fitted on all predictors,
    instead of silently returning the full-model metrics.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Un-encoded splits; both must contain a 'Credit.amount' column.
    level_to_exclude : str
        Name of the encoded column to drop, e.g. ``'Purpose_car'``.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)`` computed on the test split.
    """
    import warnings  # local import so this cell does not depend on another cell

    target = 'Credit.amount'

    # Encode each split on its own, then keep only the columns both share.
    train_encoded = pd.get_dummies(train_data, drop_first=True)
    test_encoded = pd.get_dummies(test_data, drop_first=True)
    train_encoded, test_encoded = train_encoded.align(test_encoded, join='inner', axis=1)

    # The target is never a predictor; the requested level is dropped only
    # when it really exists after encoding.
    dropped_columns = [target]
    if level_to_exclude in train_encoded.columns:
        dropped_columns.append(level_to_exclude)
    else:
        warnings.warn(
            f"{level_to_exclude!r} is not a column after encoding "
            "(it may be the reference level removed by drop_first=True); "
            "fitting on all predictors instead.",
            UserWarning,
            stacklevel=2,
        )

    X_train = train_encoded.drop(columns=dropped_columns)
    X_test = test_encoded.drop(columns=dropped_columns)
    y_train = train_encoded[target]
    y_test = test_encoded[target]

    # Fit on the training split and score the predictions for the test split.
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return mse, rmse, r2

# Fit one model per 'Purpose' level and store its test metrics under the
# level's dummy-column name.
regression_results_purpose = {}
for level in purpose_levels:
    mse, rmse, r2 = regression_excluding_purpose_level(train_df, test_df, level)
    regression_results_purpose[level] = dict(zip(('MSE', 'RMSE', 'R2'), (mse, rmse, r2)))

regression_results_purpose


{'Purpose_car': {'MSE': 4661724.299859974,
  'RMSE': 2159.102660796835,
  'R2': 0.5150592458367476},
 'Purpose_business': {'MSE': 4630624.437076733,
  'RMSE': 2151.888574503042,
  'R2': 0.5182944416446446},
 'Purpose_education': {'MSE': 4616555.096172077,
  'RMSE': 2148.6170194271654,
  'R2': 0.5197580195720413},
 'Purpose_furniture/equipment': {'MSE': 4631883.3766628215,
  'RMSE': 2152.181074320379,
  'R2': 0.5181634791352696},
 'Purpose_radio/TV': {'MSE': 4616773.864174603,
  'RMSE': 2148.6679278507886,
  'R2': 0.5197352620013855},
 'Purpose_domestic appliances': {'MSE': 4635628.285871583,
  'RMSE': 2153.050925053001,
  'R2': 0.5177739110314622},
 'Purpose_repairs': {'MSE': 4575807.893082258,
  'RMSE': 2139.1138102219475,
  'R2': 0.5239967900623967},
 'Purpose_vacation/others': {'MSE': 4837095.034492065,
  'RMSE': 2199.3396814707967,
  'R2': 0.4968161214389353}}

In [22]:
# Dummy-column names for the levels of 'Checking.account' that are removed one
# at a time.
checking_account_levels = ['Checking.account_little', 'Checking.account_moderate', 'Checking.account_rich']


def regression_excluding_checking_level(train_data, test_data, level_to_exclude):
    """Fit a linear regression for 'Credit.amount' with one 'Checking.account' dummy left out.

    Both frames are one-hot encoded with ``drop_first=True``, restricted to the
    columns they share, and ``level_to_exclude`` is removed from the predictors
    before the model is fitted on the training split and scored on the test
    split.

    Because of ``drop_first=True`` the first level of every categorical column
    has no dummy column of its own. Requesting such a level (or any other name
    that is not a column after encoding) cannot change the model, so a
    ``UserWarning`` is issued and the model is fitted on all predictors,
    instead of silently returning the full-model metrics.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Un-encoded splits; both must contain a 'Credit.amount' column.
    level_to_exclude : str
        Name of the encoded column to drop, e.g. ``'Checking.account_rich'``.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)`` computed on the test split.
    """
    import warnings  # local import so this cell does not depend on another cell

    target = 'Credit.amount'

    # Encode each split on its own, then keep only the columns both share.
    train_encoded = pd.get_dummies(train_data, drop_first=True)
    test_encoded = pd.get_dummies(test_data, drop_first=True)
    train_encoded, test_encoded = train_encoded.align(test_encoded, join='inner', axis=1)

    # The target is never a predictor; the requested level is dropped only
    # when it really exists after encoding.
    dropped_columns = [target]
    if level_to_exclude in train_encoded.columns:
        dropped_columns.append(level_to_exclude)
    else:
        warnings.warn(
            f"{level_to_exclude!r} is not a column after encoding "
            "(it may be the reference level removed by drop_first=True); "
            "fitting on all predictors instead.",
            UserWarning,
            stacklevel=2,
        )

    X_train = train_encoded.drop(columns=dropped_columns)
    X_test = test_encoded.drop(columns=dropped_columns)
    y_train = train_encoded[target]
    y_test = test_encoded[target]

    # Fit on the training split and score the predictions for the test split.
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return mse, rmse, r2

# Fit one model per 'Checking.account' level and store its test metrics under
# the level's dummy-column name.
regression_results_checking = {}
for level in checking_account_levels:
    mse, rmse, r2 = regression_excluding_checking_level(train_df, test_df, level)
    regression_results_checking[level] = dict(zip(('MSE', 'RMSE', 'R2'), (mse, rmse, r2)))

regression_results_checking


{'Checking.account_little': {'MSE': 4630624.437076733,
  'RMSE': 2151.888574503042,
  'R2': 0.5182944416446446},
 'Checking.account_moderate': {'MSE': 4631206.793747046,
  'RMSE': 2152.023883173011,
  'R2': 0.5182338613819074},
 'Checking.account_rich': {'MSE': 4634371.207321198,
  'RMSE': 2152.758975668479,
  'R2': 0.5179046799446483}}

In [25]:


# Dummy-column names for the levels of 'Saving.accounts' that are removed one
# at a time.
saving_account_levels = ['Saving.accounts_quite rich', 'Saving.accounts_moderate', 'Saving.accounts_little', 'Saving.accounts_rich']


def regression_excluding_savings_level(train_data, test_data, level_to_exclude):
    """Fit a linear regression for 'Credit.amount' with one 'Saving.accounts' dummy left out.

    Both frames are one-hot encoded with ``drop_first=True``, restricted to the
    columns they share, and ``level_to_exclude`` is removed from the predictors
    before the model is fitted on the training split and scored on the test
    split.

    Because of ``drop_first=True`` the first level of every categorical column
    has no dummy column of its own. Requesting such a level (or any other name
    that is not a column after encoding) cannot change the model, so a
    ``UserWarning`` is issued and the model is fitted on all predictors,
    instead of silently returning the full-model metrics.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Un-encoded splits; both must contain a 'Credit.amount' column.
    level_to_exclude : str
        Name of the encoded column to drop, e.g. ``'Saving.accounts_rich'``.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)`` computed on the test split.
    """
    import warnings  # local import so this cell does not depend on another cell

    target = 'Credit.amount'

    # Encode each split on its own, then keep only the columns both share.
    train_encoded = pd.get_dummies(train_data, drop_first=True)
    test_encoded = pd.get_dummies(test_data, drop_first=True)
    train_encoded, test_encoded = train_encoded.align(test_encoded, join='inner', axis=1)

    # The target is never a predictor; the requested level is dropped only
    # when it really exists after encoding.
    dropped_columns = [target]
    if level_to_exclude in train_encoded.columns:
        dropped_columns.append(level_to_exclude)
    else:
        warnings.warn(
            f"{level_to_exclude!r} is not a column after encoding "
            "(it may be the reference level removed by drop_first=True); "
            "fitting on all predictors instead.",
            UserWarning,
            stacklevel=2,
        )

    X_train = train_encoded.drop(columns=dropped_columns)
    X_test = test_encoded.drop(columns=dropped_columns)
    y_train = train_encoded[target]
    y_test = test_encoded[target]

    # Fit on the training split and score the predictions for the test split.
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return mse, rmse, r2

# Fit one model per 'Saving.accounts' level and store its test metrics under
# the level's dummy-column name.
# NOTE(review): the output recorded for this cell (R2 around 0.785) is far from
# the roughly 0.52 recorded for the other regression cells, so `train_df` /
# `test_df` probably held different data when it last ran; re-run on a fresh
# kernel to confirm.
regression_results = {}
for level in saving_account_levels:
    mse, rmse, r2 = regression_excluding_savings_level(train_df, test_df, level)
    regression_results[level] = dict(zip(('MSE', 'RMSE', 'R2'), (mse, rmse, r2)))

regression_results


{'Saving.accounts_quite rich': {'MSE': 2065433.6064038028,
  'RMSE': 1437.1616493644,
  'R2': 0.7851411052357447},
 'Saving.accounts_moderate': {'MSE': 2066871.836441262,
  'RMSE': 1437.661933989094,
  'R2': 0.7849914918493316},
 'Saving.accounts_little': {'MSE': 2067952.1809543865,
  'RMSE': 1438.0376145825903,
  'R2': 0.7848791078795274},
 'Saving.accounts_rich': {'MSE': 2069857.3796213875,
  'RMSE': 1438.6998921322638,
  'R2': 0.7846809175922053}}