In [5]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.0 MB 762.0 kB/s eta 0:00:10
   --- ------------------------------------ 0.8/8.0 MB 1.2 MB/s eta 0:00:07
   ------ --------------------------------- 1.3/8.0 MB 1.4 MB/s eta 0:00:05
   ------ --------------------------------- 1.3/8.0 MB 1.4 MB/s eta 0:00:05
   --------- ------------------------------ 1.8/


[notice] A new release of pip is available: 26.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [9]:
# Read german_credit_data.csv (path is relative to the notebook's working directory).
data = pd.read_csv("german_credit_data.csv")

# Preview the first rows. Leaving the expression as the last line of the cell
# lets the notebook render the frame with its rich display instead of the
# plain-text dump produced by print().
data.head()

   laufkont  laufzeit  moral  verw  hoehe  sparkont  beszeit  rate  famges  \
0         1        18      4     2   1049         1        2     4       2   
1         1         9      4     0   2799         1        3     2       3   
2         2        12      2     9    841         2        4     2       2   
3         1        12      4     0   2122         1        3     3       3   
4         1        12      4     0   2171         1        3     4       3   

   buerge  ...  verm  alter  weitkred  wohn  bishkred  beruf  pers  telef  \
0       1  ...     2     21         3     1         1      3     2      1   
1       1  ...     1     36         3     1         2      3     1      1   
2       1  ...     1     23         3     1         1      2     2      1   
3       1  ...     1     39         3     1         2      2     1      1   
4       1  ...     2     38         1     2         2      2     2      1   

   gastarb  kredit  
0        2       1  
1        2       1  
2    

In [12]:
# Target vector: the "kredit" column.
y = data["kredit"]
# Feature matrix: every remaining column of the frame.
X = data.drop(columns="kredit")

In [13]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [14]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both splits. X_train and X_test are rebound to
# their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [15]:
# Fit a logistic regression (iteration cap raised to 1000) on the training split.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Scores from the second predict_proba column (index 1), plus hard labels.
# NOTE(review): presumably index 1 is the kredit == 1 class - confirm via lr.classes_.
y_prob_lr = lr.predict_proba(X_test)[:, 1]
y_pred_lr = lr.predict(X_test)


In [16]:
# Fit a 200-tree random forest with a fixed seed on the training split.
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)

# Scores from the second predict_proba column (index 1), plus hard labels.
# NOTE(review): presumably index 1 is the kredit == 1 class - confirm via rf.classes_.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)


In [17]:
def evaluate(model_name, y_test, y_pred, y_prob):
    """Print a short classification report for one model.

    Parameters
    ----------
    model_name : str
        Label printed as the header of the report.
    y_test : array-like
        True labels of the evaluation set.
    y_pred : array-like
        Predicted class labels; used for accuracy, precision, recall and F1.
    y_prob : array-like
        Scores handed to ``roc_auc_score`` (callers in this notebook pass
        the second column of ``predict_proba``).

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # The pushpin marker is written as a \U escape so the header stays intact
    # even if the file is saved or decoded with a non-UTF-8 codec; the literal
    # character had been garbled into mojibake.
    print(f"\n\U0001F4CC {model_name}")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    print("F1-Score :", f1_score(y_test, y_pred))
    print("ROC-AUC  :", roc_auc_score(y_test, y_prob))


In [18]:
# Report the same metrics for both fitted models on the held-out split.
evaluate(
    model_name="Logistic Regression",
    y_test=y_test,
    y_pred=y_pred_lr,
    y_prob=y_prob_lr,
)
evaluate(
    model_name="Random Forest",
    y_test=y_test,
    y_pred=y_pred_rf,
    y_prob=y_prob_rf,
)



📌 Logistic Regression
Accuracy : 0.77
Precision: 0.8012820512820513
Recall   : 0.8928571428571429
F1-Score : 0.8445945945945946
ROC-AUC  : 0.8085714285714287

📌 Random Forest
Accuracy : 0.795
Precision: 0.8113207547169812
Recall   : 0.9214285714285714
F1-Score : 0.862876254180602
ROC-AUC  : 0.8233333333333334


In [19]:
# Pair each of the forest's importance scores with the matching feature column name.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)

# Show the ten highest-scoring features, largest first.
(
    importances
    .sort_values(ascending=False)
    .head(n=10)
)


hoehe       0.131496
laufkont    0.109159
laufzeit    0.102934
alter       0.100028
verw        0.064057
moral       0.055082
sparkont    0.051747
beszeit     0.050368
verm        0.045394
rate        0.044115
dtype: float64