In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [27]:
# Column labels for the 20 applicant attributes followed by the target column.
column_names = [
    "checking_account", "duration", "credit_history", "purpose", "credit_amount",
    "savings_account", "employment", "installment_rate", "personal_status",
    "other_debtors", "residence_since", "property", "age", "other_installment_plans",
    "housing", "number_credits", "job", "people_liable", "telephone", "foreign_worker",
    "creditability",
]

# The file is space-delimited and has no header row, so the labels are supplied
# explicitly; quoting=3 (csv.QUOTE_NONE) turns off quote-character handling.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
df = pd.read_csv(url, sep=" ", header=None, names=column_names, quoting=3)

In [28]:
# Preprocess categorical variables
# Each categorical column holds string codes (e.g. 'A11'). A code is encoded as its
# position in the list below, so the resulting integers are 0, 1, 2, ... in list order.
CATEGORY_CODES = {
    'checking_account': ['A11', 'A12', 'A13', 'A14'],
    'credit_history': ['A30', 'A31', 'A32', 'A33', 'A34'],
    'purpose': ['A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49', 'A410'],
    'savings_account': ['A61', 'A62', 'A63', 'A64', 'A65'],
    'employment': ['A71', 'A72', 'A73', 'A74', 'A75'],
    'personal_status': ['A91', 'A92', 'A93', 'A94', 'A95'],
    'other_debtors': ['A101', 'A102', 'A103'],
    'property': ['A121', 'A122', 'A123', 'A124'],
    'other_installment_plans': ['A141', 'A142', 'A143'],
    'housing': ['A151', 'A152', 'A153'],
    'job': ['A171', 'A172', 'A173', 'A174'],
    'telephone': ['A191', 'A192'],
    'foreign_worker': ['A201', 'A202'],
}

for column, codes in CATEGORY_CODES.items():
    code_to_int = {code: position for position, code in enumerate(codes)}

    # Fail fast on values that are neither a known code nor an already-encoded
    # integer; otherwise they would pass through untouched as strings.
    allowed = set(code_to_int) | set(code_to_int.values())
    unexpected = set(df[column].unique()) - allowed
    if unexpected:
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(unexpected, key=str)}"
        )

    # map() with a pass-through lookup is used instead of replace(): it does not rely
    # on replace()'s implicit object-to-int downcasting (deprecated in recent pandas),
    # and values that are already integers are left as they are, so re-running this
    # cell does not corrupt the column.
    df[column] = df[column].map(lambda value, lookup=code_to_int: lookup.get(value, value))

In [29]:
# Separate the target column (y) from the remaining predictor columns (X)
target_column = 'creditability'
y = df[target_column]
X = df.drop(columns=[target_column])

In [30]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
partitions = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = partitions

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so the test rows do not influence the scaling
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [31]:
# Random forest configuration: 100 trees, seeded so repeated runs build the same forest
forest_params = {"n_estimators": 100, "random_state": 42}
rf = RandomForestClassifier(**forest_params)

# Fit the forest on the training partition
rf.fit(X_train, y_train)

In [32]:
# Predict labels for the held-out rows
y_pred = rf.predict(X_test)

# Compute each metric against the true test labels, then print them in order
evaluation = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading, result)

Accuracy: 0.815
Classification Report:
               precision    recall  f1-score   support

           1       0.82      0.94      0.88       141
           2       0.78      0.53      0.63        59

    accuracy                           0.81       200
   macro avg       0.80      0.73      0.75       200
weighted avg       0.81      0.81      0.80       200

Confusion Matrix:
 [[132   9]
 [ 28  31]]
