In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import clone
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the "credit-g" dataset (version 1) from OpenML as a pandas frame.
print("Downloading the German Credit dataset from OpenML...")
credit = fetch_openml(name="credit-g", version=1, as_frame=True)
# Work on a copy so the fetched Bunch's own frame is never modified.
df = credit.frame.copy()
print(f"Download complete. Shape = {df.shape}")
print()

# Quick look at the schema and the class balance of the target column.
print(f"Columns: {df.columns.tolist()}")
print("Target value counts:", df["class"].value_counts(), sep="\n ")
print()

# Split the frame into features and a binary target (1 = "good", 0 = "bad").
X = df.drop(columns="class")
# A vectorised comparison yields a plain integer Series; no per-row lambda needed.
y = (df["class"] == "good").astype(int)

# Route columns to the scaler or the one-hot encoder by dtype.
# "number" covers every numeric width, not just int64/float64.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Surface any column that landed in neither list instead of losing it silently.
unassigned_cols = [
    col for col in X.columns
    if col not in numeric_cols and col not in categorical_cols
]
if unassigned_cols:
    print("Warning: columns in neither the numeric nor the categorical list:", unassigned_cols)

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)
print()

# Numeric features: standardise.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical features: one-hot encode, dropping the first level of each
# feature and ignoring categories not seen during fit.
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore", drop="first"))]
)

# Apply each sub-pipeline to its own column list.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
print(f"Training set shape: {X_train.shape} Test set shape: {X_test.shape}")
print()

# Give each pipeline its own copy of the (unfitted) preprocessor.
# Pipeline fits its steps in place, so sharing one ColumnTransformer instance
# would make fitting the second pipeline refit the transformer embedded in the
# first. `preprocessor` itself is only used as a template here.
logreg_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", RandomForestClassifier(
            n_estimators=100, random_state=42, n_jobs=-1
        )),
    ]
)

def fit_save_and_report(name, pipeline, model_path):
    """Fit a pipeline on the training split, persist it, and print test metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    name : str
        Human-readable model name used in the printed messages.
    pipeline : sklearn.pipeline.Pipeline
        Pipeline to fit; it is fitted in place.
    model_path : str
        File the fitted pipeline is written to with ``joblib.dump``.

    Returns
    -------
    tuple
        ``(y_pred, accuracy)`` for the test split.
    """
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train)
    joblib.dump(pipeline, model_path)
    # Report the path that was actually written, not a hard-coded name.
    print(f"Saved {name} pipeline to '{model_path}'.")

    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(f"\nClassification Report ({name}):")
    # The target is encoded 0 = bad, 1 = good, so the names follow label order.
    print(classification_report(y_test, y_pred, target_names=["bad", "good"]))
    print(f"Confusion Matrix ({name}):")
    print(confusion_matrix(y_test, y_pred))
    print("=" * 60)
    return y_pred, accuracy


y_pred_logreg, acc_logreg = fit_save_and_report(
    "Logistic Regression", logreg_pipeline, "task1_logreg.pkl"
)

y_pred_rf, acc_rf = fit_save_and_report(
    "Random Forest", rf_pipeline, "task1_randfor.pkl"
)

# Pull the fitted forest and the fitted one-hot encoder out of the RF pipeline.
rf_steps = rf_pipeline.named_steps
rf_model = rf_steps["classifier"]
ohe = rf_steps["preprocessor"].named_transformers_["cat"].named_steps["onehot"]

# Rebuild the transformed feature names: numeric columns first, then the
# one-hot columns, mirroring the transformer order in the ColumnTransformer.
cat_feature_names = ohe.get_feature_names_out(categorical_cols)
all_feature_names = [*numeric_cols, *cat_feature_names]

# Rank features by the forest's importance scores, highest first.
importances = rf_model.feature_importances_
feat_imp = pd.Series(data=importances, index=all_feature_names)
feat_imp = feat_imp.sort_values(ascending=False)

print("\nTop 10 Feature Importances (Random Forest):")
print(feat_imp.head(10))

Downloading the German Credit dataset from OpenML...
Download complete. Shape = (1000, 21)

Columns: ['checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'age', 'other_payment_plans', 'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone', 'foreign_worker', 'class']
Target value counts:
 class
good    700
bad     300
Name: count, dtype: int64

Numeric columns: ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
Categorical columns: ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']

Training set shape: (800, 20) Test set shape: (200, 20)

Training Logistic Regression...
Saved Logistic

In [None]:
# NOTE: joblib.load unpickles the file contents, so only load artifacts you
# trust (here: the files written by the training cell).
print("Loading pipelines from disk...")
logreg_pipeline, rf_pipeline = (
    joblib.load(model_path)
    for model_path in ("task1_logreg.pkl", "task1_randfor.pkl")
)
print("Done.\n")

# One hypothetical applicant, one value per training feature column.
# `credit_amount` was missing from this dict although it is one of the numeric
# training columns; the value below is illustrative only.
new_applicant = {
    "checking_status": ["<0"],
    "duration": [24],
    "credit_history": ["existing paid"],
    "purpose": ["new car"],
    "credit_amount": [3000],
    "savings_status": ["500<=X<1000"],
    "employment": ["<1"],
    "installment_commitment": [2],
    "personal_status": ["male single"],
    "other_parties": ["none"],
    "residence_since": [3],
    "property_magnitude": ["real estate"],
    "age": [35],
    "other_payment_plans": ["none"],
    "housing": ["own"],
    "existing_credits": [1],
    "job": ["skilled"],
    "num_dependents": [1],
    "own_telephone": ["yes"],
    "foreign_worker": ["yes"],
}

# Single-row frame in the shape the fitted pipelines expect.
new_X = pd.DataFrame(new_applicant)
def _predict_and_report(tag, pipeline, applicant):
    """Predict the class and P(good) for the first row of `applicant` and print both.

    Returns ``(predicted_label, probability_of_class_1)``.
    """
    label = pipeline.predict(applicant)[0]
    # Column 1 of predict_proba is taken as the probability of label 1 ("good").
    p_good = pipeline.predict_proba(applicant)[0, 1]
    verdict = "good" if label == 1 else "bad"
    print(f"[{tag}] → Predicted class:", verdict)
    print(f"[{tag}] → P(good):", f"{p_good:.4f}")
    return label, p_good


logreg_pred, logreg_proba = _predict_and_report(
    "LogisticRegression", logreg_pipeline, new_X
)
print()

rf_pred, rf_proba = _predict_and_report("RandomForest", rf_pipeline, new_X)


Loading pipelines from disk...
Done.

[LogisticRegression] → Predicted class: good
[LogisticRegression] → P(good): 0.5511

[RandomForest] → Predicted class: good
[RandomForest] → P(good): 0.6100
