# **Clone Repository & Setup Project**

In [11]:
# ============================
# Project Setup
# ============================

!git clone https://github.com/Rodexxx24/credit-scoring-ml.git

Cloning into 'credit-scoring-ml'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 66 (delta 17), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (66/66), 145.04 KiB | 4.53 MiB/s, done.
Resolving deltas: 100% (17/17), done.


# **Enter the Project Directory**

In [12]:
# Make the cloned repository the kernel's working directory; the relative
# paths used by later cells (data/raw/..., data/processed/..., models/...)
# are resolved against it.
%cd credit-scoring-ml

/content/credit-scoring-ml


# **Verify Folder Structure**

In [13]:
# List the top-level entries of the current directory to confirm the
# checkout contains the expected folders.
!ls

data  notebooks  README.md  requirements.txt  results  src


# **Setup Environment**

In [14]:
# ============================
# Preprocessing - Setup
# ============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

# **Load Dataset**

In [15]:
# ============================
# Load Dataset
# ============================

# Column labels applied to the headerless file, in file order.
# The final entry ("Risk") is the column later used as the target.
columns = [
    "Status_Checking_Account",
    "Duration",
    "Credit_History",
    "Purpose",
    "Credit_Amount",
    "Savings_Account",
    "Employment_Duration",
    "Installment_Rate",
    "Personal_Status_Sex",
    "Other_Debtors",
    "Residence_Since",
    "Property",
    "Age",
    "Other_Installment_Plans",
    "Housing",
    "Number_Credits",
    "Job",
    "People_Maintained",
    "Telephone",
    "Foreign_Worker",
    "Risk",
]

# Path is relative to the repository root (the current working directory).
data_path = "data/raw/german_credit_uci.data"

# The file has no header row and separates fields with runs of whitespace.
read_options = {
    "names": columns,
    "header": None,
    "sep": r"\s+",
    "engine": "python",
}
df = pd.read_csv(data_path, **read_options)

print(f"Dataset Shape: {df.shape}")
df.head()

Dataset Shape: (1000, 21)


Unnamed: 0,Status_Checking_Account,Duration,Credit_History,Purpose,Credit_Amount,Savings_Account,Employment_Duration,Installment_Rate,Personal_Status_Sex,Other_Debtors,...,Property,Age,Other_Installment_Plans,Housing,Number_Credits,Job,People_Maintained,Telephone,Foreign_Worker,Risk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


# **Separate Features & Target**

In [16]:
# ============================
# Feature - Target Split
# ============================

# Features: every column except the label.
X = df.drop(columns="Risk")

# Target: the raw label column, still in its original coding at this point.
y = df["Risk"]

print(f"X shape: {X.shape}")
# Header line followed by the per-class row counts.
print("y distribution:", y.value_counts(), sep="\n")

X shape: (1000, 20)
y distribution:
Risk
1    700
2    300
Name: count, dtype: int64


# **Encoding Target Variable**

In [17]:
# ============================
# Target Encoding
# ============================

# Recode the raw labels {1, 2} to {0, 1}.
# NOTE(review): presumably 1 = good credit and 2 = bad credit per the
# dataset documentation — confirm before interpreting class 1 as "bad".
#
# The encoding is derived from the untouched df["Risk"] column rather than
# from `y` itself, so re-running this cell gives the same result. Mapping an
# already-encoded `y` would turn every 0 into NaN, because 0 is not a key.
label_map = {1: 0, 2: 1}
y = df["Risk"].map(label_map)

# Series.map yields NaN for values missing from the mapping; fail loudly
# instead of passing NaN labels into the train/test split.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), "Risk"].unique().tolist())
    raise ValueError(f"Unexpected values in 'Risk' column: {unexpected}")

# No NaN remains, so the labels can be stored as plain integers.
y = y.astype("int64")

print("Encoded target distribution:")
print(y.value_counts())

Encoded target distribution:
Risk
0    700
1    300
Name: count, dtype: int64


# **Identify Feature Types**

In [18]:
# ============================
# Feature Type Identification
# ============================

# Split the feature columns into two complementary groups: numeric dtypes
# and everything else. Using include/exclude on the same "number" selector
# guarantees every column of X lands in exactly one list. Hard-coding
# "object" / "int64" / "float64" would leave out columns of any other dtype
# (e.g. int32, bool, category), and the ColumnTransformer built from these
# lists would then drop them via its default remainder="drop".
numerical_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(exclude=["number"]).columns.tolist()

# Sanity check: the two lists together must account for every feature.
assert len(numerical_features) + len(categorical_features) == X.shape[1], (
    "feature lists do not cover all columns of X"
)

print("Categorical Features:", len(categorical_features))
print(categorical_features)

print("\nNumerical Features:", len(numerical_features))
print(numerical_features)

Categorical Features: 13
['Status_Checking_Account', 'Credit_History', 'Purpose', 'Savings_Account', 'Employment_Duration', 'Personal_Status_Sex', 'Other_Debtors', 'Property', 'Other_Installment_Plans', 'Housing', 'Job', 'Telephone', 'Foreign_Worker']

Numerical Features: 7
['Duration', 'Credit_Amount', 'Installment_Rate', 'Residence_Since', 'Age', 'Number_Credits', 'People_Maintained']


# **Preprocessing Pipeline**

In [19]:
# ============================
# Preprocessing Pipeline
# ============================

# Numeric columns: standardise with StandardScaler.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical columns: one-hot encode, dropping the first level of each
# feature.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category
# not seen during fit is presumably encoded as all zeros, the same as the
# dropped first level — confirm this is acceptable for scoring.
one_hot = OneHotEncoder(drop="first", handle_unknown="ignore")
categorical_transformer = Pipeline([("encoder", one_hot)])

# Route each column group to its transformer. Columns not listed in either
# group are not passed through (ColumnTransformer default).
column_blocks = [
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_blocks)

# **Train-Test Split**

In [20]:
# ============================
# Train-Test Split
# ============================

# 80/20 split with a fixed seed; stratifying on y keeps the class
# proportions the same in both partitions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_name, split_frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} shape:", split_frame.shape)

Train shape: (800, 20)
Test shape: (200, 20)


# **Transform Dataset**

In [21]:
# ============================
# Apply Preprocessing
# ============================

# Fit the transformers on the training split only, then reuse the fitted
# preprocessor on the test split so no test-set statistics are learned.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed Train Shape: {X_train_processed.shape}")
print(f"Processed Test Shape: {X_test_processed.shape}")

Processed Train Shape: (800, 48)
Processed Test Shape: (200, 48)


# **Save Results**

In [23]:
# ============================
# Save Preprocessed Data
# ============================

import os

import joblib

# Make sure both output folders exist before writing into them.
for output_dir in ("models", "data/processed"):
    os.makedirs(output_dir, exist_ok=True)

# Fitted preprocessor plus the transformed splits and their labels, each
# paired with its destination path.
artifacts = [
    (preprocessor, "models/preprocessor.joblib"),
    (X_train_processed, "data/processed/X_train.joblib"),
    (X_test_processed, "data/processed/X_test.joblib"),
    (y_train, "data/processed/y_train.joblib"),
    (y_test, "data/processed/y_test.joblib"),
]

# joblib.dump returns the list of files it wrote; collect them in order.
written_files = [
    written
    for artifact, destination in artifacts
    for written in joblib.dump(artifact, destination)
]

# Show the last file written as the cell output.
written_files[-1:]

['data/processed/y_test.joblib']

In [24]:
import os

# Also export the processed splits as plain .npy files.
os.makedirs("data/processed", exist_ok=True)

# ColumnTransformer can return a scipy sparse matrix when the stacked output
# is sparse enough. np.save would store that as a pickled object array, which
# np.load refuses without allow_pickle=True. Densify first so the files
# always hold ordinary numeric arrays; dense inputs are saved unchanged.
feature_splits = (
    ("X_train_processed", X_train_processed),
    ("X_test_processed", X_test_processed),
)
for file_stem, matrix in feature_splits:
    dense_matrix = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    np.save(f"data/processed/{file_stem}.npy", dense_matrix)

# Labels are pandas Series; store their underlying NumPy values.
np.save("data/processed/y_train.npy", y_train.to_numpy())
np.save("data/processed/y_test.npy", y_test.to_numpy())