In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Using cached ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Using cached ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


# 2.載入資料集

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch the UCI dataset with id 222 (Bank Marketing, per the printed metadata).
bank_marketing = fetch_ucirepo(id=222)

# Feature table and target table (pandas DataFrames).
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Dataset-level metadata, followed by the per-variable description table.
print(bank_marketing.metadata)
print(bank_marketing.variables)


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

# 3.將 y 轉成 0/1（二元分類）

In [3]:
# Encode the target column as 1 ("yes") / 0 ("no").
# Read from the dataset object instead of from `y` itself so the cell can be
# re-run: after the first run `y` is already a Series and `y["y"]` would fail.
y = bank_marketing.data.targets["y"].map({"yes": 1, "no": 0})

# Any label other than "yes"/"no" would have been mapped to NaN; fail loudly.
assert y.notna().all(), "target column contains labels other than 'yes'/'no'"

# 4.分類數值特徵 vs 類別特徵

In [4]:
# Partition the feature columns by dtype:
# int64/float64 columns are treated as numeric, object columns as categorical.
numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_features = list(X.select_dtypes(include=["object"]).columns)

print("數值特徵 (numeric features):", numeric_features, sep="\n")
print("\n類別特徵 (categorical features):", categorical_features, sep="\n")


數值特徵 (numeric features):
['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous']

類別特徵 (categorical features):
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


In [5]:
# The dtype-based split files day_of_week under the numeric features because it
# is stored as a number. Scaling it like a quantity would make the SVM read the
# codes as ordered magnitudes, so the feature lists are set by hand here and
# day_of_week is moved to the one-hot encoded group.
# NOTE(review): this assumes the column holds weekday codes — confirm against
# the actual values in X["day_of_week"].
categorical_features = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "poutcome",
    "day_of_week",
]

numeric_features = [
    "age",
    "balance",
    "duration",
    "campaign",
    "pdays",
    "previous",
]

print("修正後的 numeric features:", numeric_features)
print("修正後的 categorical features:", categorical_features)


修正後的 numeric features: ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
修正後的 categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'day_of_week']


# 5.前處理 Pipeline（StandardScaler + OneHotEncoder）

In [6]:
import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing:
#   "num" -> standardize the numeric feature columns
#   "cat" -> one-hot encode the categorical feature columns
#            (handle_unknown="ignore" is set for categories not seen during fit)
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])


# 6.Train-Test Split（切分訓練/測試資料）

In [7]:
from sklearn.model_selection import train_test_split

# Hold-out split: 20% of the rows go to the test set, stratified on the target,
# with a fixed seed so the split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for label, frame in (("訓練資料大小:", X_train), ("測試資料大小:", X_test)):
    print(label, frame.shape)


訓練資料大小: (36168, 16)
測試資料大小: (9043, 16)


# 7.建立 SVM Pipeline（前處理 + 模型）

In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

def get_pipeline(kernel):
    """Build an unfitted preprocessing + SVM pipeline for one kernel.

    Parameters
    ----------
    kernel : str
        Kernel name forwarded to ``SVC(kernel=...)`` (this notebook uses
        "linear", "rbf" and "poly").

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: "preprocess" (a fresh copy of the module-level
        ``preprocess`` transformer) and "svc" (the SVM classifier).
    """
    from sklearn.base import clone

    # Clone the shared transformer so every pipeline owns its own instance;
    # fitting one pipeline directly must not leak fitted state into the others.
    return Pipeline([
        ("preprocess", clone(preprocess)),
        ("svc", SVC(kernel=kernel)),
    ])


# 8.使用 GridSearchCV 比較多種核函數（linear / rbf / poly）

In [9]:
# Hyper-parameter grids, one per kernel. Keys use the "<step>__<param>" form
# so they address the "svc" step of the pipeline.

# Linear kernel: regularization strength only.
param_linear = dict(svc__C=[0.1, 1, 10, 100])

# RBF kernel: regularization strength and kernel coefficient.
param_rbf = dict(
    svc__C=[1, 10, 100],
    svc__gamma=["scale", 0.01, 0.1, 1],
)

# Polynomial kernel: regularization strength, kernel coefficient and degree.
param_poly = dict(
    svc__C=[1, 10],
    svc__gamma=["scale", 0.01, 0.1],
    svc__degree=[2, 3, 4],
)


In [10]:
from sklearn.model_selection import GridSearchCV

# One grid search per kernel. All three share the same settings
# (5-fold CV, F1 scoring, n_jobs=-1) and differ only in pipeline and grid.
grids = {
    kernel: GridSearchCV(
        estimator=get_pipeline(kernel),
        param_grid=kernel_grid,
        cv=5,
        scoring="f1",
        n_jobs=-1,
    )
    for kernel, kernel_grid in [
        ("linear", param_linear),
        ("rbf", param_rbf),
        ("poly", param_poly),
    ]
}


In [11]:
from sklearn.model_selection import train_test_split

# Stratified 8000-row subsample of the training split, used as the data for the
# grid search (presumably to keep the SVM search time manageable).
X_train_small, _, y_train_small, _ = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    train_size=8000,
    random_state=42,
)

n_search_rows = X_train_small.shape[0]
print("用來做 GridSearch 的資料筆數:", n_search_rows)


用來做 GridSearch 的資料筆數: 8000


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Per-kernel outcome: best hyper-parameters plus test-set metrics.
results = {}

for name, grid in grids.items():
    print(f"\n=== Training {name.upper()} kernel (using small train set) ===")

    # The search is fitted on the small training subsample only ...
    grid.fit(X_train_small, y_train_small)

    # ... and then scored on the full held-out test set.
    y_pred = grid.predict(X_test)

    results[name] = dict(
        best_params=grid.best_params_,
        accuracy=accuracy_score(y_test, y_pred),
        f1=f1_score(y_test, y_pred),
        report=classification_report(y_test, y_pred),
    )



=== Training LINEAR kernel (using small train set) ===


Exception ignored in: <function ResourceTracker.__del__ at 0x103145b20>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10f9adb20>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes



=== Training RBF kernel (using small train set) ===

=== Training POLY kernel (using small train set) ===


Exception ignored in: <function ResourceTracker.__del__ at 0x104c49b20>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x109ee9b20>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10ec61b20>
Traceback (most recent call last

In [None]:
# Full per-kernel summary, including the classification report text.
for name, r in results.items():
    summary_lines = [
        "\n===============================",
        f"Kernel: {name.upper()}",
        f"Best Parameters: {r['best_params']}",
        f"Accuracy: {r['accuracy']}",
        f"F1 Score: {r['f1']}",
        f"Report:\n {r['report']}",
    ]
    print("\n".join(summary_lines))


In [None]:
# Compact comparison of all kernels, with metrics rounded to 4 decimals.
print("=== SVM Kernel Comparison ===")
for name, r in results.items():
    accuracy_4dp = round(r["accuracy"], 4)
    f1_4dp = round(r["f1"], 4)

    print("\n--------------------------------")
    print(f"Kernel: {name.upper()}")
    print("Best Parameters:", r["best_params"])
    print("Accuracy:", accuracy_4dp)
    print("F1 Score:", f1_4dp)


In [None]:
# Plot the confusion matrix of the kernel with the highest F1 in `results`.
# `plt` was used here without being imported in any visible cell, which raises
# NameError on a fresh kernel; import it explicitly.
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): the F1 values in `results` are computed on the test set, so
# picking the winner here and reporting its test metrics is optimistic.
# Consider selecting on the cross-validated score (`grids[k].best_score_`).
best_kernel = max(results, key=lambda k: results[k]["f1"])
print("最佳 kernel:", best_kernel)

best_grid = grids[best_kernel]
y_pred = best_grid.predict(X_test)

# Draw on an explicit Axes so the title is attached to this figure only.
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ax.set_title(f"{best_kernel.upper()} Kernel - Confusion Matrix")
plt.show()


In [None]:
# Print the classification report stored for the best kernel.
best_report = results[best_kernel]["report"]
print("\n=== Classification Report (Best Kernel) ===\n")
print(best_report)
