# Generate Pair Plot

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load both datasets
public_df = pd.read_csv("public_data.csv")
private_df = pd.read_csv("private_data.csv")

# Drop the id column (if present) so it is not plotted as a feature
public_data = public_df.drop(columns=['id'], errors='ignore')
private_data = private_df.drop(columns=['id'], errors='ignore')


def save_pairplot(data, title, out_path):
    """Draw a seaborn pair plot of ``data`` and save it to ``out_path``.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to plot against each other.
    title : str
        Figure-level title placed above the grid.
    out_path : str
        Image file the figure is written to.
    """
    sns.pairplot(data, plot_kws={"s": 1})
    plt.suptitle(title, y=1.02)
    plt.tight_layout()
    # The title is anchored at y=1.02, i.e. above the figure's own bounding
    # box. Saving with bbox_inches="tight" expands the saved area to include
    # it; without it the title is cropped out of the image.
    plt.savefig(out_path, bbox_inches="tight")
    plt.close()


# One pair plot per dataset
save_pairplot(public_data, "Pairwise Feature Plots - Public Dataset", "public_pairplot.png")
save_pairplot(private_data, "Pairwise Feature Plots - Private Dataset", "private_pairplot.png")

print("Pairplots saved: public_pairplot.png, private_pairplot.png")


Pairplots saved: public_pairplot.png, private_pairplot.png


# Generate result visualization


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler

def visualize(dataset_name, data_file, label_file, prefix):
    """Save 2-D PCA and UMAP scatter plots of a dataset, coloured by label.

    Parameters
    ----------
    dataset_name : str
        Name used in the plot titles.
    data_file : str
        CSV with an 'id' column plus the feature columns.
    label_file : str
        CSV whose 'label' column supplies the point colours. Labels are
        matched to the rows of ``data_file`` by position (no join on id).
    prefix : str
        Output files are written as '<prefix>_pca.png' and '<prefix>_umap.png'.
    """

    def save_scatter(points, title, x_name, y_name, out_path):
        # Shared plotting recipe for both embeddings.
        plt.figure(figsize=(8, 6))
        plt.scatter(points[:, 0], points[:, 1], c=labels, cmap='tab10', s=10, alpha=0.8)
        plt.title(title)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.savefig(out_path, dpi=300)
        plt.close()

    # Load the features and the labels
    frame = pd.read_csv(data_file)
    labels = pd.read_csv(label_file)['label']

    # Feature preparation: drop the id, zero-variance columns and any column
    # containing NaN, then standardize.
    features = frame.drop(columns=['id'])
    has_spread = features.std() != 0
    features = features.loc[:, has_spread].dropna(axis=1)
    scaled = StandardScaler().fit_transform(features)

    # PCA projection
    pca_points = PCA(n_components=2).fit_transform(scaled)
    save_scatter(pca_points, f'{dataset_name} - PCA', 'PC1', 'PC2', f'{prefix}_pca.png')

    # UMAP projection (fixed seed)
    umap_points = umap.UMAP(n_components=2, random_state=42).fit_transform(scaled)
    save_scatter(umap_points, f'{dataset_name} - UMAP', 'UMAP1', 'UMAP2', f'{prefix}_umap.png')

# (dataset name, feature CSV, label CSV, output prefix) for each run.
# visualize() reads the 'label' column of each label CSV, so those files
# must already exist when this cell runs.
for run_args in (
    ("PUBLIC", "public_data.csv", "public_submission.csv", "public"),
    ("PRIVATE", "private_data.csv", "private_submission.csv", "private"),
):
    visualize(*run_args)


  from .autonotebook import tqdm as notebook_tqdm
  warn(
  warn(


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

IQR_THRESHOLD = 12  # IQR multiplier: values beyond Q1/Q3 -/+ IQR_THRESHOLD * IQR mark a row as an outlier
zscore_threshold = 5  # |z| limit: a row is kept only if every feature has |z| below this

# ========== Preprocessing Function ==========
def preprocess(df, feature_cols, method='iqr'):
    """Drop NaN rows, constant features and outlier rows, then standardize.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    feature_cols : list of str
        Candidate feature columns.
    method : str, default 'iqr'
        'iqr' drops rows with any feature outside
        [Q1 - IQR_THRESHOLD * IQR, Q3 + IQR_THRESHOLD * IQR];
        'zscore' drops rows with any |z| >= zscore_threshold;
        any other value disables outlier removal.

    Returns
    -------
    X_clean_scaled : numpy.ndarray
        Standardized features of the retained rows.
    df_clean : pandas.DataFrame
        Retained rows with all original columns.
    non_constant_cols : list of str
        Feature columns whose standard deviation is non-zero.
    scaler : StandardScaler
        Scaler fitted on the retained rows.
    """
    # dropna() already returns a new frame, so the caller's data is untouched.
    df = df.dropna()
    X = df[feature_cols]

    # Remove constant columns
    non_constant_cols = X.loc[:, X.std() != 0].columns.tolist()
    X = df[non_constant_cols]

    # Remove outliers
    if method == 'iqr':
        Q1 = X.quantile(0.25)
        Q3 = X.quantile(0.75)
        IQR = Q3 - Q1
        mask = ~((X < (Q1 - IQR_THRESHOLD * IQR)) | (X > (Q3 + IQR_THRESHOLD * IQR))).any(axis=1)
    elif method == 'zscore':
        z_scores = np.abs(zscore(X))
        mask = (z_scores < zscore_threshold).all(axis=1)
    else:
        # Keep every row. The mask must carry X's index: after dropna() the
        # index can have gaps, and a mask with a fresh 0..n-1 index cannot be
        # aligned with df when used as a boolean indexer.
        mask = pd.Series(True, index=X.index)

    df_clean = df[mask]
    X_clean = X[mask]  # same rows as df_clean, non-constant feature columns only

    # Standardize
    scaler = StandardScaler()
    X_clean_scaled = scaler.fit_transform(X_clean)

    return X_clean_scaled, df_clean, non_constant_cols, scaler

# ========== Clustering ==========
def run_kmeans(X, n_clusters):
    """Fit KMeans (random init, seed 42) on ``X`` and return the fitted model."""
    return KMeans(n_clusters=n_clusters, init="random", random_state=42).fit(X)

def evaluate_clustering(X, labels):
    """Score a clustering of ``X``.

    Returns a 3-tuple in this order: silhouette score,
    Calinski-Harabasz index, Davies-Bouldin index.
    """
    scorers = (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
    return tuple(scorer(X, labels) for scorer in scorers)

def visualize_clusters(X, labels, method, file_name):
    """Project ``X`` to two PCA components and save a label-coloured scatter.

    ``method`` is used in the plot title; the image is written to ``file_name``.
    """
    # Reduce to 2 dimensions with PCA
    projected = PCA(n_components=2).fit_transform(X)
    first_pc = projected[:, 0]
    second_pc = projected[:, 1]

    plt.figure(figsize=(6, 5))
    plt.scatter(first_pc, second_pc, c=labels, cmap='tab20', s=1)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.title(f"{method} Clustering")
    plt.tight_layout()
    plt.savefig(file_name)
    plt.close()

# ========== Load Data ==========
public_df = pd.read_csv("public_data.csv")
private_df = pd.read_csv("private_data.csv")

public_features = ['1', '2', '3', '4']
private_features = ['1', '2', '3', '4', '5', '6']

# ========== Preprocess ==========
X_pub_clean, df_pub_clean, pub_cols, scaler_pub = preprocess(public_df, public_features, method='iqr')
X_priv_clean, df_priv_clean, priv_cols, scaler_priv = preprocess(private_df, private_features, method='iqr')

# ========== Fit KMeans ==========
# The cluster count is derived from the number of retained features: 4 * n - 1.
n_pub_clusters = 4 * len(pub_cols) - 1
n_priv_clusters = 4 * len(priv_cols) - 1

print(f"Public Dataset Clusters: {n_pub_clusters}")
print(f"Private Dataset Clusters: {n_priv_clusters}")

# Models are fitted on the outlier-free, standardized rows only.
kmeans_pub = run_kmeans(X_pub_clean, n_pub_clusters)
kmeans_priv = run_kmeans(X_priv_clean, n_priv_clusters)

# Compute clustering metrics on the rows used for fitting
sil_pub, ch_pub, db_pub = evaluate_clustering(X_pub_clean, kmeans_pub.labels_)
sil_priv, ch_priv, db_priv = evaluate_clustering(X_priv_clean, kmeans_priv.labels_)

print("Public Dataset Clustering Evaluation:")
print(f"  Silhouette Score: {sil_pub:.4f}")
print(f"  Calinski-Harabasz Index: {ch_pub:.4f}")
print(f"  Davies-Bouldin Index: {db_pub:.4f}\n")

print("Private Dataset Clustering Evaluation:")
print(f"  Silhouette Score: {sil_priv:.4f}")
print(f"  Calinski-Harabasz Index: {ch_priv:.4f}")
print(f"  Davies-Bouldin Index: {db_priv:.4f}\n")

# ========== Predict on ALL data ==========
# Rows excluded from fitting still receive a label: the scaler fitted on the
# clean rows is applied to every row of the raw frame.
# NOTE(review): preprocess() drops NaN rows but the raw frames are used here;
# presumably predict() rejects NaN features — confirm the data has none.
X_pub_all = scaler_pub.transform(public_df[pub_cols])
X_priv_all = scaler_priv.transform(private_df[priv_cols])

labels_pub_all = kmeans_pub.predict(X_pub_all)
labels_priv_all = kmeans_priv.predict(X_priv_all)

public_df['label'] = labels_pub_all
private_df['label'] = labels_priv_all

# ========== Save ==========
public_df[['id', 'label']].to_csv("public_submission.csv", index=False)
private_df[['id', 'label']].to_csv("private_submission.csv", index=False)

# ========== Visualization ==========
sns.pairplot(df_pub_clean.drop(columns='id', errors='ignore'), plot_kws={"s": 1})
plt.suptitle("Pairwise Feature Plots - Public Dataset", y=1.02)
plt.tight_layout()
# The title is anchored above the figure box (y=1.02); bbox_inches="tight"
# keeps it inside the saved image instead of being cropped.
plt.savefig("public_pairplot.png", bbox_inches="tight")
plt.close()

visualize_clusters(X_pub_all, labels_pub_all, "KMeans - Public", "public_kmeans.png")

print("Clustering completed. Files saved:")
print("- public_submission.csv")
print("- private_submission.csv")
print("- public_pairplot.png")
print("- public_kmeans.png")


Public Dataset Clusters: 15
Private Dataset Clusters: 23
Public Dataset Clustering Evaluation:
  Silhouette Score: 0.6101
  Calinski-Harabasz Index: 37758.7347
  Davies-Bouldin Index: 0.7810

Private Dataset Clustering Evaluation:
  Silhouette Score: 0.5249
  Calinski-Harabasz Index: 114074.8280
  Davies-Bouldin Index: 0.8592

Clustering completed. Files saved:
- public_submission.csv
- private_submission.csv
- public_pairplot.png
- public_kmeans.png


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# ========== Clustering ==========
def run_kmeans(X, n_clusters):
    """Return a KMeans model (random init, seed 42) fitted on ``X``."""
    estimator = KMeans(n_clusters=n_clusters, init="random", random_state=42)
    return estimator.fit(X)

def visualize_clusters(X, labels, title, file_name):
    """Scatter the first two columns of ``X`` coloured by ``labels``.

    The figure is titled ``title`` and written to ``file_name``.
    """
    plt.figure(figsize=(6, 5))
    horizontal = X[:, 0]
    vertical = X[:, 1]
    plt.scatter(horizontal, vertical, c=labels, cmap='tab20', s=1)
    plt.xlabel("Feature 2 (scaled)")
    plt.ylabel("Feature 3 (scaled)")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(file_name)
    plt.close()

# ========== Load Data ==========
# Read the public and private tables in one pass.
public_df, private_df = (
    pd.read_csv(csv_path) for csv_path in ("public_data.csv", "private_data.csv")
)

# Only these two feature columns are used for clustering in this cell.
selected_features = ['2', '3']

# ========== Preprocessing ==========
def preprocess(df, selected_features, scale_factor=2):
    """Extract the selected features, weight the first two, and standardize.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    selected_features : list of str
        Feature columns to use; rows with NaN in any of them are dropped.
    scale_factor : float, default 2
        Multiplier applied to the first two selected columns before
        standardization.

    Returns
    -------
    X_scaled : numpy.ndarray
        Standardized feature matrix of the retained rows.
    scaler : StandardScaler
        Scaler fitted on the weighted features.
    """
    df = df.dropna(subset=selected_features)
    # Take an explicit writable float copy. `.values` may hand back a view
    # that is read-only (pandas Copy-on-Write), and an integer array cannot be
    # multiplied in place by a non-integer factor.
    X = df[selected_features].to_numpy(dtype=float, copy=True)
    X[:, 0] *= scale_factor  # weight the first selected feature
    X[:, 1] *= scale_factor  # weight the second selected feature
    # NOTE(review): standardization rescales each column to unit variance, so
    # a constant per-column multiplier applied beforehand is presumably
    # cancelled out — confirm the weighting has the intended effect.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

# Public dataset: standardize features 2 & 3, cluster into 15 groups, write
# the id/label submission file and a scatter plot.
# NOTE(review): preprocess() drops rows with NaN in the selected features,
# while the labels are assigned to the full frame below; this assumes no row
# was dropped — otherwise the lengths would differ. Confirm against the data.
X_pub, scaler_pub = preprocess(public_df, selected_features, scale_factor=2)
kmeans_pub = run_kmeans(X_pub, n_clusters=15)
labels_pub = kmeans_pub.predict(X_pub)
public_df['label'] = labels_pub
public_df[['id', 'label']].to_csv("public_submission.csv", index=False)
visualize_clusters(X_pub, labels_pub, "KMeans on Scaled Features 2 & 3 - Public", "public_kmeans.png")

# Private dataset: same pipeline with 23 clusters; the fitted scaler is not
# needed afterwards, hence the discarded second return value.
X_priv, _ = preprocess(private_df, selected_features, scale_factor=2)
kmeans_priv = run_kmeans(X_priv, n_clusters=23)
labels_priv = kmeans_priv.predict(X_priv)
private_df['label'] = labels_priv
private_df[['id', 'label']].to_csv("private_submission.csv", index=False)
visualize_clusters(X_priv, labels_priv, "KMeans on Scaled Features 2 & 3 - Private", "private_kmeans.png")

# Summary of the files written above
print("Clustering complete.")
print("- public_submission.csv\n- private_submission.csv\n- public_kmeans.png\n- private_kmeans.png")


Clustering complete.
- public_submission.csv
- private_submission.csv
- public_kmeans.png
- private_kmeans.png


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering

# ========== Clustering (Spectral) ========== 
def run_spectral(X, n_clusters):
    """Cluster ``X`` with spectral clustering and return the label array.

    Uses a nearest-neighbours affinity graph, k-means label assignment and a
    fixed random seed.
    """
    spectral_options = dict(
        affinity='nearest_neighbors',
        n_neighbors=15,  # tunable
        assign_labels='kmeans',
        random_state=42,
    )
    return SpectralClustering(n_clusters=n_clusters, **spectral_options).fit_predict(X)

def visualize_clusters(X, labels, title, file_name):
    """Save a scatter of the first two columns of ``X`` coloured by ``labels``.

    ``title`` becomes the plot title; the image is written to ``file_name``.
    """
    plt.figure(figsize=(6, 5))
    col_a = X[:, 0]
    col_b = X[:, 1]
    plt.scatter(col_a, col_b, c=labels, cmap='tab20', s=1)
    plt.xlabel("Feature 2")
    plt.ylabel("Feature 3")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(file_name)
    plt.close()

# ========== Load Data ==========
# Read the public and private tables in one pass.
public_df, private_df = (
    pd.read_csv(csv_path) for csv_path in ("public_data.csv", "private_data.csv")
)

# Cluster on feature columns 2 and 3 only
selected_features = ['2', '3']

# ========== Preprocessing ==========
def preprocess_simple(df, selected_features):
    """Standardize the selected feature columns of ``df``.

    Rows with NaN in any selected feature are dropped first. Returns the
    standardized matrix together with the fitted scaler.
    """
    complete_rows = df.dropna(subset=selected_features)
    scaler = StandardScaler()
    standardized = scaler.fit_transform(complete_rows[selected_features])
    return standardized, scaler

# Standardize features 2 & 3 of the public dataset.
# NOTE(review): preprocess_simple() drops rows with NaN in the selected
# features, while the labels are assigned to the full frame below; this
# assumes no row was dropped — otherwise the lengths would differ. Confirm.
X_pub, scaler_pub = preprocess_simple(public_df, selected_features)

# ========== Spectral Clustering ==========
k_pub = 15  # per the requirements, the public dataset should be split into 15 clusters

labels_pub = run_spectral(X_pub, k_pub)
public_df['label'] = labels_pub

# ========== Save ==========
# Write the id/label pairs as the public submission file.
public_df[['id', 'label']].to_csv("public_submission.csv", index=False)

# ========== Visualization ==========
visualize_clusters(X_pub, labels_pub, "Spectral Clustering on Features 2 & 3 - Public", "public_spectral.png")

# Summary of the files written above
print("Spectral clustering done using Feature 2 & 3.")
print("- public_submission.csv")
print("- public_spectral.png")




Spectral clustering done using Feature 2 & 3.
- public_submission.csv
- public_spectral.png
