<a href="https://colab.research.google.com/github/Nohyunsun/ADHD-Self-Diagnosis-Across-Platforms/blob/main/ensemble_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 성능 향상
- TARGET_PER_CLASS를 800~1500으로 키워 성능을 더 끌어올림
- (엔진: TF-IDF(word+char) 듀얼 모델 + 소프트보팅 앙상블 → 검증셋에서 macro-F1을 올리도록 클래스별 확률 스케일을 탐욕적 튜닝으로 최적화)

=> 핵심: 단일 모델(+중립 편향)보다 소수 클래스 F1이 올라오도록 확률을 클래스별로 미세 조정했어.
데이터 자체의 난이도(문맥상 중의성, 부정어 등) 때문에 여전히 완벽하지는 않지만, 중립 쏠림은 뚜렷하게 완화되는 것이 정상적인 결과다.

In [None]:
# Fast retry: dual-model ensemble + greedy per-class scaling (3 choices per class), much faster than full grid.
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample
from caas_jupyter_tools import display_dataframe_to_user

# Working directory and input files. Every artifact produced further down is
# written into OUTDIR as well.
OUTDIR = "/mnt/data"
LABELED_XLSX = "/mnt/data/korean_sentiment_dataset.xlsx"
PLATFORMS = {
    "youtube": "/mnt/data/all_mapped_youtube.xlsx",
    "blog": "/mnt/data/all_mapped_blog.xlsx",
    "instagram": "/mnt/data/all_mapped_instagram.xlsx",
    "x": "/mnt/data/all_mapped_x.xlsx",
}
os.makedirs(OUTDIR, exist_ok=True)

# 1) Load & clean
# The two sheet columns arrive as "Unnamed: 1" / "Unnamed: 2" and are renamed
# to text / label. The .loc[1:] slice skips row label 0
# (presumably a header row stored inside the sheet -- TODO confirm).
valid = {"행복", "중립", "슬픔", "공포", "혐오", "분노", "놀람"}
column_names = {"Unnamed: 1": "text", "Unnamed: 2": "label"}

raw = pd.read_excel(LABELED_XLSX)
df = raw.loc[1:, list(column_names)]
df = df.rename(columns=column_names)
df = df.dropna(subset=["text", "label"])
df = df.astype(str)
# Keep only the rows whose label is one of the seven known emotions.
df = df[df["label"].isin(valid)].copy()

# 2) Split FIRST, then rebalance the training portion only.
# Small classes are topped up by sampling with replacement. If that happens
# before the split, copies of one sentence end up in both the training and the
# validation set and the validation scores of the oversampled classes are
# inflated. The validation rows are therefore carved out of the cleaned frame
# and never resampled; they keep the original class proportions.
TARGET_PER_CLASS = 1000
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

frames = []
for lab, grp in train_df.groupby("label"):
    if len(grp) >= TARGET_PER_CLASS:
        # Large class: down-sample without replacement.
        frames.append(grp.sample(n=TARGET_PER_CLASS, random_state=42))
    else:
        # Small class: keep every row once, then add bootstrap copies.
        extra = resample(
            grp, replace=True, n_samples=TARGET_PER_CLASS - len(grp), random_state=42
        )
        frames.append(pd.concat([grp, extra]))
dfb = pd.concat(frames).sample(frac=1.0, random_state=42).reset_index(drop=True)

# 3) Arrays consumed by the vectorizers / classifiers below.
Xtr_text, ytr_str = dfb["text"].values, dfb["label"].values
Xva_text, yva_str = val_df["text"].values, val_df["label"].values

# 5) Labels & per-sample weights
le = LabelEncoder()
ytr = le.fit_transform(ytr_str)
yva = le.transform(yva_str)
classes = np.unique(ytr)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=ytr)
sw = cw[ytr]  # one weight per training row, looked up by encoded label

# 4) + 6) Character n-gram branch (feature cap kept small for speed)
vec_char = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 5),
    min_df=3,
    max_df=0.995,
    sublinear_tf=True,
    max_features=15000,
)
Xtr_c = vec_char.fit_transform(Xtr_text)
Xva_c = vec_char.transform(Xva_text)
clf_c = SGDClassifier(loss="log_loss", alpha=4e-4, random_state=42)
# A single partial_fit call: the model sees the training matrix once.
clf_c.partial_fit(Xtr_c, ytr, classes=classes, sample_weight=sw)
proba_c = clf_c.predict_proba(Xva_c)

# 4) + 6) Word n-gram branch
vec_word = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.99,
    sublinear_tf=True,
    max_features=20000,
)
Xtr_w = vec_word.fit_transform(Xtr_text)
Xva_w = vec_word.transform(Xva_text)
clf_w = SGDClassifier(loss="log_loss", alpha=6e-4, random_state=42)
clf_w.partial_fit(Xtr_w, ytr, classes=classes, sample_weight=sw)
proba_w = clf_w.predict_proba(Xva_w)

# Soft vote on the validation set: 60% char model, 40% word model.
proba = 0.6*proba_c + 0.4*proba_w

# 7) Greedy per-class scaling (scales in {0.9, 1.0, 1.1})
choices = [0.9, 1.0, 1.1]
scales = np.ones(len(classes))


def eval_scales(s):
    """Return the validation macro-F1 obtained with per-class multipliers *s*.

    The blended validation probabilities ``proba`` are multiplied column-wise
    by ``s``, renormalised so each row sums to 1, and the arg-max class is
    scored against ``yva``.
    """
    scaled = proba * s[None, :]
    scaled = scaled / scaled.sum(axis=1, keepdims=True)
    return f1_score(yva, scaled.argmax(1), average="macro")

# Coordinate-wise greedy search: for one class at a time, try every value in
# `choices` while the other multipliers stay fixed, and keep the value that
# beats the current macro-F1 by more than 1e-4. Sweeps over all classes are
# repeated until a full sweep changes nothing.
base = eval_scales(scales)
improved = True
while improved:
    improved = False
    for cls_idx in range(len(classes)):
        current = scales[cls_idx]
        top_score, top_choice = base, current
        for candidate in choices:
            trial = scales.copy()
            trial[cls_idx] = candidate
            score = eval_scales(trial)
            if score > top_score + 1e-4:
                top_score, top_choice = score, candidate
        if top_choice == current:
            continue  # nothing better for this class
        scales[cls_idx] = top_choice
        base = top_score
        improved = True

# Final validation predictions with the tuned multipliers applied.
p_final = proba * scales[None, :]
p_final = p_final / p_final.sum(axis=1, keepdims=True)
yva_pred = p_final.argmax(1)

val_acc = accuracy_score(yva, yva_pred)
val_f1 = f1_score(yva, yva_pred, average="macro")
report_txt = classification_report(
    yva, yva_pred, target_names=le.classes_, digits=4, zero_division=0
)

# Text report: headline numbers, the chosen multipliers, then the per-class table.
report_lines = [
    f"Validation Accuracy: {val_acc:.4f}",
    f"Macro F1: {val_f1:.4f}",
    f"Class scales: {scales.tolist()}",
    "",
    report_txt,
]
with open(f"{OUTDIR}/ensemble_report.txt", "w", encoding="utf-8") as report_file:
    report_file.write("\n".join(report_lines))

# Confusion matrix, computed on the string labels so the axes show emotion names.
class_names = list(le.classes_)
cm = confusion_matrix(yva_str, le.inverse_transform(yva_pred), labels=class_names)
fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay(cm, display_labels=class_names).plot(
    ax=ax, xticks_rotation=45, colorbar=False
)
ax.set_title("Confusion Matrix (Greedy-scaled Ensemble)")
fig.tight_layout()
fig.savefig(f"{OUTDIR}/ensemble_confusion_matrix.png", dpi=150, bbox_inches="tight")
plt.close(fig)

# Persist everything needed to reproduce the ensemble's predictions:
# both classifiers, both vectorizers, the label encoder and the class multipliers.
artifacts = {
    "clf_char.pkl": clf_c,
    "clf_word.pkl": clf_w,
    "vec_char.pkl": vec_char,
    "vec_word.pkl": vec_word,
    "label_encoder.pkl": le,
}
for artifact_file, artifact in artifacts.items():
    joblib.dump(artifact, f"{OUTDIR}/{artifact_file}")
np.save(f"{OUTDIR}/class_scales.npy", scales)

# 8) Platform predictions with ensemble
def detect_text_col(d):
    """Return the label of the column in *d* that should be used as text.

    Preference order:
      1. the first column (in column order) whose lower-cased name is one of
         the known text-column names;
      2. otherwise the first column with object dtype;
      3. otherwise the first column of the frame.
    """
    known_names = {"text", "content", "body", "본문", "내용", "sentence", "utterance", "comment", "댓글"}
    for col in d.columns:
        if str(col).lower() in known_names:
            return col
    object_cols = [col for col in d.columns if d[col].dtype == 'O']
    if object_cols:
        return object_cols[0]
    return d.columns[0]

def predict_platform(path, name, limit=8000):
    """Classify the texts of one platform export with the char+word ensemble.

    Reads the Excel file at *path*, picks its text column with
    ``detect_text_col``, predicts an emotion for up to *limit* texts and
    writes three files into ``OUTDIR``: per-text predictions (CSV), the
    emotion distribution (CSV) and a bar chart of that distribution (PNG).

    Args:
        path: Excel file to read.
        name: Platform name; used in output file names and the chart title.
        limit: Maximum number of texts to classify (the first ones are kept).

    Returns:
        A dict with the platform name, the number of texts classified, the
        text column used and the paths of the three output files.

    Raises:
        ValueError: if the text column holds no non-empty text.
    """
    d = pd.read_excel(path)
    tc = detect_text_col(d)

    # dropna() only removes missing cells; whitespace-only cells would be
    # classified like real sentences and skew the distribution. Strip first,
    # discard the blanks, and only then apply the row limit so blanks do not
    # use up part of it.
    texts = d[tc].dropna().astype(str).str.strip()
    texts = texts[texts != ""].head(limit)
    if texts.empty:
        raise ValueError(f"no non-empty text rows in column {tc!r} of {path}")
    sub = pd.DataFrame({"text": texts})

    # Same blend and per-class multipliers as used on the validation set.
    text_list = sub["text"].tolist()
    pc = clf_c.predict_proba(vec_char.transform(text_list))
    pw = clf_w.predict_proba(vec_word.transform(text_list))
    p = (0.6*pc + 0.4*pw) * scales[None, :]
    p = p / p.sum(axis=1, keepdims=True)
    sub["pred_emotion"] = le.inverse_transform(p.argmax(1))

    dist = sub["pred_emotion"].value_counts().rename_axis("emotion").reset_index(name="count")
    dist["percent"] = (dist["count"] / dist["count"].sum() * 100).round(2)

    base = f"{OUTDIR}/ensemble_{name}"
    sub.to_csv(base + "_predictions.csv", index=False, encoding="utf-8-sig")
    dist.to_csv(base + "_distribution.csv", index=False, encoding="utf-8-sig")

    plt.figure(figsize=(6, 4))
    plt.bar(dist["emotion"], dist["count"])
    plt.title(f"{name}: Emotion Distribution (Ensemble)")
    plt.xlabel("Emotion")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(base + "_distribution.png", dpi=150, bbox_inches="tight")
    plt.close()

    return {"platform": name, "n_texts_used": len(sub), "text_col": tc,
            "predictions_csv": base + "_predictions.csv",
            "distribution_csv": base + "_distribution.csv",
            "chart_png": base + "_distribution.png"}

def _summarise_platform(platform, xlsx_path):
    """Run predict_platform for one platform and return its overview row.

    A missing file or any exception raised while predicting is reported as an
    ``error`` entry instead of stopping the remaining platforms.
    """
    if not os.path.exists(xlsx_path):
        return {"platform": platform, "error": "file not found"}
    try:
        return predict_platform(xlsx_path, platform)
    except Exception as exc:
        return {"platform": platform, "error": str(exc)}


rows = [_summarise_platform(k, p) for k, p in PLATFORMS.items()]

overview = pd.DataFrame(rows)
display_dataframe_to_user("Ensemble predictions overview (improved performance, greedy scaling)", overview)

# Last expression of the cell: report path, confusion-matrix path and the overview rows.
(f"{OUTDIR}/ensemble_report.txt", f"{OUTDIR}/ensemble_confusion_matrix.png", rows)


# 성능 지표

| 감정(Label) | Precision | Recall | F1-score | Support |
| --------- | --------- | ------ | -------- | ------- |
| 공포        | 0.9471    | 0.9850 | 0.9657   | 200     |
| 놀람        | 0.5581    | 0.6000 | 0.5783   | 200     |
| 분노        | 0.5550    | 0.6050 | 0.5789   | 200     |
| 슬픔        | 0.5314    | 0.4650 | 0.4960   | 200     |
| 중립        | 0.4021    | 0.3900 | 0.3959   | 200     |
| 행복        | 0.6020    | 0.5900 | 0.5960   | 200     |
| 혐오        | 0.9072    | 0.8800 | 0.8934   | 200     |


| 항목                  | 값                                   |
| ------------------- | ----------------------------------- |
| Validation Accuracy | **0.6450**                          |
| Macro F1            | **0.6435**                          |
| Class Scales        | [0.9, 1.1, 1.0, 1.0, 1.0, 1.0, 1.0] |
| Accuracy (전체)       | 0.6450                              |
| Macro 평균 Precision  | 0.6433                              |
| Macro 평균 Recall     | 0.6450                              |
| Macro 평균 F1-score   | 0.6435                              |


# 앙상블 모델(ensemble_char+word)과 검증 데이터(yva_pred, yva_true)를 활용한 분석
- 오분류 분석
- 대표 문장 추출

In [None]:
# Recreate the two requested result sets:
# (1) Platform-wise Top-50 representative sentences per emotion (filtered & de-duplicated)
# (5) Threshold tuning tables for boosting recall of 혐오/공포 (grid + comparison)
import os, re, textwrap, numpy as np, pandas as pd, joblib, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support, f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample
from caas_jupyter_tools import display_dataframe_to_user

# Working directory and input files (same locations as the training cell).
OUTDIR = "/mnt/data"
LABELED_XLSX = "/mnt/data/korean_sentiment_dataset.xlsx"
PLATFORMS = {
    "youtube": "/mnt/data/all_mapped_youtube.xlsx",
    "blog": "/mnt/data/all_mapped_blog.xlsx",
    "instagram": "/mnt/data/all_mapped_instagram.xlsx",
    "x": "/mnt/data/all_mapped_x.xlsx",
}
os.makedirs(OUTDIR, exist_ok=True)

# -------------------- Load labeled data --------------------
# Read the labelled sheet here so this cell does not depend on a `raw` frame
# left behind by another cell.
raw = pd.read_excel(LABELED_XLSX)
# Columns arrive as "Unnamed: 1" / "Unnamed: 2"; row label 0 is skipped
# (presumably a header row stored inside the sheet -- TODO confirm).
df = raw.loc[1:, ["Unnamed: 1","Unnamed: 2"]].rename(columns={"Unnamed: 1":"text","Unnamed: 2":"label"})
df = df.dropna(subset=["text","label"]).astype(str)
# Keep only rows labelled with one of the seven known emotions.
valid_labels = ["행복","중립","슬픔","공포","혐오","분노","놀람"]
df = df[df["label"].isin(valid_labels)].copy()

# -------------------- Try to load ensemble artifacts; else train fallback --------------------
# The saved ensemble is used only when every one of its files is present.
use_ensemble = all(os.path.exists(p) for p in [
    f"{OUTDIR}/clf_char.pkl",
    f"{OUTDIR}/clf_word.pkl",
    f"{OUTDIR}/vec_char.pkl",
    f"{OUTDIR}/vec_word.pkl",
    f"{OUTDIR}/label_encoder.pkl",
    f"{OUTDIR}/class_scales.npy",
])

if use_ensemble:
    # NOTE(review): joblib.load unpickles arbitrary Python objects. Only load
    # artifacts that this notebook itself wrote into OUTDIR.
    clf_c = joblib.load(f"{OUTDIR}/clf_char.pkl")
    clf_w = joblib.load(f"{OUTDIR}/clf_word.pkl")
    vec_c = joblib.load(f"{OUTDIR}/vec_char.pkl")
    vec_w = joblib.load(f"{OUTDIR}/vec_word.pkl")
    le = joblib.load(f"{OUTDIR}/label_encoder.pkl")
    base_scales = np.load(f"{OUTDIR}/class_scales.npy")
    labels = list(le.classes_)
else:
    # Train a quick balanced single-model fallback (char TF-IDF + SGD log_loss).
    TARGET_PER_CLASS = 1000

    # Hold out 20% of the raw rows BEFORE any resampling. Oversampling with
    # replacement ahead of the split would put copies of one sentence on both
    # sides. The arguments match the validation split used for threshold
    # tuning in this cell, so the rows evaluated there are rows this fallback
    # model has never been trained on.
    Xtr_raw, Xva_text, ytr_raw, yva_str = train_test_split(
        df["text"].values, df["label"].values, test_size=0.2, random_state=42, stratify=df["label"].values
    )

    # Rebalance the training rows only: down-sample large classes, top up
    # small ones with bootstrap copies.
    train_df = pd.DataFrame({"text": Xtr_raw, "label": ytr_raw})
    frames = []
    for lab, grp in train_df.groupby("label"):
        if len(grp) >= TARGET_PER_CLASS:
            frames.append(grp.sample(n=TARGET_PER_CLASS, random_state=42))
        else:
            extra = resample(grp, replace=True, n_samples=TARGET_PER_CLASS - len(grp), random_state=42)
            frames.append(pd.concat([grp, extra]))
    df_bal = pd.concat(frames).sample(frac=1.0, random_state=42).reset_index(drop=True)
    Xtr_text, ytr_str = df_bal["text"].values, df_bal["label"].values

    le = LabelEncoder()
    ytr = le.fit_transform(ytr_str)
    yva = le.transform(yva_str)
    labels = list(le.classes_)

    vec_c = TfidfVectorizer(analyzer="char", ngram_range=(2,5), min_df=3, max_df=0.995, sublinear_tf=True, max_features=20000)
    Xtr_c = vec_c.fit_transform(Xtr_text)
    Xva_c = vec_c.transform(Xva_text)

    classes = np.unique(ytr)
    cw = compute_class_weight(class_weight="balanced", classes=classes, y=ytr)
    sw = cw[ytr]  # one weight per training row, looked up by encoded label
    clf_c = SGDClassifier(loss="log_loss", alpha=4e-4, random_state=42)
    clf_c.partial_fit(Xtr_c, ytr, classes=classes, sample_weight=sw)

    # Mimic the ensemble with the char model alone: no word model/vectorizer,
    # and neutral class multipliers.
    clf_w = None
    vec_w = None
    base_scales = np.ones(len(labels))

# -------------------- Validation split (for threshold tuning) --------------------
# Hold out 20% of the original (not rebalanced) labelled rows so the class mix
# of the evaluation set matches the raw data.
# NOTE(review): when the models were loaded from disk, this cell cannot tell
# which rows they were trained on. If their training rows overlap this split,
# the tuning metrics below are optimistic -- confirm.
Xtr_text_v, Xva_text_v, ytr_str_v, yva_str_v = train_test_split(
    df["text"].values,
    df["label"].values,
    test_size=0.2,
    random_state=42,
    stratify=df["label"].values,
)

# Class probabilities for the held-out texts.
Xc_val = vec_c.transform(Xva_text_v)
pc_val = clf_c.predict_proba(Xc_val)
if clf_w is None:
    # Char-only fallback: no word model to blend with.
    proba_val = pc_val
else:
    Xw_val = vec_w.transform(Xva_text_v)
    pw_val = clf_w.predict_proba(Xw_val)
    proba_val = (0.6*pc_val + 0.4*pw_val)

# Apply the stored per-class multipliers, then renormalise each row to sum to 1.
proba_val = proba_val * base_scales[None, :]
proba_val = proba_val / proba_val.sum(axis=1, keepdims=True)

# True labels as positions in `labels`, i.e. as column indices of proba_val.
yva_idx = np.array(list(map(labels.index, yva_str_v)))

# -------------------- (1) Platform-wise Top-50 representative sentences --------------------
# Substrings that disqualify a sentence from being shown as a representative example.
bad_terms = [
    "씨발", "ㅅㅂ", "ㅄ", "개새", "좆", "병신", "새끼", "니엄마", "sex", "섹스",
    "fuck", "shit", "좇", "씹", "꺼져", "좆같", "미친년", "강간", "강제", "n번방",
]


def normalize_text(s):
    """Return ``str(s)`` trimmed, with every whitespace run collapsed to one space."""
    trimmed = str(s).strip()
    return re.sub(r"\s+", " ", trimmed)


def is_clean(s):
    """Return True if *s* has at least 5 characters and contains no blocked term.

    Terms are matched case-insensitively as plain substrings.
    """
    if len(s) < 5:
        return False
    lowered = s.lower()
    return not any(term.lower() in lowered for term in bad_terms)

def detect_text_col(d):
    """Return the label of the column in *d* that should be used as text.

    The first column (in column order) whose lower-cased name is a known
    text-column name wins. Failing that, the first object-dtype column is
    used, and failing that the first column of the frame.
    """
    preferred = ("text", "content", "body", "본문", "내용", "sentence", "utterance", "comment", "댓글", "caption", "title")
    named = next((c for c in d.columns if str(c).lower() in preferred), None)
    if named is not None:
        return named
    object_cols = d.select_dtypes(include="object").columns.tolist()
    if object_cols:
        return object_cols[0]
    return d.columns[0]

def representative_for_platform(path, platform_name, topk=50):
    """Write the most confidently classified sentences per emotion for one platform.

    The text column of the Excel file at *path* is normalised, filtered with
    ``is_clean`` and scored by the model(s). For every emotion the texts are
    ranked by that emotion's probability (highest first), exact duplicates
    are skipped, and ranking stops once *topk* texts have been kept.

    Args:
        path: Excel file to read.
        platform_name: Platform name; stored in the rows and used in the file name.
        topk: Number of sentences at which the per-emotion ranking stops.

    Returns:
        Path of the CSV written (columns: platform, emotion, rank, prob,
        text), or None when *path* does not exist. If no text survives the
        filter, a header-only CSV is written.
    """
    if not os.path.exists(path):
        return None

    out_csv = f"{OUTDIR}/rep_top50_{platform_name}.csv"

    d = pd.read_excel(path)
    tc = detect_text_col(d)
    sub = d[[tc]].dropna().rename(columns={tc: "text"}).copy()
    sub["text"] = sub["text"].astype(str).map(normalize_text)
    sub = sub[sub["text"].map(is_clean)]
    if sub.empty:
        header_only = pd.DataFrame(columns=["platform", "emotion", "rank", "prob", "text"])
        header_only.to_csv(out_csv, index=False, encoding="utf-8-sig")
        return out_csv

    text_series = sub["text"]
    pc = clf_c.predict_proba(vec_c.transform(text_series.values))
    if clf_w is None:
        # Char-only fallback: nothing to blend with.
        p = pc
    else:
        pw = clf_w.predict_proba(vec_w.transform(text_series.values))
        p = (0.6*pc + 0.4*pw)
    p = p * base_scales[None, :]
    p = p / p.sum(axis=1, keepdims=True)

    rows = []
    for li, lab in enumerate(labels):
        seen = set()  # texts already emitted for this emotion
        for idx in np.argsort(-p[:, li]):
            txt = text_series.iloc[idx]
            if txt in seen:
                continue
            seen.add(txt)
            rows.append({
                "platform": platform_name,
                "emotion": lab,
                "rank": len(seen),
                "prob": round(float(p[idx, li]), 4),
                "text": txt,
            })
            if len(seen) >= topk:
                break

    pd.DataFrame(rows).to_csv(out_csv, index=False, encoding="utf-8-sig")
    return out_csv

# One CSV of representative sentences per platform (None if the input file is missing).
rep_paths = {
    platform: representative_for_platform(xlsx_path, platform, topk=50)
    for platform, xlsx_path in PLATFORMS.items()
}

# -------------------- (5) Threshold tuning for 혐오/공포 recall ↑ --------------------
def eval_with_boost(bh=1.0, bf=1.0):
    """Score the validation set after boosting the 혐오 / 공포 probabilities.

    Args:
        bh: Multiplier for the 혐오 column of ``proba_val``.
        bf: Multiplier for the 공포 column of ``proba_val``.

    Returns:
        Tuple ``(pred, dfm, macro, acc)``: predicted class indices, a
        per-label frame (label, precision, recall, f1, support), macro-F1
        and accuracy, all measured against ``yva_idx``.
    """
    n_labels = len(labels)
    boost = np.ones(n_labels)
    for emotion, factor in (("혐오", bh), ("공포", bf)):
        if emotion in labels:
            boost[labels.index(emotion)] = factor

    boosted = proba_val * boost[None, :]
    boosted = boosted / boosted.sum(axis=1, keepdims=True)
    pred = boosted.argmax(1)

    pcls, rcls, f1cls, supp = precision_recall_fscore_support(
        yva_idx, pred, labels=range(n_labels), zero_division=0
    )
    macro = f1_score(yva_idx, pred, average="macro")
    acc = accuracy_score(yva_idx, pred)
    dfm = pd.DataFrame({
        "label": labels,
        "precision": np.round(pcls, 4),
        "recall": np.round(rcls, 4),
        "f1": np.round(f1cls, 4),
        "support": supp,
    })
    return pred, dfm, macro, acc

def _recall_for(metrics, emotion):
    """Return the recall of *emotion* from a per-label metrics frame, or NaN if it is not a known label."""
    if emotion not in labels:
        return np.nan
    return float(metrics.loc[metrics["label"] == emotion, "recall"].values[0])


# Exhaustive 6x6 grid over the two boost factors. The first combination that
# reaches the highest macro-F1 is remembered together with its per-label table.
grid = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5]
records = []
best_macro = -1
best = (1.0, 1.0, None)
for boost_hate, boost_fear in [(h, f) for h in grid for f in grid]:
    _, dfm, macro, acc = eval_with_boost(boost_hate, boost_fear)
    records.append({
        "boost_혐오": boost_hate,
        "boost_공포": boost_fear,
        "macroF1": macro,
        "acc": acc,
        "recall_혐오": _recall_for(dfm, "혐오"),
        "recall_공포": _recall_for(dfm, "공포"),
    })
    if macro > best_macro:
        best_macro, best = macro, (boost_hate, boost_fear, dfm)

# Best settings first: by macro-F1, then by the two recalls.
tune_df = pd.DataFrame(records)
tune_df = tune_df.sort_values(["macroF1", "recall_혐오", "recall_공포"], ascending=False)
tune_csv = f"{OUTDIR}/threshold_tuning_grid.csv"
tune_df.to_csv(tune_csv, index=False, encoding="utf-8-sig")

# Side-by-side per-label metrics: no boost (1.0, 1.0) versus the best grid point.
_, base_dfm, base_macro, base_acc = eval_with_boost(1.0, 1.0)
bh, bf, best_dfm = best

comp = pd.merge(base_dfm, best_dfm, on="label", suffixes=("_base", "_tuned"))
# Both runs score the same rows, so a single support column is kept.
comp = comp.rename(columns={"support_base": "support"})
comp = comp[[
    "label",
    "precision_base", "recall_base", "f1_base",
    "precision_tuned", "recall_tuned", "f1_tuned",
    "support",
]]
comp_csv = f"{OUTDIR}/threshold_tuning_comparison.csv"
comp.to_csv(comp_csv, index=False, encoding="utf-8-sig")

# One-row overview of every file this cell produced.
preview_cols = {
    "rep_top50_youtube": rep_paths["youtube"],
    "rep_top50_blog": rep_paths["blog"],
    "rep_top50_instagram": rep_paths["instagram"],
    "rep_top50_x": rep_paths["x"],
    "threshold_grid": tune_csv,
    "threshold_compare": comp_csv,
}
preview = pd.DataFrame({col: [value] for col, value in preview_cols.items()})
display_dataframe_to_user("생성된 결과 파일 경로 미리보기", preview)

# Last expression of the cell: output paths and whether the saved ensemble was used.
(rep_paths, tune_csv, comp_csv, use_ensemble)
