##  Step 1 Load datasets + create _label

### imports and path

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

In [None]:
# Locations of the two input CSVs, relative to the notebook's working directory.
labeled_path = Path("../data/final/final_1000/labeled_1000_samples.csv")
unlabeled_path = Path("../data/final/final_full/ready_messages.csv")

# Fail fast with a real exception: `assert` statements are removed when Python
# runs with -O, which would postpone the failure to the first read_csv call.
for _input_path in (labeled_path, unlabeled_path):
    if not _input_path.exists():
        raise FileNotFoundError(f"Missing: {_input_path.resolve()}")

print("Labeled  :", labeled_path.resolve())
print("Unlabeled:", unlabeled_path.resolve())

 ### Load both datasets

In [None]:
# Human-labeled seed set.
df_lab = pd.read_csv(labeled_path)
# Unlabeled pool. This must be read from `unlabeled_path`: reading
# `labeled_path` a second time made every "unlabeled" row a copy of a seed row.
df_unlab = pd.read_csv(unlabeled_path)

print("df_lab shape  :", df_lab.shape)
print("df_unlab shape:", df_unlab.shape)

# Last expression of the cell: previews the first rows of both frames.
df_lab.head(5), df_unlab.head(5)

### Create _label anchors and combine

- _label = label for labeled
- _label = -1 for unlabeled (unknown)

In [None]:
feature_cols = ["text_clean", "len_words", "is_question"]

# Labeled anchors: `_label` carries the human label as an int and
# `_is_labeled` flags the row as an anchor.
df_lab2 = df_lab[feature_cols + ["label"]].assign(
    _label=lambda frame: frame["label"].astype(int),
    _is_labeled=1,
)

# Unlabeled pool: `_label` is the sentinel -1 (unknown).
df_unlab2 = df_unlab[feature_cols].assign(_label=-1, _is_labeled=0)

# Stack both parts, then cast the integer-valued columns in a single pass.
df_all = pd.concat([df_lab2, df_unlab2], ignore_index=True).astype(
    {"len_words": int, "is_question": int, "_label": int}
)

print("df_all shape:", df_all.shape)
df_all.head()

## Step 2

Vectorize text for clustering (TF-IDF)

**Goal**:
- Build a shared TF-IDF space for all messages (labeled + unlabeled)
- This space will be used only for clustering
- We reuse the same philosophy as Notebook 2, but this vectorizer is independent 

## Create TF-IDF vectorizer 

In [None]:
# Settings of the TF-IDF space used for clustering.
tfidf_cluster_params = {
    "ngram_range": (1, 2),  # unigrams + bigrams
    "min_df": 2,
    "max_df": 0.9,
    "sublinear_tf": True,
}
tfidf_cluster = TfidfVectorizer(**tfidf_cluster_params)

Why:
- unigrams + bigrams → captures “can you”, “are you”
- min_df=2 → removes typos / ultra-rare noise
- max_df=0.9 → removes useless global words


## Fit TF-IDF on all messages 

This is important:
- fit on df_all, not only labeled
- Clustering must see the whole distribution 

In [None]:
# Fit the vectorizer on every message (labeled + unlabeled) and encode them.
all_messages = df_all["text_clean"]
X_text = tfidf_cluster.fit_transform(all_messages)
print("TF-IDF matrix shape :", X_text.shape)

In [None]:
n_rows, n_terms = X_text.shape
print("Sparse matrix type:", type(X_text))
# Share of stored (non-zero) entries among all cells of the matrix.
print("Density:", X_text.nnz / (n_rows * n_terms))

### Step 3

Cluster messages with **MiniBatchKMeans**

Why MiniBatchKMeans?
- works with large sparse TF-IDF matrices
- faster than full KMeans
- good enough for our purpose: grouping similar messages

### Choose number of clusters and fit

Start with k=300, a good default. You can adjust it later.

In [None]:
k = 300

kmeans_params = dict(
    n_clusters=k,
    random_state=42,
    batch_size=2048,
    n_init="auto",
)
kmeans = MiniBatchKMeans(**kmeans_params)

# Fit on the TF-IDF matrix and store each row's cluster id next to its message.
clusters = kmeans.fit_predict(X_text)
df_all["cluster"] = clusters

print(" Clustering done")
print("Clusters shape:", clusters.shape)
print("Unique clusters:", df_all["cluster"].nunique())

cluster_preview_cols = ["cluster", "_is_labeled", "_label", "text_clean"]
df_all[cluster_preview_cols].head(10)

## Check how many labeled anchors per cluster 

This tells us whether a cluster can propagate labels (it contains anchors) or has no anchors at all.

## Check how many labeled anchors per cluster 

In [None]:
# Number of human-labeled rows in each cluster, largest counts first.
# Clusters without any anchor do not appear in the result.
is_anchor = df_all["_is_labeled"] == 1
anchor_counts = df_all[is_anchor].groupby("cluster").size()
anchors_per_cluster = anchor_counts.sort_values(ascending=False)

n_anchored = anchors_per_cluster.shape[0]
print("Anchored clusters:", n_anchored, "out of", k)
print("\nTop 15 clusters by #anchors:")
display(anchors_per_cluster.head(15))

### Inspect a few clusters (sanity check) 

In [None]:
def show_cluster(c, n=10):
    """Return a reproducible random sample of rows belonging to cluster ``c``.

    Reads the module-level ``df_all`` frame. At most ``n`` rows are drawn
    (fewer when the cluster is smaller), always with ``random_state=42``.
    Only the ``_is_labeled``, ``_label`` and ``text_clean`` columns are kept.
    """
    members = df_all[df_all["cluster"] == c]
    picked = members.sample(min(n, len(members)), random_state=42)
    return picked[["_is_labeled", "_label", "text_clean"]]

# Inspect the ten clusters with the most anchors; when fewer than three
# clusters are anchored, fall back to the first three cluster ids present.
if len(anchors_per_cluster) >= 3:
    some_clusters = anchors_per_cluster.head(10).index.tolist()
else:
    some_clusters = df_all["cluster"].dropna().unique()[:3]

separator = "\n" + "="*80
for c in some_clusters:
    print(separator)
    print(f"Cluster {c} | size={(df_all['cluster']==c).sum()} | anchors={anchors_per_cluster.get(c, 0)}")
    display(show_cluster(c, n=12))

## Step 4

Propagate labels using a majority vote over the existing anchors

### majority vote propagation 

Rules:
- Only propagate in clusters that have anchors
- Use majority label of anchors
- Do NOT overwrite human labels (propagated labels go into a separate `prop_label` column)

In [None]:
import numpy as np

# Majority human label of each anchored cluster. `mode()` can return several
# values on a tie; taking element 0 keeps the first of them.
anchor_rows = df_all[df_all["_is_labeled"] == 1]
majority_by_cluster = anchor_rows.groupby("cluster")["_label"].agg(
    lambda labels: labels.mode()[0]
)

# Every row of an anchored cluster receives that cluster's majority label in
# `prop_label`; rows of clusters without anchors stay NaN. The column is kept
# as float so NaN can mark "no propagated label". `_label` is not modified.
df_all["prop_label"] = df_all["cluster"].map(majority_by_cluster).astype(np.float64)

### Check propagation coverage 

In [None]:
# Rows that received a propagated label vs. rows that did not.
has_prop_label = df_all["prop_label"].notna()

print("Total messages:", len(df_all))
print("Propagated labels:", has_prop_label.sum())
print("Still unlabeled:", (~has_prop_label).sum())

df_all["prop_label"].value_counts(dropna=False)

## Step 5 

Distance filtering: keep the closest X% per cluster

What this does

For each cluster:
- compute distance of every point to its centroid
- keep only the closest 50% 
- discard the far points (likely boundary / mixed) 

###  Compute distances to cluster centroids 

In [None]:
# One row per message, one column per cluster centroid.
D = kmeans.transform(X_text)

# distance of each point to its assigned centroid: pick, for row i,
# the column given by that row's cluster id
row_ids = np.arange(len(df_all))
assigned_cluster = df_all["cluster"].values
dist_to_centroid = D[row_ids, assigned_cluster]

df_all["dist_to_centroid"] = dist_to_centroid
df_all["dist_to_centroid"].describe()

###  Keep closest percentile per cluster 

In [None]:
percentile_closest = 50  # keep closest 50% of each cluster

# For every row, the cutoff is the requested percentile of the distances
# observed inside that row's own cluster.
cluster_cutoff = df_all.groupby("cluster")["dist_to_centroid"].transform(
    lambda dists: np.percentile(dists.to_numpy(), percentile_closest)
)

# A row is kept when it is at least as close to its centroid as the cutoff.
keep_mask = (df_all["dist_to_centroid"] <= cluster_cutoff).to_numpy()

df_keep = df_all[keep_mask].copy()
print("Kept after distance filter:", len(df_keep), "out of", len(df_all))

### Keep only rows that actually got propagated labels

We only want points in anchored clusters, i.e. rows that have a prop_label

In [None]:
# Drop rows whose cluster had no anchor (their prop_label is NaN).
df_keep = df_keep.dropna(subset=["prop_label"]).copy()
print("Kept + propagated:", len(df_keep))
df_keep["prop_label"].value_counts()

Build pseudo-labeled dataset from unlabeled only

We will create a file for training:
- take only unlabeled rows (_is_labeled == 0)
- use prop_label as label 

In [None]:
# Keep only rows that came from the unlabeled pool and expose their
# propagated label as an integer `label` column.
from_unlabeled = df_keep["_is_labeled"] == 0
pseudo = df_keep.loc[from_unlabeled].assign(
    label=lambda rows: rows["prop_label"].astype(int)
)

print("Pseudo-labeled rows (from unlabeled):", len(pseudo))
pseudo[["label", "text_clean", "len_words", "is_question"]].head(10)

### Save pseudo-labeled dataset 

In [None]:
# Output folder for the pseudo-labeled file; created if it does not exist.
OUT_DIR = Path("../data/final")
OUT_DIR.mkdir(parents=True, exist_ok=True)

pseudo_path = OUT_DIR.joinpath("pseudo_labeled_cluster_filtered.csv")
pseudo.to_csv(pseudo_path, index=False, encoding="utf-8")

print("Saved:", pseudo_path.resolve())

What we have now (important)

You have now produced high-confidence pseudo-labels using:
- clustering
- majority label propagation from your 1000 anchors
- distance filtering to reduce noise

This is exactly Géron’s method, adapted to your text 

## Step 6 

#### Merge datasets => retrain supervised model => evaluate
 

### Load seed + pseudo datasets 

In [None]:
seed_path = Path("../data/final/final_1000/labeled_1000_samples.csv")
pseudo_path = Path("../data/final/pseudo_labeled_cluster_filtered.csv")

# Read the human-labeled seed set and the pseudo-labeled rows, in that order.
df_seed, df_pseudo = (
    pd.read_csv(csv_path) for csv_path in (seed_path, pseudo_path)
)

print("Seed  :", df_seed.shape)
print("Pseudo :", df_pseudo.shape)

df_seed.head(2), df_pseudo.head(2)

### Keep only model columns + clean dtypes 

In [None]:
feature_cols = ["text_clean", "len_words", "is_question"]
target_col = "label"

# Restrict both frames to the model columns and enforce integer dtypes
# on the numeric features and the target.
model_cols = feature_cols + [target_col]
int_dtypes = {name: int for name in ["len_words", "is_question", "label"]}

df_seed = df_seed[model_cols].astype(int_dtypes)
df_pseudo = df_pseudo[model_cols].astype(int_dtypes)

print("Seed label distribution:\n", df_seed["label"].value_counts())
print("\nPseudo label distribution:\n", df_pseudo["label"].value_counts())

### Merge to create semi-supervised training dataset 

In [None]:
# Seed rows first, pseudo-labeled rows after them, with a fresh 0..n-1 index.
df_semi = pd.concat((df_seed, df_pseudo), ignore_index=True)
label_share = df_semi["label"].value_counts(normalize=True)

print("Semi dataset shape:", df_semi.shape)
print("\nSemi label distribution:\n", label_share)

In [None]:
# Persist the merged (seed + pseudo) training table.
out_path = Path("../data/final/train_semi_supervised.csv")
df_semi.to_csv(out_path, encoding="utf-8", index=False)
saved_to = out_path.resolve()
print("Saved:", saved_to)

### Train/Test split stratified

In [None]:
from sklearn.model_selection import train_test_split

X = df_semi[feature_cols]
y = df_semi[target_col]

# Stratified 80/20 split. The held-out part is named X_val / y_val because
# every evaluation cell further down reads those names; with the previous
# X_test / y_test naming they failed with a NameError.
# NOTE(review): df_semi mixes human-labeled and pseudo-labeled rows, so the
# hold-out is scored partly against propagated labels — consider validating
# on human-labeled rows only.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Backward-compatible aliases for any code still using the old names.
X_test, y_test = X_val, y_val

print("Train:", X_train.shape, y_train.shape)
print("Test  :", X_val.shape, y_val.shape)

### Build the SAME model pipeline as Notebook 2 

In [None]:
text_col = "text_clean"
num_col = ["len_words", "is_question"]

# Text branch: TF-IDF over unigrams and bigrams.
text_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2), min_df=2, max_df=0.9, sublinear_tf=True
)

# Numeric branch: scaling without centering (with_mean=False).
numeric_transformer = Pipeline([("scaler", StandardScaler(with_mean=False))])

# Route each column group to its branch; any other column is dropped.
preprocess = ColumnTransformer(
    [
        ("text", text_vectorizer, text_col),
        ("num", numeric_transformer, num_col),
    ],
    remainder="drop",
)

clf = LogisticRegression(class_weight="balanced", max_iter=2000, random_state=42)

# Full model: preprocessing followed by the classifier.
model_semi = Pipeline([("preprocess", preprocess), ("clf", clf)])

### Train + evaluate 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

model_semi.fit(X_train, y_train)

# Hard predictions, plus column 1 of predict_proba, which the later
# threshold cells compare against a cutoff.
y_pred = model_semi.predict(X_val)
val_proba = model_semi.predict_proba(X_val)
y_proba = val_proba[:, 1]

print("Semi-supervised model trained\n")
val_report = classification_report(y_val, y_pred, digits=4)
print(val_report)

cm = confusion_matrix(y_val, y_pred)
print("Confusion matrix:\n", cm)

# Unpacking into four cells assumes a 2x2 (binary) confusion matrix.
tn, fp, fn, tp = cm.ravel()
print(f"\nTN: {tn}  FP: {fp}  FN: {fn}  TP: {tp}")

In [None]:
def eval_threshold(th):
    """Score the predictions obtained by cutting ``y_proba`` at ``th``.

    Reads the module-level ``y_proba`` and ``y_val``. A row is predicted
    as 1 when its probability is greater than or equal to ``th``.

    Returns:
        ``(precision, recall, f1)`` computed with ``average="binary"``;
        ``zero_division=0`` is passed for undefined ratios.
    """
    cut_pred = (y_proba >= th).astype(int)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_val, cut_pred, average="binary", zero_division=0
    )
    return precision, recall, f_score

# Print precision / recall / F1 for a small grid of candidate cutoffs.
candidate_thresholds = (0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
for th in candidate_thresholds:
    p, r, f1 = eval_threshold(th)
    print(f"th={th:.1f}  precision={p:.3f}  recall={r:.3f}  f1={f1:.3f}")

In [None]:
THRESHOLD = 0.3

# Final decision rule: predict 1 when the probability reaches THRESHOLD.
y_pred_final = (y_proba >= THRESHOLD).astype(int)
final_report = classification_report(y_val, y_pred_final, digits=4)
final_cm = confusion_matrix(y_val, y_pred_final)

print("Using threshold =", THRESHOLD)
print(final_report)
print("Confusion matrix:\n", final_cm)

### Save the model joblib  

In [None]:
import joblib

# Folder holding the serialized pipeline; created if it does not exist.
MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH = MODEL_DIR.joinpath("tg_logreg_semi_cluster.joblib")

# Persist the whole fitted pipeline object (preprocessing + classifier).
joblib.dump(model_semi, MODEL_PATH)

print(" Model saved to:", MODEL_PATH.resolve())

In [None]:
import json

CONFIG_PATH = MODEL_DIR / "tg_config.json"

# Inference settings stored next to the model file.
config = {
    # Reuse the THRESHOLD chosen during evaluation instead of repeating the
    # literal, so the saved config cannot drift from the evaluated cutoff.
    "threshold": THRESHOLD,
    "positive_class": "important",
    "negative_class": "skip",
    "notes": "Chosen to prioritize recall (~70%+) for important messages"
}

# Explicit encoding keeps the file independent of the platform default.
with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

print(" Config saved to:", CONFIG_PATH.resolve())

In [None]:
# joblib files are pickle-based: only load model files you produced yourself.
loaded_model = joblib.load(MODEL_PATH)

# sanity check: the reloaded pipeline must reproduce the probabilities of the
# in-memory model before "OK" is reported
proba_test = loaded_model.predict_proba(X_val)[:, 1]
if not np.allclose(proba_test, y_proba):
    raise RuntimeError("Reloaded model does not reproduce the in-memory predictions")

# Same cutoff as the evaluation cell, not a repeated literal.
pred_test = (proba_test >= THRESHOLD).astype(int)

print("Load test OK. Predictions:", pred_test[:10])

### Goal 
Improve recall for **important Telegram messages** without additional manual labeling by leveraging a large pool of unlabeled data.

---

### Method
This notebook implements a **semi-supervised learning pipeline inspired by label propagation**:

1. **Anchor labels**  
   Started from ~1,000 manually labeled messages (`important` / `skip`).

2. **Shared representation**  
   All labeled and unlabeled messages were embedded into a common **TF-IDF space** (unigrams + bigrams).

3. **Clustering**  
   Messages were grouped using **MiniBatch K-Means** (`k = 300`).  
   ~85% of clusters contained at least one labeled anchor.

4. **Label propagation**  
   For each anchored cluster, labels were propagated using **majority vote** from human-labeled messages.

5. **Noise reduction**  
   To reduce incorrect propagation, only the **closest 50% of samples per cluster** (by distance to centroid) were kept.

6. **Dataset expansion**  
   This produced **619 high-confidence pseudo-labeled messages**, expanding the training set by ~60% with no extra manual effort.

---

### Model Training
A **Logistic Regression** classifier with:
- TF-IDF text features
- Numeric features (`len_words`, `is_question`)
- Class-weight balancing

was retrained on the **merged dataset** (manual + pseudo labels).

---

### Results
Compared to the supervised-only baseline:

- **Recall (important messages)** improved from ~0.40 → **0.63**
- With threshold tuning (`p ≥ 0.3`), recall reached **~0.78**
- Improvement achieved **without new human labels**

This confirms that cluster-based semi-supervised learning effectively reduces missed important messages in a real-world, imbalanced setting.

---

### Key Takeaway
Using human-labeled messages as anchors and propagating labels within semantic clusters is an efficient and explainable way to scale personal text classification systems when labeled data is scarce.