## Step 1 Load labeled dataset + basic checks

### Goal of this step: 
- Load the frozen seed dataset
- Separate features (X) and target (y)
- Verify nothing is broken before modeling

### Imports and path

In [None]:
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import imaplib

### Load dataset

In [None]:
# Location of the frozen, labeled seed dataset (relative to the notebook directory).
data_path = Path("..") / "data" / "final" / "final_1000" / "labeled_1000_samples.csv"

df = pd.read_csv(data_path)
print("Shape: ", df.shape)

# Last expression of the cell: displays the first rows as a quick visual check.
df.head()

###  Sanity checks

In [None]:
# Print column names, non-null counts and dtypes to catch unexpected types early.
df.info()

In [None]:
# Count missing values per column; the cell output shows one count per column.
df.isna().sum()

### Check label distribution

In [None]:
# Relative class frequencies (fractions that sum to 1) show how imbalanced the labels are.
df["label"].value_counts(normalize=True)

###  Split X and y 

In [None]:
# Column roles: one target column, everything listed in feature_cols is model input.
target_col = "label"
feature_cols = ["text_clean", "len_words", "is_question"]

# X stays a DataFrame (column names are needed downstream); y is a single Series.
X = df.loc[:, feature_cols]
y = df.loc[:, target_col]

print("X shape : ", X.shape)
print("y shape : ", y.shape)

## Step 2 Train / validation split (STRATIFIED)

### Goal of this step:
- Split your 1000 labeled samples into:
  - a train set
  - a validation set
- Preserve the **class imbalance** — this is critical

###  Stratified split 

We use stratification so that the percentage of important messages stays the same in both the train and validation splits.

In [None]:
# Hold out 20% of the rows. Stratifying on y keeps the class proportions the same
# in both parts, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape  :", X_test.shape, y_test.shape)

### Verify label distribution stayed correct

In [None]:
# Print the normalized class frequencies of each split so they can be compared by eye.
for header, labels in (
    ("Train label distribution:", y_train),
    ("\nValidation label distribution:", y_test),
):
    print(header)
    print(labels.value_counts(normalize=True))

## Step 3 Build feature + model pipeline

### Define which columns are text and numeric  

In [None]:
# A scalar string (not a list) on purpose: ColumnTransformer then hands the
# transformer a 1-D column of strings, which is what TfidfVectorizer expects.
text_cols = "text_clean"
# A list here: the numeric transformer receives a 2-D block with these columns.
num_cols = ["len_words", "is_question"]

### Build preprocessors: TF-IDF + numeric scaling

We scale the numeric features.
TF-IDF already outputs a sparse matrix, so the scaler uses `with_mean=False` to stay compatible with sparse data.

In [None]:
# Text branch: unigrams + bigrams, ignoring terms seen in fewer than 2 documents
# or in more than 90% of them; sublinear_tf dampens repeated terms (1 + log(tf)).
text_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
)

# Numeric branch: scale to unit variance without centering (with_mean=False).
numeric_transformer = Pipeline([("scaler", StandardScaler(with_mean=False))])

# Route each column group to its transformer; any column not listed is dropped.
preprocess = ColumnTransformer(
    [
        ("text", text_vectorizer, text_cols),
        ("num", numeric_transformer, num_cols),
    ],
    remainder="drop",
)

### Create the full model pipeline

We use class_weight because your dataset is imbalanced

In [None]:
# class_weight="balanced" re-weights samples inversely to class frequency;
# max_iter is raised to give the solver room to converge.
clf = LogisticRegression(class_weight="balanced", max_iter=2000, n_jobs=None)

# Preprocessing and classifier live in one estimator, so fit/predict apply both.
model = Pipeline([("preprocess", preprocess), ("clf", clf)])

# Last expression of the cell: renders the pipeline diagram / repr.
model

## Step 4 Train + Evaluate 

### Fit the model

In [None]:
# Fit the preprocessing steps and the classifier together, on the training split only.
model.fit(X_train, y_train)
print("Training done")

In [None]:
# Hard class predictions from the fitted pipeline.
y_pred = model.predict(X_test)

# predict_proba returns one column per class; keep the column for class 1.
positive_col = 1
y_proba = model.predict_proba(X_test)[:, positive_col]

print("Predictions ready")

###   Core metrics precision/recall/F1 

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# Recall asks "did we find them all?"; precision asks "of the ones we flagged,
# how many were actually right?"
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Pin the label order so the matrix is always 2x2, laid out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class). Without explicit labels, a class
# missing from both y_test and y_pred would shrink the matrix and the four-way
# unpacking of ravel() below would fail.
# Assumes labels are encoded as 0 (skip) / 1 (important).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion matrix:\n", cm)

tn, fp, fn, tp = cm.ravel()
print(f"\nTN (correct skip)      : {tn}")
print(f"FP (skip predicted imp): {fp}")
print(f"FN (important missed)  : {fn}")
print(f"TP (important caught)  : {tp}")

### Evaluate different thresholds (key for triage)

In [None]:
def eval_threshold(th, y_true=None, proba=None):
    """Score the positive class when predictions are cut at threshold ``th``.

    Args:
        th: Probability cut-off; samples with ``proba >= th`` are predicted as 1.
        y_true: True binary labels. Defaults to the notebook-level ``y_test``.
        proba: Array of predicted probabilities for class 1. Defaults to the
            notebook-level ``y_proba``.

    Returns:
        Tuple ``(precision, recall, f1)`` for the positive class. Metrics that
        would divide by zero (e.g. no positive predictions) are reported as 0.
    """
    # Fall back to the notebook globals so existing eval_threshold(th) calls keep working.
    if y_true is None:
        y_true = y_test
    if proba is None:
        proba = y_proba

    pred = (proba >= th).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(
        y_true, pred, average="binary", zero_division=0
    )
    return p, r, f1

# Sweep a few cut-offs to see how precision is traded for recall.
thresholds = (0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
for th in thresholds:
    p, r, f1 = eval_threshold(th)
    print(f"th={th:.1f}  precision={p:.3f}  recall={r:.3f}  f1={f1:.3f}")

### Save model with joblib

### Summary

**Model & Pipeline**
- TF-IDF vectorization on text (`ngram_range=(1,2)`)
- Numeric feature scaling
- Logistic Regression with `class_weight="balanced"`
- Implemented using `ColumnTransformer` + `Pipeline` for clean separation of preprocessing and modeling

**Evaluation (Validation Set)**
- Accuracy: ~0.76 (not primary metric)
- For **important class (1)** at default threshold 0.5:
  - Precision ≈ 0.26
  - Recall ≈ 0.41
- Confusion matrix:
  - TP: 11 | FN: 16 | FP: 32 | TN: 139

**Threshold Analysis (Key Insight)**
- For a triage system, recall is prioritized over precision
- At threshold **0.3**:
  - Recall ≈ **0.59**
  - Precision ≈ 0.24
- This significantly reduces missed important messages at the cost of acceptable notification noise

**Conclusion**
- The baseline model captures meaningful patterns despite limited labeled data
- Performance is sufficient to bootstrap **semi-supervised learning**
- This model is used as a seed to auto-label high-confidence samples in the next notebook 

## Save model with joblib

In [None]:
import joblib

# Destination for the serialized pipeline (preprocessing + classifier in one file).
MODEL_DIR = Path("..") / "models"
MODEL_PATH = MODEL_DIR / "tg_logreg.joblib"

# Create the directory tree if needed; no error when it already exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_PATH)

print("Model saved to:", MODEL_PATH.resolve())