### Step 1: Load trained model + unlabeled data

Goal of this step:
- Load the trained supervised model from Notebook 2
- Load the unlabeled personal messages
- NO labeling yet, just preparation

This keeps things safe and reproducible.

### Imports + paths 

In [None]:
import pandas as pd 
from pathlib import Path
import joblib 

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression 

### Load trained model

In [None]:
# Deserialize the previously trained model artifact from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
model_path = Path("../models/tg_logreg.joblib")
model = joblib.load(filename=model_path)
print("Model loaded")

### Load unlabeled dataset 

In [None]:
# Location of the cleaned-message CSV export.
data_path = Path("../data/final/final_full/clean_messages.csv")

# Parse the CSV with pandas' default options and report its (rows, columns).
df_unlabeled = pd.read_csv(filepath_or_buffer=data_path)
print("Unlabeled data shape: ", df_unlabeled.shape)

# Trailing expression: the notebook renders a preview of the first five rows.
df_unlabeled.head(n=5)

## Step 2: Prepare unlabeled data

In [None]:
# Columns kept for the model-ready frame, in selection order.
feature_cols = ["text_clean", "len_words", "is_question"]

# Select the feature columns and cast both numeric features to int in one
# step; astype returns a new DataFrame, so df_unlabeled is not modified.
X_unlabeled = df_unlabeled[feature_cols].astype(
    {"len_words": int, "is_question": int}
)

# Quick sanity check: shape, dtypes, and a preview of the first five rows.
print("X_unlabeled shape:", X_unlabeled.shape)
print(X_unlabeled.dtypes)
X_unlabeled.head(n=5)

In [None]:
# Output folder; created (including missing parents) if it does not exist yet.
OUT_DIR = Path("../data/final/final_full")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Write the selected feature columns as UTF-8 CSV without the index column.
out_path = OUT_DIR.joinpath("ready_messages.csv")
X_unlabeled.to_csv(out_path, encoding="utf-8", index=False)

print("Saved model-ready unlabeled dataset to:", out_path.resolve())

## Step 3

### Build preprocessing + model pipeline

Purpose: **combine**
- cleaned text (`text_clean`)
- numeric features (`len_words`, `is_question`)

into one reproducible ML pipeline.

### Define feature groups

In [None]:
# Name of the single text column handed to the TF-IDF vectorizer. It must match
# the cleaned-text column selected in feature_cols ("text_clean"); the previous
# value "text" is not among the prepared feature columns.
# Kept as a plain string (not a list) so ColumnTransformer passes a 1-D column
# of documents to the vectorizer rather than a 2-D frame.
text_cols = "text_clean"
# Numeric feature columns routed to the numeric transformer.
num_cols = ["len_words", "is_question"]

### Text + numeric preprocessors  

In [None]:
# TF-IDF over unigrams and bigrams. Terms seen in fewer than 2 documents or in
# more than 90% of documents are dropped; term frequency is scaled sublinearly.
text_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
)

# Scale numeric features to unit variance without centering (with_mean=False).
numeric_transformer = Pipeline([("scaler", StandardScaler(with_mean=False))])

# Route the text column and the numeric columns to their transformers; any
# column not listed here is dropped from the output.
preprocess = ColumnTransformer(
    [
        ("text", text_vectorizer, text_cols),
        ("num", numeric_transformer, num_cols),
    ],
    remainder="drop",
)


### Classifier

In [None]:
# Logistic regression with class weights balanced by class frequency, a fixed
# seed for reproducibility, and a raised iteration cap for the solver.
clf = LogisticRegression(
    class_weight="balanced",
    random_state=42,
    max_iter=2000,
)