## features

notebook goal:
convert raw review text into numeric features suitable for machine learning

notebook todo:
- [x] explore tf-idf with word importance <= black magic
- [x] modify feature transformations to avoid leakage <= fit tf-idf on the training split only, then apply the fitted transform to the test split

In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Locate the processed reviews CSV relative to the parent of the working directory.
PROJECT_ROOT = Path("..")
DATA_PATH = PROJECT_ROOT.joinpath("data", "processed", "reviews_clean.csv")

# Read the whole file into a DataFrame; the cell output is its (rows, columns) shape.
reviews_df = pd.read_csv(DATA_PATH)

reviews_df.shape

In [None]:
# Preview the first three rows of the loaded frame as a sanity check.
reviews_df.head(3)

In [None]:
# Clean the frame in one pipeline; every step returns a new frame, so the
# result does not alias the frame it was derived from.
reviews_df = (
    reviews_df
    # rows missing either the review text or its label are unusable
    .dropna(subset=["text", "label"])
    # coerce text to str and trim surrounding whitespace
    .assign(text=lambda frame: frame["text"].astype(str).str.strip())
    # discard reviews that are empty after trimming
    .loc[lambda frame: frame["text"].str.len() > 0]
    # store labels as integers
    .assign(label=lambda frame: frame["label"].astype(int))
)

reviews_df.shape

### splitting train and test

In [None]:
# Inputs are the review strings; the target is the label column.
X_text, y = reviews_df["text"], reviews_df["label"]

# Hold out 20% of the rows for testing. A fixed seed makes the split
# repeatable, and stratifying on y keeps class proportions similar in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, **split_options
)

X_train_text.shape, X_test_text.shape

### tf-idf

note:
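- tf-idf converts text into a numerical matrix (one row per document, one column per term) whose values weight each term by how often it occurs in a document (frequency) and how rare it is across all documents (uniqueness)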
    - tf => "term frequency"
    - idf => "inverse document frequency"


In [None]:
# Vectorizer settings, kept together so they are easy to review and tune.
tfidf_params = {
    "lowercase": True,          # case-fold text before tokenizing
    "stop_words": "english",    # drop scikit-learn's built-in English stop words
    "max_features": 20_000,     # cap the vocabulary size
    "ngram_range": (1, 1),      # single words only (unigrams)
}
tfidf = TfidfVectorizer(**tfidf_params)

In [None]:
# Fit the vectorizer on the training text only, so nothing is learned from the
# held-out test text (this is what prevents leakage).
X_train = tfidf.fit_transform(X_train_text)
# The test text is only transformed with the already-fitted vectorizer; the
# fit above must run first.
X_test = tfidf.transform(X_test_text)

# Show both matrix shapes as the cell output.
X_train.shape, X_test.shape

In [None]:
# Report the fitted vocabulary size, then the normalized label distribution
# of each split so the two can be compared.
print(f"tf-idf features: {len(tfidf.vocabulary_)}")
for split_name, labels in (("train", y_train), ("test", y_test)):
    print(f"{split_name} class balance:\n", labels.value_counts(normalize=True))

In [None]:
# Materialize the fitted vocabulary as (term, column index) pairs and peek at
# the first 20 entries in the dict's own order.
vocab_items = [*tfidf.vocabulary_.items()]
vocab_items[:20]

In [None]:
# Slice out the first training row (kept 2-D by the [:1] slice) to inspect its repr.
X_train[:1] # type: ignore

In [None]:
# Show the terms that carry the most tf-idf weight in one training review.
TOP_K = 15  # maximum number of terms to display

feature_names = np.asarray(tfidf.get_feature_names_out())

row = 0
# Densify just this one row so it can be sorted with NumPy.
vec = X_train[row].toarray().ravel() # type: ignore

# Highest weights first, limited to TOP_K entries.
top_idx = np.argsort(vec)[::-1][:TOP_K]
# A review with fewer than TOP_K in-vocabulary terms would otherwise be padded
# with zero-weight terms that do not occur in it, so keep only positive weights.
top_idx = top_idx[vec[top_idx] > 0]
list(zip(feature_names[top_idx], vec[top_idx]))

In [None]:
# Persist the fitted vectorizer and the vectorized splits for later notebooks.
# PROJECT_ROOT is the config constant defined in the data-loading cell; it is
# reused here instead of being redefined, and imports live in the top import cell.
ARTIFACTS_DIR = PROJECT_ROOT / "results" / "artifacts"
# Create the directory (and any missing parents); a no-op if it already exists.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# The fitted vectorizer is saved so new text can be transformed identically later.
joblib.dump(tfidf, ARTIFACTS_DIR / "tfidf.joblib")
# The splits are stored as one tuple in the order (X_train, X_test, y_train, y_test).
joblib.dump((X_train, X_test, y_train, y_test), ARTIFACTS_DIR / "splits.joblib")

ARTIFACTS_DIR

### notebook summary

- converted reviews to tf-idf feature vectors
- vocabulary was capped at 20,000 terms (`max_features`) to control complexity
- prepped features for supervised learning