In [None]:
# ------------------------------
# Amazon Product Categorization - Kaggle Version
# ------------------------------

# ========== CONFIG ==========
# Note: In Kaggle, we usually don't use Drive. We use /kaggle/input.
SAMPLE_MODE = False       # True -> use smaller sample for quick debug
SAMPLE_SIZE = 4000        # upper bound on rows kept when SAMPLE_MODE is True
TOP_K = 10                # NOTE(review): not referenced anywhere in the visible code -- confirm intended use
RANDOM_STATE = 42         # seed passed to sampling, the train/test split and the seeded classical models
LSTM_EPOCHS = 3           # number of epochs for the Keras LSTM fit
MAX_SEQ_LENGTH = 128      # max_length handed to the HuggingFace tokenizer during BERT fine-tuning
BERT_EPOCHS = 2          # set low for debug; increase for final runs
BERT_BATCH_SIZE = 8      # per-device train and eval batch size for BERT fine-tuning
FINETUNE = False         # Set True to run BERT fine-tuning (requires GPU and time)

# Defines where to save models/outputs in Kaggle
# Keep the trailing slash: several status messages append file names directly to this value.
OUTPUT_DIR = "/kaggle/working/" 

# ========== INSTALLS ==========
# The lines starting with "!" are IPython shell escapes: they only run inside a
# notebook kernel and are not valid in a plain Python script.
print("INSTALL STEP: checking/installing required packages...")
# Kaggle has most of these, but we ensure versions here.
# Only tensorflow carries a version constraint; every other package is installed unpinned.
!pip install -q "tensorflow>=2.16.1,<2.20" 
!pip install -q transformers datasets evaluate sentence-transformers
!pip install -q scikit-learn matplotlib seaborn joblib
# PyTorch is usually pre-installed on Kaggle GPU images, but this ensures compatibility
!pip install -q torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118

import os
import shutil
import re
import html
import gc
import pandas as pd
import numpy as np
from pathlib import Path

# ========== FILE DETECTION (KAGGLE SPECIFIC) ==========
print("\n-- Detecting Input Files in /kaggle/input --")

products_path = None
cats_path = None

# Scan every attached dataset so the lookup does not depend on the dataset's
# folder name. A CSV is classified purely by substrings of its lowercased file
# name; when several files match the same pattern, the last one visited wins.
for root, dirs, files in os.walk("/kaggle/input"):
    for name in files:
        lowered = name.lower()
        if not lowered.endswith('.csv'):
            continue
        csv_path = os.path.join(root, name)
        if 'product' in lowered:
            products_path = csv_path
        if 'categor' in lowered:
            cats_path = csv_path

if products_path is None or cats_path is None:
    print("❌ ERROR: Could not find 'amazon_products.csv' or 'amazon_categories.csv' in /kaggle/input.")
    print("Please ensure you have clicked '+ Add Data' and attached the dataset containing these CSVs.")
    # Dump everything that is actually attached so the mismatch is easy to spot.
    print("Files found in /kaggle/input:")
    for folder, _subdirs, filenames in os.walk("/kaggle/input"):
        for filename in filenames:
            print(os.path.join(folder, filename))
    raise FileNotFoundError("Dataset not attached or filenames do not match expected patterns.")

print(f"✅ Found Products: {products_path}")
print(f"✅ Found Categories: {cats_path}")

# ========== QUICK PREVIEW ==========
# Read only the first few rows of each CSV to show the schema.
# `display` is provided by the notebook environment, not imported in this file.
print("\n-- Previewing small samples to understand schema --")
prod_preview = pd.read_csv(products_path, nrows=5, low_memory=False)
cats_preview = pd.read_csv(cats_path, nrows=20, low_memory=False)
print("Products columns:", prod_preview.columns.tolist())
display(prod_preview)
print("Categories columns:", cats_preview.columns.tolist())
display(cats_preview)

# ========== BUILD TEXT COLUMN ==========
# Guarantees prod_df has a string 'text' column built from the first matching
# title and/or description column. Missing values become '' instead of the
# literal string 'nan' (which is what astype(str) yields for NaN), so rows
# without any text can be recognised as empty later on.
print("\n-- Ensuring a 'text' column exists (title + description fallback) --")
prod_df = pd.read_csv(products_path, low_memory=False)
# find title / description columns heuristically
title_cols = [c for c in prod_df.columns if c.lower() in ['title', 'product_title', 'name']]
desc_cols  = [c for c in prod_df.columns if c.lower() in ['description', 'product_description', 'desc', 'long_description']]

if 'text' not in prod_df.columns:
    if title_cols and desc_cols:
        title_part = prod_df[title_cols[0]].fillna('').astype(str)
        desc_part = prod_df[desc_cols[0]].fillna('').astype(str)
        # Outer strip removes the joining space when either side is empty.
        prod_df['text'] = (title_part + ' ' + desc_part).str.strip()
        print("Created 'text' from", title_cols[0], "+", desc_cols[0])
    elif title_cols:
        prod_df['text'] = prod_df[title_cols[0]].fillna('').astype(str).str.strip()
        print("Created 'text' from", title_cols[0])
    elif desc_cols:
        prod_df['text'] = prod_df[desc_cols[0]].fillna('').astype(str).str.strip()
        print("Created 'text' from", desc_cols[0])
    else:
        # no text; create empty and warn
        prod_df['text'] = ''
        print("WARNING: No title/description columns found. 'text' created empty -> 0 usable rows likely.")
else:
    # A pre-existing 'text' column gets the same normalisation as a derived one.
    prod_df['text'] = prod_df['text'].fillna('').astype(str).str.strip()

print("Sample 'text' values:")
display(prod_df['text'].head(5))

# ========== LOAD CATEGORIES & BUILD MAP ==========
cats_df = pd.read_csv(cats_path, low_memory=False)
# Pick the id/name columns: a known id column ('id' preferred over
# 'category_id') paired with 'category_name'; otherwise fall back to the first
# two columns (or the single column twice) and warn.
cat_columns = list(cats_df.columns)
known_id_cols = [candidate for candidate in ('id', 'category_id') if candidate in cat_columns]
if known_id_cols and 'category_name' in cat_columns:
    id_col, name_col = known_id_cols[0], 'category_name'
else:
    id_col = cat_columns[0]
    name_col = cat_columns[1] if len(cat_columns) > 1 else cat_columns[0]
    print("Warning: unexpected categories columns, using", id_col, name_col)

# Build mapping: force keys to stringified ints when possible
def key_str(x):
    """Normalise a raw category id into a string dictionary key.

    Args:
        x: Raw id value (int, float, str, numpy scalar, or missing).

    Returns:
        None when ``x`` is missing according to ``pd.isna``; the decimal
        integer string (``'12'`` for ``12``, ``12.0``, ``'12'``, ``'012'``)
        when the value is integral; otherwise ``str(x)``.
    """
    try:
        if pd.isna(x):
            return None
        # Integers are converted exactly. Going through float() would silently
        # corrupt ids above 2**53.
        if isinstance(x, (int, np.integer)):
            return str(int(x))
        if isinstance(x, str):
            try:
                return str(int(x))
            except ValueError:
                pass  # not a plain integer literal (e.g. '12.0'); try float below
        fx = float(x)
        if fx.is_integer():
            return str(int(fx))
        return str(x)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric or otherwise unconvertible values keep their string form.
        return str(x)

# Map normalised category id -> category name. The two columns are walked in
# lockstep instead of using iterrows(), which builds a Series per row and can
# upcast values when the columns have different dtypes. Rows whose id is
# missing are skipped; for duplicate ids the last row wins.
cat_map = {}
for raw_id, raw_name in zip(cats_df[id_col], cats_df[name_col]):
    k = key_str(raw_id)
    if k is not None:
        cat_map[k] = str(raw_name)
print(f"Built cat_map with {len(cat_map)} entries (sample):", list(cat_map.items())[:8])

# ========== EXTRACT & MAP CATEGORY IDs IN PRODUCTS ==========
print("\n-- Extracting numeric token from product category column and mapping to category_name --")
# Determine candidate column in products (common name is 'category_id' from your preview)
# Names are compared after lowercasing, so every entry here must itself be
# lowercase ('browsenodes' covers a 'browseNodes' column).
known_category_cols = {'category_id', 'category', 'cat_id', 'browse_node', 'browse_nodes', 'browsenodes'}
prod_candidate_cols = [c for c in prod_df.columns if c.lower() in known_category_cols]
if not prod_candidate_cols:
    # Fallback: any column whose name contains 'cat' (this also covers 'category').
    prod_candidate_cols = [c for c in prod_df.columns if 'cat' in c.lower()]
prod_candidate_cols = list(dict.fromkeys(prod_candidate_cols))  # unique order-preserving
print("Candidate product category columns:", prod_candidate_cols)

def extract_catid(val):
    """Pull the category id token out of a raw product-category cell.

    Returns None for missing or blank values, the first run of digits when the
    stripped string contains one (a purely numeric string is returned whole),
    and the stripped string itself when it holds no digits at all.
    """
    if pd.isna(val):
        return None
    token = str(val).strip()
    if not token:
        return None
    digits = re.search(r'\d+', token)
    return digits.group(0) if digits else token

# Map the first candidate column onto category names; mapped_any records
# whether at least one product row received a name.
mapped_any = False
if not prod_candidate_cols:
    print("No candidate category column detected automatically. Showing product columns for manual selection.")
    print(prod_df.columns.tolist())
else:
    col = prod_candidate_cols[0]
    print("Using product column for mapping:", col)
    prod_df['category_id_extracted'] = prod_df[col].apply(extract_catid)
    prod_df['category_name'] = prod_df['category_id_extracted'].map(cat_map)
    mapped_count = prod_df['category_name'].notna().sum()
    mapped_any = mapped_count > 0
    print("Mapped rows:", mapped_count, "out of", len(prod_df))

# If nothing mapped, attempt fuzzy textual mapping fallback (REQUIRES small sample & slower)
# This is diagnostic only: matches are printed, never written back to prod_df.
if not mapped_any:
    print("No direct numeric mapping found. Attempting fuzzy textual mapping on small sample...")
    from difflib import get_close_matches
    # Distinct category names. Missing values are dropped BEFORE the string
    # cast (afterwards they would be the text 'nan'), and the list is sorted so
    # tie-breaking between equally close names is reproducible.
    cat_names = sorted(set(cats_df[name_col].dropna().astype(str)))
    # take up to 500 distinct product text entries and try to match
    prod_texts = prod_df['text'].dropna().astype(str).unique()[:500]
    fuzzy_map = {}
    for t in prod_texts:
        cand = get_close_matches(t, cat_names, n=1, cutoff=0.85)
        if cand:
            fuzzy_map[t] = cand[0]
    if fuzzy_map:
        print("Fuzzy mapping found some matches (sample):", list(fuzzy_map.items())[:10])
        # Here we won't assign to whole df automatically; we require a human check if many matches
    else:
        print("No strong fuzzy matches found. At this point we will filter to mapped rows only (safe).")

# Diagnostics: how many usable labeled rows (text present & category_name present)
# When no category column was detected, 'category_name' was never created;
# add it empty so the explicit RuntimeError below is raised instead of a KeyError.
if 'category_name' not in prod_df.columns:
    prod_df['category_name'] = np.nan
# Missing text counts as length 0 (astype(str) alone would turn NaN into 'nan'),
# and whitespace-only text is treated as empty.
prod_df['text_len'] = prod_df['text'].fillna('').astype(str).str.strip().str.len()
usable_df = prod_df[ (prod_df['text_len'] > 0) & (prod_df['category_name'].notna()) ].copy()
print("Usable labeled rows after mapping:", len(usable_df))
if len(usable_df) == 0:
    raise RuntimeError("No usable labeled rows after mapping. Inspect candidate columns and mapping logic above.")

# Replace prod_df with usable subset for downstream pipeline
prod_df = usable_df.reset_index(drop=True)

# Optionally sample for quicker runs
if SAMPLE_MODE:
    prod_df = prod_df.sample(n=min(SAMPLE_SIZE, len(prod_df)), random_state=RANDOM_STATE).reset_index(drop=True)
    print("SAMPLE_MODE active -> sampled to", len(prod_df))

print("Top categories (counts):")
print(prod_df['category_name'].value_counts().head(20))

# ========== PREPROCESSING & SPLIT ==========
print("\n-- Preprocessing & train/test split --")
def clean_text(s):
    """Normalise raw product text for the models.

    Steps, in order: decode HTML entities, lowercase, strip HTML tags, strip
    URLs, replace every character that is not a-z, 0-9 or whitespace with a
    space, then collapse whitespace runs and trim.

    Args:
        s: Raw text; any value is accepted and converted with ``str``.

    Returns:
        The cleaned string, or ``''`` when ``s`` is missing (``pd.isna``).
    """
    if pd.isna(s): return ''
    s = str(s)
    s = html.unescape(s)
    s = s.lower()
    s = re.sub(r'<.*?>',' ',s)
    # Single backslashes inside raw strings: r'\S' / r'\s' are the regex
    # classes. A doubled backslash would match a literal backslash instead,
    # leaving URLs in place and whitespace runs uncollapsed.
    s = re.sub(r'http\S+',' ',s)
    s = re.sub(r'[^a-z0-9\s]',' ',s)
    s = re.sub(r'\s+',' ',s).strip()
    return s

# Clean the text, encode category names as integer labels, and hold out 20%
# of the rows for testing.
prod_df['clean_text'] = prod_df['text'].apply(clean_text)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
prod_df['label'] = le.fit_transform(prod_df['category_name'])

from sklearn.model_selection import train_test_split
X = prod_df['clean_text'].values
y = prod_df['label'].values
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
except ValueError as err:
    # Stratification is rejected when, for example, a class has a single
    # member. Fall back to a plain random split rather than aborting the run.
    print("WARNING: stratified split failed (" + str(err) + "); falling back to a non-stratified split.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
print("Train size:", len(X_train), "Test size:", len(X_test))

# ========== TF-IDF + CLASSICAL MODELS ==========
# Fits a TF-IDF vectoriser on the training text, trains three scikit-learn
# classifiers on it and stores model, accuracy, report, confusion matrix and
# macro ROC AUC per model in `results`.
print("\n-- TF-IDF & classical models (LogisticRegression, MultinomialNB, RandomForest) --")
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=20000 if not SAMPLE_MODE else 2000, ngram_range=(1,2), min_df=2)
Xtr_tfidf = tfidf.fit_transform(X_train)
Xte_tfidf = tfidf.transform(X_test)
print("TF-IDF shapes:", Xtr_tfidf.shape, Xte_tfidf.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import label_binarize

# Every encoded label, so per-class outputs line up with le.classes_ even when
# a class is absent from the test split.
all_labels = np.arange(len(le.classes_))

models = {
    # `multi_class` is deprecated in recent scikit-learn releases and removed
    # in later ones. Its default already selects the multinomial formulation
    # for multiclass targets with the 'saga' solver, so it is simply omitted.
    # NOTE(review): for a two-class target the default may differ from an
    # explicit 'multinomial' -- confirm if binary runs matter.
    'LogisticRegression': LogisticRegression(max_iter=1000, solver='saga', random_state=RANDOM_STATE),
    'MultinomialNB': MultinomialNB(),
    'RandomForest': RandomForestClassifier(n_estimators=200 if not SAMPLE_MODE else 50, random_state=RANDOM_STATE, n_jobs=1)
}
results = {}
for name, m in models.items():
    print(f"\nTraining {name} ...")
    m.fit(Xtr_tfidf, y_train)
    preds = m.predict(Xte_tfidf)
    acc = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_test, preds, labels=all_labels)
    try:
        probs = m.predict_proba(Xte_tfidf)
        y_bin = label_binarize(y_test, classes=all_labels)
        roc_macro = roc_auc_score(y_bin, probs, average='macro', multi_class='ovr')
    except Exception as err:
        # ROC AUC is a best-effort extra metric: report why it is missing
        # instead of hiding the failure, and keep the run going.
        print(f"ROC AUC unavailable for {name}: {err}")
        roc_macro = None
    results[name] = {'model': m, 'accuracy': acc, 'report': report, 'cm': cm, 'roc_macro': roc_macro}
    print(f"{name} done. Accuracy: {acc:.4f} ROC_AUC_macro: {roc_macro if roc_macro is not None else 'N/A'}")

# Summary table
# One row per classical model, ranked by accuracy; the top row decides which
# model is persisted together with its vectoriser and label encoder.
import pandas as pd
summary_rows = [
    {
        'Model': model_label,
        'Accuracy': round(outcome['accuracy'], 4),
        'Macro F1': round(outcome['report']['macro avg']['f1-score'], 4),
        'ROC AUC Macro': 'N/A' if outcome['roc_macro'] is None else round(outcome['roc_macro'], 4),
    }
    for model_label, outcome in results.items()
]
summary_df = pd.DataFrame(summary_rows).sort_values('Accuracy', ascending=False)
print("\nClassical model comparison:")
print(summary_df.to_string(index=False))

# Save best classical model
best_classical_name = summary_df.iloc[0]['Model']
import joblib
best_bundle = {
    'model': results[best_classical_name]['model'],
    'tfidf': tfidf,
    'label_encoder': le,
}
joblib.dump(best_bundle, os.path.join(OUTPUT_DIR, 'best_classical_model.joblib'))
print(f"Saved best classical model (joblib) at {OUTPUT_DIR}best_classical_model.joblib")

# Save confusion matrix image
# Renders the best classical model's confusion matrix as an annotated heatmap,
# writes it to OUTPUT_DIR and shows it inline.
import matplotlib.pyplot as plt
cm = results[best_classical_name]['cm']
plt.figure(figsize=(8,6))
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix - ' + best_classical_name)
plt.colorbar()
# Tick positions cover every class in le.classes_, while the annotation loop
# below follows cm's own shape; the two line up only when cm has one
# row/column per encoder class.
plt.xticks(range(len(le.classes_)), le.classes_, rotation=45, ha='right')
plt.yticks(range(len(le.classes_)), le.classes_)
# Write each cell's count at its centre (x = column j, y = row i).
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i,j], ha='center', va='center', fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'confusion_classical.png'), bbox_inches='tight')
plt.show()
print(f"Saved confusion matrix to {OUTPUT_DIR}confusion_classical.png")

# ========== SMALL LSTM ==========
# Tokenises the cleaned text, pads each sequence to MAX_LEN, trains a
# single-layer LSTM classifier and saves it with its tokenizer and label encoder.
print("\n-- Training LSTM (Keras) --")
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

MAX_NUM_WORDS = 20000
MAX_LEN = 100
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training split only
Xtr_seq = tokenizer.texts_to_sequences(X_train)
Xte_seq = tokenizer.texts_to_sequences(X_test)
Xtr_pad = pad_sequences(Xtr_seq, maxlen=MAX_LEN)
Xte_pad = pad_sequences(Xte_seq, maxlen=MAX_LEN)
num_classes = len(le.classes_)

model_lstm = Sequential([
    # NOTE(review): `input_length` may be deprecated in the Keras version that
    # ships with the pinned TensorFlow range -- confirm and drop if it warns.
    Embedding(MAX_NUM_WORDS, 128, input_length=MAX_LEN),
    LSTM(128),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])
# Labels are integer class ids, hence the sparse categorical loss.
model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
print("LSTM compiled. Training now...")
model_lstm.fit(Xtr_pad, y_train, validation_split=0.1, epochs=LSTM_EPOCHS, batch_size=32)
pred_lstm = model_lstm.predict(Xte_pad).argmax(axis=1)
print("LSTM classification report:")
from sklearn.metrics import classification_report
# Pass the full label list explicitly: with target_names alone the report
# raises when a class appears in neither y_test nor the predictions, because
# the number of names then no longer matches the number of observed labels.
print(classification_report(y_test, pred_lstm, labels=np.arange(num_classes), target_names=le.classes_, zero_division=0))
# NOTE(review): this pickles the Keras model through joblib; the inference
# helper reads it back from the same file. Confirm the installed Keras
# supports pickling before relying on this artifact.
joblib.dump({'model': model_lstm, 'tokenizer': tokenizer, 'label_encoder': le}, os.path.join(OUTPUT_DIR, 'lstm_model.joblib'))
print(f"Saved LSTM model at {OUTPUT_DIR}lstm_model.joblib")

# ========== BERT EMBEDDINGS + LR ==========
# Encodes the cleaned text with a pre-trained sentence-transformer and fits a
# logistic regression on the resulting vectors. Only the classifier and the
# embedder's name are saved; the embedder is re-created from that name at
# inference time.
print("\n-- Sentence-Transformer embeddings (all-MiniLM-L6-v2) + LogisticRegression --")
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')
Xtr_emb = embedder.encode(X_train, show_progress_bar=True)
Xte_emb = embedder.encode(X_test, show_progress_bar=True)
from sklearn.linear_model import LogisticRegression
clf_emb = LogisticRegression(max_iter=1000)
clf_emb.fit(Xtr_emb, y_train)
pred_emb = clf_emb.predict(Xte_emb)
print("Embedding-based classifier report:")
# Pass the full label list explicitly: with target_names alone the report
# raises when a class appears in neither y_test nor the predictions.
print(classification_report(y_test, pred_emb, labels=np.arange(len(le.classes_)), target_names=le.classes_, zero_division=0))
joblib.dump({'model': clf_emb, 'embedder_name': 'all-MiniLM-L6-v2', 'label_encoder': le}, os.path.join(OUTPUT_DIR, 'bert_embeddings_clf.joblib'))
print(f"Saved embeddings classifier at {OUTPUT_DIR}bert_embeddings_clf.joblib")

# ========== OPTIONAL BERT FINE-TUNING ==========
# Runs only when FINETUNE is True: fine-tunes bert-base-uncased with the
# HuggingFace Trainer and saves the resulting model under OUTPUT_DIR.
if FINETUNE:
    print("\n-- BERT fine-tuning (HuggingFace Trainer) --")
    import inspect
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
    from datasets import Dataset
    import evaluate
    import numpy as np

    model_name = 'bert-base-uncased'
    tokenizer_hf = AutoTokenizer.from_pretrained(model_name)
    hf_train = Dataset.from_dict({'text': list(X_train), 'label': list(y_train)})
    hf_test  = Dataset.from_dict({'text': list(X_test),  'label': list(y_test)})

    def tokenize_fn(examples):
        """Tokenise a batch of texts, truncated/padded to MAX_SEQ_LENGTH."""
        return tokenizer_hf(examples['text'], truncation=True, padding='max_length', max_length=MAX_SEQ_LENGTH)

    print("Tokenizing datasets...")
    hf_train = hf_train.map(tokenize_fn, batched=True)
    hf_test  = hf_test.map(tokenize_fn, batched=True)

    # The raw text is no longer needed once it has been tokenised.
    hf_train = hf_train.remove_columns(['text'])
    hf_test  = hf_test.remove_columns(['text'])

    hf_train.set_format(type='torch', columns=['input_ids','attention_mask','label'])
    hf_test.set_format(type='torch', columns=['input_ids','attention_mask','label'])

    accuracy_metric = evaluate.load("accuracy")
    def compute_metrics(eval_pred):
        """Return accuracy for a (logits, labels) pair supplied by the Trainer."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return accuracy_metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    # `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
    # releases. Pick whichever keyword the installed version accepts so the
    # cell works on both sides of the rename.
    ta_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_kw = 'eval_strategy' if 'eval_strategy' in ta_params else 'evaluation_strategy'
    # Update output dir to Kaggle working dir
    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_DIR, 'bert_finetune'),
        num_train_epochs=BERT_EPOCHS,
        per_device_train_batch_size=BERT_BATCH_SIZE,
        per_device_eval_batch_size=BERT_BATCH_SIZE,
        save_strategy='epoch',
        logging_steps=50,
        learning_rate=2e-5,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        **{eval_strategy_kw: 'epoch'},
    )
    # NOTE(review): the test split doubles as the evaluation set used to pick
    # the best checkpoint, so the reported test accuracy is not fully held out.
    trainer = Trainer(model=model, args=training_args, train_dataset=hf_train, eval_dataset=hf_test, compute_metrics=compute_metrics)

    print("Starting training (ensure GPU is enabled).")
    trainer.train()
    trainer.save_model(os.path.join(OUTPUT_DIR, 'bert_finetuned_model'))
    print(f"Saved fine-tuned BERT at {OUTPUT_DIR}bert_finetuned_model")

# ========== FINAL SAVES & INFERENCE ==========
print("\n-- Saving selection summary and preparing inference helper --")
# Persist the classical comparison table as a list of per-model records.
joblib.dump({'classical_summary': summary_df.to_dict(orient='records')}, os.path.join(OUTPUT_DIR, 'model_selection_summary.joblib'))

def predict_text(text, model_key='best_classical'):
    """Predict the category name for one piece of raw text.

    The text is cleaned with ``clean_text`` and the requested artifact is
    loaded from OUTPUT_DIR on every call.

    Args:
        text: Raw product text.
        model_key: 'best_classical' (TF-IDF + saved classical model),
            'bert_emb' (sentence-transformer embeddings + saved classifier)
            or 'lstm' (saved Keras LSTM).

    Returns:
        The decoded category label, or None for an unrecognised ``model_key``.
    """
    cleaned = clean_text(text)

    if model_key == 'best_classical':
        bundle = joblib.load(os.path.join(OUTPUT_DIR, 'best_classical_model.joblib'))
        classifier = bundle['model']
        vectorizer = bundle['tfidf']
        # Fall back to the in-memory encoder when the artifact carries none.
        encoder = bundle.get('label_encoder', le)
        features = vectorizer.transform([cleaned])
        label_id = classifier.predict(features)[0]
        return encoder.inverse_transform([label_id])[0]

    if model_key == 'bert_emb':
        bundle = joblib.load(os.path.join(OUTPUT_DIR, 'bert_embeddings_clf.joblib'))
        # Only the embedder's name is stored, so the embedder is re-created here.
        sentence_encoder = SentenceTransformer(bundle['embedder_name'])
        vector = sentence_encoder.encode([cleaned])
        label_id = bundle['model'].predict(vector)[0]
        return bundle['label_encoder'].inverse_transform([label_id])[0]

    if model_key == 'lstm':
        bundle = joblib.load(os.path.join(OUTPUT_DIR, 'lstm_model.joblib'))
        word_tokenizer = bundle['tokenizer']
        network = bundle['model']
        encoder = bundle['label_encoder']
        sequences = word_tokenizer.texts_to_sequences([cleaned])
        padded = pad_sequences(sequences, maxlen=MAX_LEN)
        label_id = network.predict(padded).argmax(axis=1)[0]
        return encoder.inverse_transform([label_id])[0]

    return None

# List every artifact written by this run, then smoke-test the inference
# helper on the first product row.
print(f"\nDONE. Artifacts saved in {OUTPUT_DIR}:")
saved_artifacts = [
    "best_classical_model.joblib",
    "lstm_model.joblib",
    "bert_embeddings_clf.joblib",
    "confusion_classical.png",
    "model_selection_summary.joblib",
]
if FINETUNE:
    saved_artifacts.append("bert_finetuned_model/")
for artifact in saved_artifacts:
    print(" - " + artifact)

print("\nExample prediction (first product title):")
first_text = prod_df['text'].iloc[0]
print("Text:", first_text)
print("Predicted (best classical):", predict_text(first_text, model_key='best_classical'))