# 02 — Intent Data Augmentation + Baseline Classifier

In [None]:
!pip -q install scikit-learn pandas matplotlib

In [None]:
# --- BOOTSTRAP: mount Drive, cd to your lab, verify data, fix imports ---
# Run this cell first in a fresh Colab runtime. Everything below relies on the
# working directory being LAB_DIR, so the statement order matters.

from google.colab import drive
# Mount point used by LAB_DIR below.
drive.mount('/content/drive')

import os, sys, glob
LAB_DIR = "/content/drive/MyDrive/Data_Labeling_Lab"   # <-- your folder
# Switch the working directory so the relative paths used afterwards
# ("*.ipynb", "data/*.csv", "data") resolve inside the lab folder.
os.chdir(LAB_DIR)
print("CWD:", os.getcwd())

# make utils importable from this folder
# (guarded so re-running the cell does not insert the path twice)
if LAB_DIR not in sys.path:
    sys.path.insert(0, LAB_DIR)

# show notebooks and data files
# (sanity check: prints base names only; an empty list means nothing matched)
print("Notebooks:", [os.path.basename(p) for p in glob.glob("*.ipynb")])
print("CSV files in data/:", [os.path.basename(p) for p in glob.glob("data/*.csv")])

# optional: install deps for a fresh runtime
# (IPython shell escape, not Python syntax; only works inside a notebook)
!pip -q install "snorkel==0.9.9" scikit-learn pandas matplotlib

# load data using your utils
# NOTE(review): load_datasets / plot_label_counts come from the lab's own
# utils module; presumably load_datasets returns a pandas DataFrame built from
# the CSVs under data/ -- confirm against utils.
from utils import load_datasets, plot_label_counts
df = load_datasets("data")
print("Loaded rows:", len(df))
# Trailing semicolon keeps the notebook from echoing the call's return value.
plot_label_counts(df);


In [None]:
import random, pandas as pd
from utils import load_datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Closed set of intent classes. The position in this list is the integer class
# id, so the order must stay in sync with anything that consumes label_id
# (e.g. target_names in the classification report).
INTENTS = ['order_status', 'refund', 'account_help', 'product_info', 'complaint']
LABEL2ID = {label: idx for idx, label in enumerate(INTENTS)}

df = load_datasets('data')

# Series.map yields NaN for every value that is not a key of LABEL2ID (typos,
# unexpected classes, missing labels). Left alone, those NaNs only surface
# later as an opaque "Input contains NaN" error from scikit-learn, so fail
# fast here and name the offending labels instead.
label_ids = df['label'].map(LABEL2ID)
unmapped = label_ids.isna()
if unmapped.any():
    bad_labels = df['label'][unmapped.to_numpy()].unique().tolist()
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a label outside INTENTS: {bad_labels!r}"
    )
df['label_id'] = label_ids

# Stratified 75/25 split with a fixed seed so class proportions are kept in
# both parts and the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label_id'],
    test_size=0.25,
    random_state=42,
    stratify=df['label_id'],
)

# Simple augmentation: synonym replacement & question paraphrase.
# Maps a lower-cased keyword to the candidates it may be swapped for. Some
# lists contain the keyword itself, so a "replacement" can be a no-op.
aug_map = {
    'refund': ['return', 'money back'],
    'order': ['purchase', 'order'],
    'track': ['trace', 'track'],
    'password': ['passcode', 'password'],
    'account': ['profile', 'account'],
}

# Punctuation ignored at the edges of a token when looking it up in aug_map.
_EDGE_PUNCT = '?,.!'


def augment(s, p=0.3, rng=None):
    """Return a lightly perturbed copy of the utterance *s*.

    Two independent perturbations are applied:

    1. Every whitespace-separated token whose lower-cased form (ignoring
       leading/trailing ``?,.!``) is a key of ``aug_map`` is replaced, with
       probability *p*, by a random candidate from ``aug_map``. Punctuation
       attached to the token is kept, so ``"refund?"`` becomes ``"return?"``
       rather than ``"return"``.
    2. With probability 0.2, a text that does not already end in ``?`` is
       rewritten as ``"Can you tell me: <text>?"``. Trailing ``?,.!`` is
       removed first so the result never ends in ``".?"`` or ``"!?"``.

    Runs of whitespace are collapsed to single spaces as a side effect of
    splitting and re-joining the tokens.

    Args:
        s: Input text.
        p: Per-token replacement probability for tokens found in ``aug_map``.
        rng: Optional source of randomness exposing ``random()`` and
            ``choice(seq)`` (e.g. ``random.Random(seed)``). Defaults to the
            global ``random`` module, matching the previous behaviour.

    Returns:
        The augmented text; ``""`` when *s* is empty or whitespace-only.
    """
    rng = random if rng is None else rng
    words = s.split()
    if not words:
        # Nothing to augment; avoid emitting a bare "Can you tell me: ?".
        return ''

    for i, w in enumerate(words):
        key = w.strip(_EDGE_PUNCT).lower()
        # rng.random() is only drawn for tokens that can actually be replaced.
        if key in aug_map and rng.random() < p:
            lead = w[:len(w) - len(w.lstrip(_EDGE_PUNCT))]
            trail = w[len(w.rstrip(_EDGE_PUNCT)):]
            words[i] = lead + rng.choice(aug_map[key]) + trail

    text = ' '.join(words)
    # randomly convert to a question
    if rng.random() < 0.2 and not s.strip().endswith('?'):
        return 'Can you tell me: ' + text.rstrip(_EDGE_PUNCT) + '?'
    return text

# augment() draws from the global `random` module. Seed it so the augmented
# rows (and therefore the reported metrics) are reproducible, in line with the
# seeded split and the seeded sample below.
random.seed(0)

# Pick half of the training rows to augment. Rows are chosen by POSITION so
# texts and labels stay paired even if the DataFrame index contains duplicate
# labels (which would make a label-based .loc lookup return extra rows).
# Sampling depends only on the series length and the seed, so the same rows
# are chosen as with sampling X_train directly.
picked = X_train.reset_index(drop=True).sample(frac=0.5, random_state=0).index.tolist()
X_aug = X_train.iloc[picked].apply(augment)
y_aug = y_train.iloc[picked]

# Train on the original rows plus their augmented copies; the test set is
# left untouched so the evaluation reflects real utterances only.
X_train_full = pd.concat([X_train, X_aug])
y_train_full = pd.concat([y_train, y_aug])

# Baseline: word uni+bi-gram TF-IDF (terms must occur in >= 2 documents)
# feeding a logistic regression. `multi_class` is intentionally not passed:
# 'auto' was already the default, and passing it explicitly is deprecated in
# scikit-learn >= 1.5 and scheduled for removal.
clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
    ('lr', LogisticRegression(max_iter=1000)),
])
clf.fit(X_train_full, y_train_full)

y_pred = clf.predict(X_test)
# Pass `labels` explicitly so target_names always lines up with the class ids,
# even if some intent is absent from both y_test and y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(INTENTS))),
    target_names=INTENTS,
    digits=3,
))