In [72]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion   # <-- add FeatureUnion here
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


In [74]:
file = '/Users/vijeethvj8/Downloads/Elevateme/TEXT_CLASSIFICATION/text classifcation.csv'
df = pd.read_csv(file)

print(df.head())
print(df.columns)

                                                text   label
0  DEbATinG IF BuRgER🍔 Or bIRYanI is THe TRUe kIn...    food
1  LATEst SMartpHONE bY opeNai dROPpEd tOdAy 🔥 wi...    tech
2  cRicKet COMmeNTArY FelT bIasEd SmH BUT sTILL W...  sports
3  sOfTwaRE upDatE HaD BuGZzZ again 😂 usErs on Tw...    tech
4  soFTwarE updatE Had bugZzz AGAIN 😂 useRs On Tw...    tech
Index(['text', 'label'], dtype='object')


In [76]:
print(df.isnull().sum())
print(df.duplicated().sum())

text     0
label    0
dtype: int64
0


In [78]:
df['text'] = df['text'].str.strip().str.title()


In [80]:
print(df.head())
category = df.groupby('label')['label'].value_counts()
print(category)

                                                text   label
0  Debating If Burger🍔 Or Biryani Is The True Kin...    food
1  Latest Smartphone By Openai Dropped Today 🔥 Wi...    tech
2  Cricket Commentary Felt Biased Smh But Still W...  sports
3  Software Update Had Bugzzz Again 😂 Users On Tw...    tech
4  Software Update Had Bugzzz Again 😂 Users On Tw...    tech
label
entertainment    2000
food             2000
politics         2000
sports           2000
tech             2000
Name: count, dtype: int64


In [82]:
df['text'] = df['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
df['text'] = df['text'].str.replace(r'  ', ' ', regex=True)
stopping_words = [
    # Articles
    'A', 'An', 'The',
    
    # Pronouns
    'I', 'We', 'You', 'He', 'She', 'They', 'Me', 'My', 'Mine', 'His', 'Her',
    'Hers', 'Its', 'Your', 'Yours', 'Our', 'Ours', 'Their', 'Theirs', 'Them', 'Us',
    
    # Auxiliary verbs & modals
    'Am', 'Is', 'Are', 'Was', 'Were', 'Be', 'Been', 'Being',
    'Do', 'Does', 'Did', 'Doing',
    'Have', 'Has', 'Had', 'Having',
    'Will', 'Would', 'Shall', 'Should', 'Can', 'Could',
    'May', 'Might', 'Must', 'Of',
    
    # Conjunctions
    'And', 'But', 'Or', 'If', 'Because', 'While', 'Although', 'Though', 'Unless',
    'Until', 'Than', 'Then',
    
    # Prepositions
    'In', 'On', 'At', 'By', 'For', 'With', 'About', 'Against',
    'Between', 'Into', 'Through', 'During', 'Before', 'After',
    'Above', 'Below', 'From', 'Up', 'Down', 'Over', 'Under',
    'Again', 'Further', 'Out', 'Off', 'Toward', 'Around',
    
    # Demonstratives
    'This', 'That', 'These', 'Those',
    
    # Adverbs/Particles
    'Not', 'No', 'Nor', 'So', 'Only', 'Very', 'Just', 'Even', 'Once'
]


# Create regex pattern for stopwords
pattern = r'\b(' + '|'.join(stopping_words) + r')\b'

# Remove stopwords
df['cleaned_text'] = df['text'].str.replace(pattern, '', regex=True)

# Remove extra spaces again
df['cleaned_text'] = df['cleaned_text'].str.replace(r'\s+', ' ', regex=True).str.strip()

print(df.head(10))



                                                text   label  \
0  Debating If Burger Or Biryani Is The True King...    food   
1  Latest Smartphone By Openai Dropped Today With...    tech   
2  Cricket Commentary Felt Biased Smh But Still W...  sports   
3  Software Update Had Bugzzz Again Users On Twit...    tech   
4  Software Update Had Bugzzz Again Users On Twit...    tech   
5  Tried The New Burger Yesterday Omg It Was Sooo...    food   
6  Ronaldo Scored A Last Minute Goal Ppl Went Cra...  sports   
7  Latest Smartphone By Apple Dropped Today With ...    tech   
8  Debating If Burger Or Biryani Is The True King...    food   
9  Cricket Commentary Felt Biased Smh But Still W...  sports   

                                        cleaned_text  
0       Debating Burger Biryani True King Food Fight  
1  Latest Smartphone Openai Dropped Today Ai Feat...  
2  Cricket Commentary Felt Biased Smh Still Whatt...  
3           Software Update Bugzzz Users Twitter Mad  
4           Software

----------------------------------------------------------------------     1st Model     ----------------------------------------------------------------------

In [85]:
# 1) Train/Test split BEFORE any vectorizer fitting ---------------------------
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

# 2) Vectorizers: word + char TF-IDF (robust to noisy/messy text) -------------
# Note: We assume you already removed stopwords. If not, you can set stop_words='english'
word_tfidf = TfidfVectorizer(
    lowercase=True,           # normalize case
    ngram_range=(1,2),        # unigrams + bigrams
    max_features=30000,
    min_df=2
)

char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3,5),        # character 3-5grams help with typos/slang remnants
    lowercase=True,
    min_df=2,
    max_features=30000
)

features = FeatureUnion([
    ('word_tfidf', word_tfidf),
    ('char_tfidf', char_tfidf)
])

# 3) Candidate models ----------------------------------------------------------
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, n_jobs=None),
    "LinearSVC": LinearSVC(),  # very strong baseline for text
    "SGD_logistic": SGDClassifier(loss="log_loss", max_iter=2000),  # fast, scalable
    "ComplementNB": ComplementNB(),  # good for text with tf-idf
    "MLP_shallow": MLPClassifier(hidden_layer_sizes=(256,), activation='relu', max_iter=50, random_state=42)
}

# 4) Evaluate each model with the same features pipeline -----------------------
results = []
predictions = {}

for name, clf in models.items():
    pipe = Pipeline([
        ('feat', features),
        ('clf', clf)
    ])
    pipe.fit(X_train_text, y_train)
    y_pred = pipe.predict(X_test_text)
    acc = accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average='macro')
    results.append((name, acc, f1m))
    predictions[name] = (pipe, y_pred)

# Print summary
print("== Model Comparison ==")
for name, acc, f1m in sorted(results, key=lambda x: x[2], reverse=True):
    print(f"{name:16s} | Acc: {acc:.4f} | Macro-F1: {f1m:.4f}")

# Show detailed report for the top model (by Macro-F1)
best_name, _, _ = sorted(results, key=lambda x: x[2], reverse=True)[0]
best_pipe, best_pred = predictions[best_name]
print("\n== Best Model:", best_name, "==")
print(classification_report(y_test, best_pred))

# Optional: Confusion matrix for best model
cm = confusion_matrix(y_test, best_pred, labels=sorted(df['label'].unique()))
cm_df = pd.DataFrame(cm, index=sorted(df['label'].unique()), columns=sorted(df['label'].unique()))
print("\nConfusion Matrix (rows=true, cols=pred):\n", cm_df)

# 5) OPTIONAL: Hyperparameter tuning for LR & LinearSVC ------------------------
# (Run if your dataset size/time allows; comment out if not needed)
param_grid_lr = {
    'feat__word_tfidf__ngram_range': [(1,2)],
    'feat__word_tfidf__min_df': [2,3,5],
    'feat__char_tfidf__ngram_range': [(3,5)],
    'clf__C': [0.5, 1.0, 2.0, 4.0]
}
param_grid_svc = {
    'feat__word_tfidf__ngram_range': [(1,2)],
    'feat__word_tfidf__min_df': [2,3,5],
    'feat__char_tfidf__ngram_range': [(3,5)],
    'clf__C': [0.5, 1.0, 2.0, 4.0]
}

def tune_and_report(clf, params, Xtr, ytr, Xte, yte, name):
    pipe = Pipeline([('feat', features), ('clf', clf)])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    gs = GridSearchCV(pipe, params, cv=cv, scoring='f1_macro', n_jobs=-1, verbose=0)
    gs.fit(Xtr, ytr)
    print(f"\n== GridSearch Best for {name} ==")
    print("Best params:", gs.best_params_)
    print("CV best f1_macro:", gs.best_score_)
    yhat = gs.predict(Xte)
    print("Test Acc:", accuracy_score(yte, yhat))
    print("Test Macro-F1:", f1_score(yte, yhat, average='macro'))
    print(classification_report(yte, yhat))
    return gs

# Uncomment to run tuning (can take longer on large data)
# best_lr = tune_and_report(LogisticRegression(max_iter=2000), param_grid_lr, X_train_text, y_train, X_test_text, y_test, "LogReg")
# best_svc = tune_and_report(LinearSVC(), param_grid_svc, X_train_text, y_train, X_test_text, y_test, "LinearSVC")


== Model Comparison ==
LogisticRegression | Acc: 1.0000 | Macro-F1: 1.0000
LinearSVC        | Acc: 1.0000 | Macro-F1: 1.0000
SGD_logistic     | Acc: 1.0000 | Macro-F1: 1.0000
ComplementNB     | Acc: 1.0000 | Macro-F1: 1.0000
MLP_shallow      | Acc: 1.0000 | Macro-F1: 1.0000

== Best Model: LogisticRegression ==
               precision    recall  f1-score   support

entertainment       1.00      1.00      1.00       400
         food       1.00      1.00      1.00       400
     politics       1.00      1.00      1.00       400
       sports       1.00      1.00      1.00       400
         tech       1.00      1.00      1.00       400

     accuracy                           1.00      2000
    macro avg       1.00      1.00      1.00      2000
 weighted avg       1.00      1.00      1.00      2000


Confusion Matrix (rows=true, cols=pred):
                entertainment  food  politics  sports  tech
entertainment            400     0         0       0     0
food                       0

In [87]:
test_sentences = [
    "Brgr 🍔 iz da best!!",     # should be food
    "PM gave big speeech 2day", # politics
    "Cric match woww 🔥🔥",    # sports
    "New AI phone dropped!",    # tech
    "Oscars award show 🎬"      # entertainment
]
preds = best_pipe.predict(test_sentences)
print(list(zip(test_sentences, preds)))


[('Brgr 🍔 iz da best!!', 'food'), ('PM gave big speeech 2day', 'food'), ('Cric match woww 🔥🔥', 'sports'), ('New AI phone dropped!', 'tech'), ('Oscars award show 🎬', 'entertainment')]


----------------------------------------------------------------------     2st Model     ----------------------------------------------------------------------

In [118]:
# =======================================
# Text Classification – Robust Classical Ensemble
# =======================================
import re, html, random, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib

RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# -----------------------------
# 1) Load data and prep groups
# -----------------------------
df = pd.read_csv(file)[["text", "label"]].dropna().drop_duplicates().reset_index(drop=True)

# Group "fingerprint" to avoid near-duplicate leakage in split
def fingerprint(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\d+", " ", s)            # drop digits
    s = re.sub(r"\s+", " ", s).strip()
    # keep tokens longer than 3 chars to reduce noise
    s = " ".join([w for w in s.split() if len(w) > 3])
    return s

df["group"] = df["text"].map(fingerprint)

# Group-wise split: 80% train / 20% test
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(df["text"], df["label"], groups=df["group"]))
train_df = df.iloc[train_idx].copy()
test_df  = df.iloc[test_idx].copy()

# -------------------------------------
# 2) Normalization + politics enrichment
# -------------------------------------
EMOJI_MAP = {"🍔":" burger ", "🔥":" fire ", "🎬":" movie "}
SLANG = {
    "u":"you","r":"are","2day":"today","tmrw":"tomorrow","idk":"i dont know",
    "lol":"funny","smh":"disappointed","pm":"prime minister"
}
POLITICS_HINTS = {
    r"\bparl(iament)?\b": " parliament ",
    r"\belection(s)?\b": " election ",
    r"\bminister\b": " minister ",
    r"\bspeech\b": " speech ",
}

def normalize(s: str) -> str:
    s = html.unescape(str(s))
    s = s.lower()
    # collapse repeated characters: "speeech" -> "speech" (leave at most 2 in a row)
    s = re.sub(r"(.)\1{2,}", r"\1\1", s)
    # slang expansion
    if SLANG:
        s = re.sub(r"\b(" + "|".join(map(re.escape, SLANG.keys())) + r")\b",
                   lambda m: SLANG[m.group(0)], s)
    # emoji mapping
    s = "".join(EMOJI_MAP.get(ch, ch) for ch in s)
    # urls & non-alphanumeric cleanup
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def enrich_politics(s: str) -> str:
    for pat, rep in POLITICS_HINTS.items():
        s = re.sub(pat, rep, s)
    return s

def normalize_plus(s: str) -> str:
    return enrich_politics(normalize(s))

train_df["norm"] = train_df["text"].map(normalize_plus)
test_df["norm"]  = test_df["text"].map(normalize_plus)

# -------------------------------------
# 3) Features: word + char TF-IDF
# -------------------------------------
word_tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=30000,
    min_df=2,
    lowercase=True
)
char_tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    max_features=30000,
    min_df=2,
    lowercase=True
)
features = FeatureUnion([("word", word_tfidf), ("char", char_tfidf)])

# -------------------------------------
# 4) Stacked ensemble (probabilistic)
# -------------------------------------
svc_cal = CalibratedClassifierCV(LinearSVC(), method="isotonic", cv=3, n_jobs=None)
nb      = ComplementNB()
sgd     = SGDClassifier(loss="log_loss", max_iter=2000, tol=1e-3, random_state=RANDOM_STATE)

stack = StackingClassifier(
    estimators=[("svc", svc_cal), ("nb", nb), ("sgd", sgd)],
    final_estimator=LogisticRegression(max_iter=2000, n_jobs=None, random_state=RANDOM_STATE),
    stack_method="predict_proba",
    passthrough=False,
    n_jobs=None
)

pipe = Pipeline([("feat", features), ("clf", stack)])

# -------------------------------------
# 5) Train
# -------------------------------------
X_train, y_train = train_df["norm"], train_df["label"]
X_test,  y_test  = test_df["norm"],  test_df["label"]

pipe.fit(X_train, y_train)

# -------------------------------------
# 6) Evaluate on CLEAN test
# -------------------------------------
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1m = f1_score(y_test, y_pred, average="macro")
print("=== CLEAN TEST ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Macro-F1:  {f1m:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

labels_sorted = sorted(df["label"].unique())
cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
cm_df = pd.DataFrame(cm, index=[f"true_{l}" for l in labels_sorted], columns=[f"pred_{l}" for l in labels_sorted])
print("\nConfusion Matrix:\n", cm_df)

# -------------------------------------
# 7) Noisy robustness evaluation
#    (augment test with synthetic noise)
# -------------------------------------
def add_noise(s: str) -> str:
    s = s.lower()
    # a few noisy transforms
    s = re.sub(r"\btoday\b", "2day", s)
    s = re.sub(r"\bprime minister\b", "pm", s)
    s = re.sub(r"speech", "speeech", s)
    # sprinkle emoji keywords
    s = s.replace("burger", "burger 🍔").replace("movie", "movie 🎬").replace("fire", "🔥")
    # random drop small words
    words = s.split()
    if len(words) > 5:
        keep = [w for w in words if random.random() > 0.1]
        s = " ".join(keep) if keep else s
    return s

# Build a small noisy probe from test set
probe = X_test.sample(min(200, len(X_test)), random_state=RANDOM_STATE)
probe_noisy = probe.apply(add_noise)

# Reuse normalize_plus at inference time (what model expects)
probe_norm = probe_noisy.apply(normalize_plus)
probe_pred = pipe.predict(probe_norm)
robust_acc = (probe_pred == y_test.loc[probe.index]).mean()

print("\n=== NOISY ROBUSTNESS ===")
print(f"Robustness Accuracy (noisy probe): {robust_acc:.4f}")

# Show a few noisy examples for the report
print("\nNoisy sample predictions:")
for i in probe.sample(5, random_state=RANDOM_STATE).index.tolist():
    raw  = df.loc[i, "text"] if i in df.index else "(sample)"
    nraw = probe_noisy.loc[i]
    pred = pipe.predict([normalize_plus(nraw)])[0]
    truth = y_test.loc[i]
    print(f"- RAW: {raw}\n  NOISY: {nraw}\n  PRED: {pred} | TRUE: {truth}\n")

# Also your 5 custom noisy sentences
custom_noisy = [
    "Brgr 🍔 iz da best!!",
    "PM gave big speeech 2day",
    "Cric match woww 🔥🔥",
    "New AI phone dropped!",
    "Oscars award show 🎬"
]
custom_pred = pipe.predict([normalize_plus(t) for t in custom_noisy])
print("Custom noisy predictions:")
for t, p in zip(custom_noisy, custom_pred):
    print(f"- {t} -> {p}")

# -------------------------------------
# 8) Save artifacts
# -------------------------------------
joblib.dump(pipe, "text_cls_stacked.joblib")
cm_df.to_csv("confusion_matrix_clean.csv", index=True)
print("\nSaved: text_cls_stacked.joblib, confusion_matrix_clean.csv")


=== CLEAN TEST ===
Accuracy:  0.7549
Macro-F1:  0.8245

Classification Report:
                precision    recall  f1-score   support

entertainment       1.00      1.00      1.00       122
         food       1.00      1.00      1.00       298
     politics       1.00      1.00      1.00       510
       sports       0.35      1.00      0.51       278
         tech       1.00      0.44      0.61       930

     accuracy                           0.75      2138
    macro avg       0.87      0.89      0.82      2138
 weighted avg       0.92      0.75      0.77      2138


Confusion Matrix:
                     pred_entertainment  pred_food  pred_politics  pred_sports  \
true_entertainment                 122          0              0            0   
true_food                            0        298              0            0   
true_politics                        0          0            510            0   
true_sports                          0          0              0          278 

