# Text, Tobacco & Spam Analysis — Lab 7

**Student:** Parv  
**Generated on:** 2025-11-12 06:16:04

**Note:** This notebook reads `dataset_reviews.csv` (a local IMDB-like sample), `GYTS.csv`, and `spam.csv`, all of which must be present in `/mnt/data/`. It also writes `spam_cleaned.csv` to the same directory.

---


In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from wordcloud import WordCloud
# Notebook-wide matplotlib defaults: 120 dpi figures with a 10 pt base font.
plt.rcParams['figure.dpi'] = 120
plt.rcParams['font.size'] = 10

print('Imports ready')

## Part I — Text Preprocessing & Feature Extraction (6 marks)
Using `dataset_reviews.csv` (local sample).

In [None]:
# Read the review file and report what it contains.
reviews_path = '/mnt/data/dataset_reviews.csv'
reviews = pd.read_csv(reviews_path)
print('Columns:', reviews.columns.tolist())
print('Total reviews available:', len(reviews))

# Draw a reproducible random sample (fixed seed), capped at 1000 rows so that
# smaller files are used in full; the index is renumbered from 0.
n_sample = min(1000, len(reviews))
sample = reviews.sample(n=n_sample, random_state=42).reset_index(drop=True)
print('Sample size:', sample.shape)

# Preview the two reviews that the POS-tagging cell works on.
sample.head(2)


In [None]:
# Preprocessing functions
import string
from nltk import word_tokenize, pos_tag

# Shared NLP helpers used by clean_text.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
# `stopwords` is imported in the notebook's import cell, so the former
# `'stopwords' in globals()` guard was always true; its `else set()` branch was
# unreachable and would have silently disabled stop-word removal.
stop_en = set(stopwords.words('english'))

# Contraction -> expansion table; keys are lower-case and written with the ASCII apostrophe.
_contractions = {
    "don't":"do not","can't":"cannot","i'm":"i am","it's":"it is","you're":"you are"
}

def expand_contractions(text):
    """Lower-case `text` and expand the contractions listed in `_contractions`.

    A contraction is matched as a whole word and with either the ASCII
    apostrophe (') or the typographic one (U+2019), so "Don't" and "Don\u2019t"
    both become "do not". Everything else in the string is left untouched apart
    from the lower-casing.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Lower-cased text with known contractions expanded.
    """
    text = text.lower()
    for short, full in _contractions.items():
        # Build the pattern piecewise around the apostrophe so either apostrophe
        # form matches, while the letters themselves are escaped literally.
        letter_parts = (re.escape(part) for part in short.split("'"))
        pattern = r'\b' + "['\u2019]".join(letter_parts) + r'\b'
        # A callable replacement keeps the expansion literal (no backslash handling).
        text = re.sub(pattern, lambda _m, full=full: full, text)
    return text

def clean_text(text):
    """Normalise one raw review into a space-joined string of processed tokens.

    Steps, in order:
      1. missing values (``pd.isna``) yield an empty string;
      2. HTML-style tags are removed;
      3. the text is lower-cased and contractions are expanded (``expand_contractions``);
      4. every character that is not a letter or whitespace becomes a space;
      5. the text is tokenized with ``word_tokenize``;
      6. tokens present in ``stop_en`` are dropped;
      7. each remaining token is Snowball-stemmed, then passed through the WordNet lemmatizer.

    Parameters
    ----------
    text : object
        Raw review; converted with ``str()`` unless it is missing.

    Returns
    -------
    str
        Processed tokens joined by single spaces ('' for missing input).
    """
    if pd.isna(text):
        return ''
    # Drop markup such as "<br />" before the letters-only filter; otherwise the
    # tag name survives as an ordinary token (e.g. "br") and pollutes the counts.
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', str(text))
    text = expand_contractions(text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    tokens = [t for t in word_tokenize(text) if t not in stop_en]
    stems = [stemmer.stem(t) for t in tokens]
    lemmas = [lemmatizer.lemmatize(t) for t in stems]
    return ' '.join(lemmas)

# Apply cleaning on sample (this may take time).
# The first column is passed through as-is: clean_text already converts values
# with str() and maps missing values to ''. Casting with astype(str) beforehand
# would turn NaN into the literal string 'nan' and add a bogus 'nan' token.
# NOTE(review): assumes the review text is the first column of the file - confirm.
sample['cleaned'] = sample.iloc[:, 0].apply(clean_text)
print('Cleaned sample (first 5):')
print(sample['cleaned'].head())


In [None]:
# Feature extraction: bag-of-words counts for unigrams, bigrams and trigrams.
def _ngram_frequencies(fitted_vec, matrix, label):
    """Build a frequency table for a fitted CountVectorizer.

    `matrix` is the document-term matrix produced by `fitted_vec`; the result has
    one row per vocabulary entry (column `label`) with its corpus-wide total in
    'count', sorted from most to least frequent.
    """
    totals = np.asarray(matrix.sum(axis=0)).ravel()
    table = pd.DataFrame({label: fitted_vec.get_feature_names_out(), 'count': totals})
    return table.sort_values('count', ascending=False)

cleaned_docs = sample['cleaned']

# Unigrams (vocabulary capped at 5000 features)
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=5000)
dtm = vectorizer.fit_transform(cleaned_docs)
bow_vocab = vectorizer.get_feature_names_out()
word_counts = np.asarray(dtm.sum(axis=0)).ravel()
freq_df = _ngram_frequencies(vectorizer, dtm, 'word')
print('Top 5 words:')
print(freq_df.head(5))

# Bigrams (capped at 2000 features)
vect_2 = CountVectorizer(ngram_range=(2, 2), max_features=2000)
bi = vect_2.fit_transform(cleaned_docs)
bi_df = _ngram_frequencies(vect_2, bi, 'bigram')
print('\nTop 5 bigrams:')
print(bi_df.head(5))

# Trigrams (capped at 1000 features)
vect_3 = CountVectorizer(ngram_range=(3, 3), max_features=1000)
tri = vect_3.fit_transform(cleaned_docs)
tri_df = _ngram_frequencies(vect_3, tri, 'trigram')
print('\nTop 3 trigrams:')
print(tri_df.head(3))


In [None]:
# Part-of-speech tags for the first two reviews, taken from the raw
# (uncleaned) text in the first column of the sample.
from nltk import pos_tag, word_tokenize
orig_texts = sample.iloc[:2,0].astype(str).tolist()
for review_no, raw_text in enumerate(orig_texts, start=1):
    review_tokens = word_tokenize(raw_text)
    print(f'POS tags for review {review_no}:')
    # Only the first 50 (token, tag) pairs are shown to keep the output short.
    tagged = pos_tag(review_tokens)
    print(tagged[:50])


## Part II — GYTS (Youth Tobacco Awareness) (6 marks)
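Using `GYTS.csv`. The India-level cells use the row with State/UT == 'India' and Area == 'Total'; the winsorization cell uses the remaining State/UT rows (India excluded) with Area == 'Total'.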

In [None]:
# Read the GYTS survey table and list its columns.
gyts = pd.read_csv('/mnt/data/GYTS.csv')
print('GYTS columns:', gyts.columns.tolist())

# Keep the national aggregate: State/UT equal to 'india' and Area equal to
# 'total', compared after trimming whitespace and lower-casing.
is_india = gyts['State/UT'].str.strip().str.lower() == 'india'
is_total_area = gyts['Area'].str.strip().str.lower() == 'total'
gyts_ind = gyts[is_india & is_total_area]
print('Rows for India Area=Total:', len(gyts_ind))

gyts_ind.head()


In [None]:
# Collect the exposure figures that a waffle chart would be built from.
# A column qualifies when its name contains 'exposure' or 'exposed' (case-insensitive).
exp_cols = [c for c in gyts.columns if any(key in c.lower() for key in ('exposure', 'exposed'))]
print('Exposure columns found:', exp_cols)
if not exp_cols or len(gyts_ind) == 0:
    print('No exposure columns detected or no India row found; please inspect the GYTS file.')
else:
    # First (India, Area=Total) row, restricted to the exposure columns.
    vals = gyts_ind[exp_cols].iloc[0]
    print('Exposure values:')
    print(vals)


In [None]:
# Winsorization of usage columns (India excluded) - filter Area == Total and State/UT != India
from scipy.stats.mstats import winsorize

def _winsorize_column(col, limits=(0.05, 0.05)):
    """Return `col` as floats with its non-missing values winsorized.

    `limits` are the lower/upper fractions passed to scipy's `winsorize`.
    Missing values stay where they are, so the result keeps the input's index
    and length regardless of how many NaNs the column holds.
    """
    out = col.astype(float)
    present = out.notna()
    if present.any():
        clipped = winsorize(out[present].to_numpy(), limits=limits)
        out[present] = np.ma.getdata(clipped)
    return out

usage_keywords = ['current tobacco', 'current smoker', 'cigarette', 'bidi', 'smokeless']
usage_cols = [c for c in gyts.columns if any(k in c.lower() for k in usage_keywords)]
print('Detected usage columns:', usage_cols)

subset = gyts[(gyts['State/UT'].str.strip().str.lower()!='india') & (gyts['Area'].str.strip().str.lower()=='total')]
print('Rows for states (Area=Total):', len(subset))

# Coerce to numeric so a stray non-numeric entry becomes NaN instead of
# breaking the mean; already-numeric columns pass through unchanged.
usage_numeric = subset[usage_cols].apply(pd.to_numeric, errors='coerce')
orig_means = usage_numeric.mean()

# apply winsorization at 5% both tails; `wins` has the same shape and index as
# `usage_numeric`, so both means line up column by column.
wins = usage_numeric.apply(_winsorize_column)
wins_means = pd.DataFrame({'original_mean': orig_means, 'winsorized_mean': wins.mean()})
print(wins_means)

# Plot comparison
if wins_means.empty:
    print('No usage columns detected; nothing to plot.')
else:
    wins_means.plot(kind='bar', figsize=(8,4))
    plt.title('Original vs Winsorized means (states, Area=Total)')
    plt.ylabel('Percentage')
    plt.tight_layout()
    plt.show()


## Part III — Spam vs Ham (6 marks)
Using `spam.csv`. Clean text, save cleaned csv, and generate two word clouds.

In [None]:
# Load spam dataset
spam = pd.read_csv('/mnt/data/spam.csv', encoding='latin-1')

# Identify the label and message columns by name: a column whose name contains
# 'label' (or equals 'v1') holds the class, one whose name contains 'message'
# (or equals 'v2') holds the text. If several columns match, the last one wins.
label_col = None
msg_col = None
for c in spam.columns:
    col_name = str(c).strip().lower()
    if 'label' in col_name or col_name == 'v1':
        label_col = c
    if 'message' in col_name or col_name == 'v2':
        msg_col = c
print('Detected:', label_col, msg_col)

# Fail with a readable message instead of indexing the frame with None.
if label_col is None or msg_col is None:
    raise KeyError(
        'Could not detect the label/message columns in spam.csv; '
        f'available columns: {spam.columns.tolist()}'
    )

# Keep only the two columns under fixed names, drop incomplete rows, and
# normalise the label text (trimmed, lower-case).
spam = spam[[label_col, msg_col]].rename(columns={label_col:'label', msg_col:'message'})
spam = spam.dropna()
spam['label'] = spam['label'].str.strip().str.lower()

# Cleaning function
def clean_msg(s):
    """Return `s` lower-cased, reduced to letters, with stop words removed.

    The value is converted with ``str()``, lower-cased, and every character
    other than a-z or whitespace is replaced by a space. The text is then split
    on whitespace, words found in ``ENGLISH_STOP_WORDS`` are discarded, and the
    rest is joined with single spaces.
    """
    lowered = str(s).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return ' '.join(word for word in letters_only.split() if word not in ENGLISH_STOP_WORDS)

# Add the cleaned text next to the original message, then persist the frame
# (label, message, cleaned) without the index column.
cleaned_messages = spam['message'].map(clean_msg)
spam['cleaned'] = cleaned_messages
spam.to_csv('/mnt/data/spam_cleaned.csv', index=False)
print('Saved cleaned spam to /mnt/data/spam_cleaned.csv')
spam.head()


In [None]:
# Word clouds built from the 100 longest messages of each class.
# Length is measured on the cleaned text, not on the original message.
from wordcloud import WordCloud

def _longest_cleaned_text(label, top_n=100):
    """Join the `top_n` longest cleaned messages with the given label into one string."""
    cleaned = spam.loc[spam['label'] == label, 'cleaned']
    longest = cleaned.sort_values(key=lambda s: s.str.len(), ascending=False).head(top_n)
    return longest.str.cat(sep=' ')

spam_msgs = _longest_cleaned_text('spam')
ham_msgs = _longest_cleaned_text('ham')

wc_spam = WordCloud(width=600, height=400).generate(spam_msgs)
wc_ham = WordCloud(width=600, height=400).generate(ham_msgs)

# Side-by-side panels: spam on the left, ham on the right.
fig, (ax_spam, ax_ham) = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (ax_spam, wc_spam, 'Spam (top 100 longest)'),
    (ax_ham, wc_ham, 'Ham (top 100 longest)'),
)
for ax, cloud, panel_title in panels:
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(panel_title)
plt.show()


---

**End of Lab 7 notebook.**

To reproduce the tables and plots, run every cell in order from a fresh kernel (Kernel → Restart & Run All).