In [8]:
# Install if needed
!pip install numpy pandas scikit-learn gensim nltk

import numpy as np
import pandas as pd

from gensim.downloader import load as gensim_load
from gensim.models import KeyedVectors

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# Fetch the NLTK data needed by the tokenizer, the stop-word list and the
# WordNet lemmatizer. 'punkt_tab' is the tokenizer resource that word_tokenize
# looks up on recent NLTK releases (3.9+); 'punkt' is still downloaded so that
# older NLTK versions keep working as well.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# English stop words held in a set so membership tests during token filtering are cheap.
STOPWORDS = set(stopwords.words('english'))
# Single shared lemmatizer instance used by clean_tweet.
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Use smaller, fast-loading embeddings for quick testing.
# NOTE: 'glove-wiki-gigaword-50' is a set of GloVe vectors, not Word2Vec. The
# variable is still called `w2v` because the later cells refer to it by that name.
print("Loading GloVe embeddings (glove-wiki-gigaword-50, fast) ...")
w2v = gensim_load('glove-wiki-gigaword-50')
print("✓ GloVe embeddings loaded.")


Loading Word2Vec (glove-wiki-gigaword-50, fast) ...
✓ Word2Vec loaded.


In [9]:
def sentence_vector(sentence, w2v):
    """Average the embedding vectors of the tokens in `sentence`.

    Args:
        sentence: Iterable of tokens to look up.
        w2v: Embedding lookup supporting `token in w2v`, `w2v[token]` and a
            `vector_size` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in `w2v`.
        A zero vector of length `w2v.vector_size` is returned instead when no
        token is found, or when the mean contains NaN.
    """
    known = []
    for token in sentence:
        if token in w2v:
            known.append(w2v[token])

    # No in-vocabulary token: fall back to the zero vector.
    if not known:
        return np.zeros(w2v.vector_size)

    averaged = np.mean(known, axis=0)
    # Guard against NaNs coming from the embedding values themselves.
    if np.isnan(averaged).any():
        return np.zeros(w2v.vector_size)
    return averaged

def preprocess_sms(text):
    """Tokenize an SMS message into lowercase, alphabetic, non-stop-word tokens.

    Args:
        text: Raw message string.

    Returns:
        List of tokens from `nltk.word_tokenize` applied to the lowercased
        text, keeping only purely alphabetic tokens that are not in STOPWORDS.
    """
    kept = []
    # word_tokenize relies on the downloaded NLTK tokenizer resource.
    for token in nltk.word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

import re  # imported here because this cell's own `import re, string` only runs further down

# Contractions to expand, keyed by their lowercase, ASCII-apostrophe spelling.
_CONTRACTIONS = {
    "don't": "do not",
    "can't": "can not",
    "i'm": "i am",
    "it's": "it is",
    "won't": "will not",
    "didn't": "did not",
}

# Single alternation over every contraction. Each apostrophe position accepts
# either the ASCII (') or the typographic (\u2019) apostrophe, and the
# look-arounds require the contraction to be a whole word so that e.g. the
# "it's" inside "bandit's" is left alone.
_CONTRACTION_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        "['\u2019]".join(re.escape(part) for part in key.split("'"))
        for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
    + r")(?!\w)"
)


def expand_contractions(text):
    """Expand a small fixed set of English contractions in `text`.

    Matching is case-sensitive against lowercase spellings, so callers are
    expected to lowercase the text first (as clean_tweet does). Only whole-word
    occurrences are expanded, and both the straight and the typographic
    apostrophe are recognised.

    Args:
        text: Input string.

    Returns:
        `text` with each recognised contraction replaced by its expansion;
        everything else is returned unchanged.
    """
    return _CONTRACTION_RE.sub(
        # Normalise the apostrophe so the matched text can be used as the table key.
        lambda match: _CONTRACTIONS[match.group(0).replace("\u2019", "'")],
        text,
    )

import re, string

# Matches http:// or https:// links, and bare "www." links, up to the next whitespace.
URL_RE = re.compile(r'https?://\S+|www\.\S+')
# Matches @-mentions: '@' followed by one or more word characters.
MENTION_RE = re.compile(r'@\w+')
# Matches hashtags: '#' followed by one or more word characters.
HASHTAG_RE = re.compile(r'#\w+')

def clean_tweet(text):
    """Normalize a tweet into a list of lemmatized content tokens.

    Steps, in order: lowercase, expand contractions, remove URLs, @-mentions
    and hashtags, remove ASCII punctuation, tokenize, keep alphabetic tokens
    that are not stop words, and lemmatize what remains.

    Args:
        text: Raw tweet string.

    Returns:
        List of lemmatized tokens.
    """
    cleaned = expand_contractions(text.lower())

    # Remove URLs first, then mentions, then hashtags (the whole tag is removed).
    for pattern in (URL_RE, MENTION_RE, HASHTAG_RE):
        cleaned = pattern.sub('', cleaned)

    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    lemmas = []
    for token in word_tokenize(cleaned):
        if token.isalpha() and token not in STOPWORDS:
            lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [10]:
# Upload 'spam.csv' manually via Colab UI before running this cell
# Load the raw file, keep only the label/message columns under readable names,
# and drop rows with missing values.
df_sms = (
    pd.read_csv('/content/spam.csv', encoding='latin-1')
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
    [['Label', 'Message']]
    .dropna()
)

# Tokenize each message, then embed it as the mean of its token vectors.
df_sms['tokens'] = df_sms['Message'].apply(preprocess_sms)
X_sms = np.vstack([sentence_vector(toks, w2v) for toks in df_sms['tokens']])
y_sms = df_sms['Label'].map({'ham': 0, 'spam': 1}).values

# Stratified 80/20 split with a fixed seed.
X_train_sms, X_test_sms, y_train_sms, y_test_sms = train_test_split(
    X_sms,
    y_sms,
    test_size=0.2,
    random_state=42,
    stratify=y_sms,
)

clf_sms = LogisticRegression(max_iter=1000, solver='liblinear')
clf_sms.fit(X_train_sms, y_train_sms)
y_pred_sms = clf_sms.predict(X_test_sms)

print("\n📌 **SMS Spam Classification Results:**")
print("Accuracy:", accuracy_score(y_test_sms, y_pred_sms))
print(
    classification_report(
        y_test_sms,
        y_pred_sms,
        target_names=['ham', 'spam'],
    )
)



📌 **SMS Spam Classification Results:**
Accuracy: 0.9264573991031391
              precision    recall  f1-score   support

         ham       0.94      0.97      0.96       966
        spam       0.78      0.62      0.69       149

    accuracy                           0.93      1115
   macro avg       0.86      0.80      0.83      1115
weighted avg       0.92      0.93      0.92      1115



In [11]:
# Upload 'Tweets.csv' manually via Colab UI before running this cell
# Keep only the sentiment label and tweet text, dropping rows with missing values.
df_twitter = pd.read_csv('/content/Tweets.csv')[['airline_sentiment', 'text']].dropna()
df_twitter['tokens'] = df_twitter['text'].apply(clean_tweet)

# Embed each tweet as the mean of its token vectors.
X_twitter = np.vstack(df_twitter['tokens'].apply(lambda toks: sentence_vector(toks, w2v)))
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
y_twitter = df_twitter['airline_sentiment'].map(sentiment_map).values

# Stratified 80/20 split with a fixed seed.
X_train_twitter, X_test_twitter, y_train_twitter, y_test_twitter = train_test_split(
    X_twitter, y_twitter, test_size=0.2, random_state=42, stratify=y_twitter
)

# One-vs-rest logistic regression. The `multi_class='ovr'` argument of
# LogisticRegression is deprecated in recent scikit-learn releases; wrapping the
# estimator in OneVsRestClassifier is the documented replacement and fits one
# binary classifier per sentiment class.
clf_twitter = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf_twitter.fit(X_train_twitter, y_train_twitter)
y_pred_twitter = clf_twitter.predict(X_test_twitter)

print("\n📌 **Twitter Sentiment Classification Results:**")
print("Accuracy:", accuracy_score(y_test_twitter, y_pred_twitter))
print(classification_report(y_test_twitter, y_pred_twitter, target_names=['negative', 'neutral', 'positive']))





📌 **Twitter Sentiment Classification Results:**
Accuracy: 0.7213114754098361
              precision    recall  f1-score   support

    negative       0.74      0.94      0.83      1835
     neutral       0.59      0.30      0.40       620
    positive       0.71      0.44      0.54       473

    accuracy                           0.72      2928
   macro avg       0.68      0.56      0.59      2928
weighted avg       0.70      0.72      0.69      2928



In [12]:
# Problem 1: classify one hand-written SMS with the spam/ham model
test_sms = "Congratulations! You've won a free cruise to the Bahamas. Call now."
tokens_sms = preprocess_sms(test_sms)
# The classifier expects a 2-D array, so add a leading batch axis of size 1.
vec_sms = sentence_vector(tokens_sms, w2v)[np.newaxis, :]
pred_sms = clf_sms.predict(vec_sms)[0]
if pred_sms == 1:
    print("\nTest SMS prediction:", "spam")
else:
    print("\nTest SMS prediction:", "ham")

# Problem 2: classify one hand-written tweet with the sentiment model
test_tweet = "I love how friendly the flight attendants were on my @SouthwestAir flight!"
tokens_tweet = clean_tweet(test_tweet)
vec_tweet = sentence_vector(tokens_tweet, w2v)[np.newaxis, :]
pred_tweet = clf_twitter.predict(vec_tweet)[0]
# Map the numeric class back to its sentiment name.
_sentiment_names = {0: 'negative', 1: 'neutral', 2: 'positive'}
print("Test Tweet Sentiment Prediction:", _sentiment_names[pred_tweet])



Test SMS prediction: spam
Test Tweet Sentiment Prediction: negative
