# Question 1

In [None]:
!pip install pandas nltk gensim scikit-learn

In [None]:
import pandas as pd

# Read only the required columns
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['Label', 'Message']
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
# preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

def preprocess(msg):
    tokens = word_tokenize(msg.lower())
    filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered

df['Processed'] = df['Message'].apply(preprocess)
df[['Message', 'Processed']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Message,Processed
0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,"Nah I don't think he goes to usf, he lives aro...","[nah, think, goes, usf, lives, around, though]"


In [None]:
import gensim.downloader as api
w2v_model = api.load('word2vec-google-news-300')
print("Word2Vec model loaded successfully!")

In [24]:
# vectorize
import numpy as np
def vectorize(tokens):
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)

df['Vector']=df['Processed'].apply(vectorize)
df[['Processed','Vector']].head()

Unnamed: 0,Processed,Vector
0,"[go, jurong, point, crazy, available, bugis, n...","[-0.019805908, 0.05167062, 0.02709961, 0.21868..."
1,"[ok, lar, joking, wif, u, oni]","[-0.06323496, 0.0803833, 0.060943604, 0.102498..."
2,"[free, entry, wkly, comp, win, fa, cup, final,...","[-0.03242302, -0.0050720214, -0.06273012, 0.11..."
3,"[u, dun, say, early, hor, u, c, already, say]","[-0.06568061, 0.0262146, 0.1081543, 0.0869751,..."
4,"[nah, think, goes, usf, lives, around, though]","[0.032470703, 0.037462506, 0.047345843, 0.1572..."


In [26]:
# splitting
from sklearn.model_selection import train_test_split

x = np.stack(df['Vector'].values)
y = df['Label'].map({'ham': 0, 'spam': 1})

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [27]:
# training logistic model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.9417040358744395


In [None]:
#func to predict class
def predict_message_class(model, w2v_model, message):
    tokens = preprocess(message)
    vector = vectorize(tokens)
    pred = model.predict([vector])[0]
    return 'spam' if pred == 1 else 'ham'

In [29]:
print(predict_message_class(model, w2v_model, "Free free free!! You won 7 crore"))
print(predict_message_class(model, w2v_model, "Hey, how are you today"))

spam
ham


# Question 2

In [31]:
import pandas as pd

df = pd.read_csv("Tweets.csv")[['airline_sentiment', 'text']]
df.columns = ['Sentiment', 'Tweet']
df.head()

Unnamed: 0,Sentiment,Tweet
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [32]:
# preprocessing
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

contractions = {"don't": "do not", "can't": "cannot", "won't": "will not", "i'm": "i am"}

def preprocess_tweet(text):
    text = text.lower()
    text = re.sub(r'http\S+|@\S+|#\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    for word, expand in contractions.items():
        text = text.replace(word, expand)

    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]

    return tokens

# Apply preprocessing
df['Processed'] = df['Tweet'].apply(preprocess_tweet)
df[['Tweet', 'Processed']].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Tweet,Processed
0,@VirginAmerica What @dhepburn said.,[said]
1,@VirginAmerica plus you've added commercials t...,"[plus, youve, added, commercial, experience, t..."
2,@VirginAmerica I didn't today... Must mean I n...,"[didnt, today, must, mean, need, take, another..."
3,@VirginAmerica it's really aggressive to blast...,"[really, aggressive, blast, obnoxious, enterta..."
4,@VirginAmerica and it's a really big bad thing...,"[really, big, bad, thing]"


In [35]:
# vectorize

def vectorize(tokens):
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)

# Apply vectorization
df['Vector'] = df['Processed'].apply(vectorize)
df[['Processed', 'Vector']].head()

Unnamed: 0,Processed,Vector
0,[said],"[-0.009094238, -0.044189453, 0.099609375, -0.0..."
1,"[plus, youve, added, commercial, experience, t...","[0.0009358724, -0.05480957, -0.04031372, 0.078..."
2,"[didnt, today, must, mean, need, take, another...","[-0.0025896344, 0.04867118, 0.0355399, 0.03494..."
3,"[really, aggressive, blast, obnoxious, enterta...","[0.0012191773, 0.09075928, -0.017980957, 0.053..."
4,"[really, big, bad, thing]","[0.11010742, 0.06271362, 0.0031738281, 0.13183..."


In [36]:
# splitting
from sklearn.model_selection import train_test_split

x = np.stack(df['Vector'].values)
y = df['Sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [37]:
#model training

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))



Test Accuracy: 0.7783469945355191


In [38]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    tokens = preprocess_tweet(tweet)
    vector = vectorize(tokens)
    pred = model.predict([vector])[0]

    return {0: 'negative', 1: 'neutral', 2: 'positive'}[pred]

In [46]:
print(predict_tweet_sentiment(model, w2v_model, "The flight was delayed and the staff was rude."))
print(predict_tweet_sentiment(model, w2v_model, "Great service! I loved the experience."))


negative
positive
