In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy

In [3]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivamsinghrawat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivamsinghrawat/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [1]:
##Function get dataframe from text file
def load_txt_data(filepath):
    
    texts = []
    labels = []
    
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        
        
        for line in lines[1:]:  
            parts = line.strip().split("\t")  
            
            print(len(parts))
            
            if len(parts) < 2:
                parts = line.strip().split()  
            
            if len(parts) >= 2:
                tweet = " ".join(parts[1:-1])  
                label = parts[-1]  
                texts.append(tweet)
                
                
                sentiment_mapping = {
                    "0: neutral or mixed emotional state can be inferred": 0,
                    "1: slightly positive emotional state can be inferred": 1,
                    "2: moderately positive emotional state can be inferred": 2,
                    "3: very positive emotional state can be inferred": 3,
                    "-1: slightly negative emotional state can be inferred": -1,
                    "-2: moderately negative emotional state can be inferred": -2,
                    "-3: very negative emotional state can be inferred": -3
                }
                
                
                labels.append(sentiment_mapping.get(label, np.nan))
    
    df = pd.DataFrame({"Tweet": texts, "Sentiment": labels})
    
    df.dropna(inplace=True)
    df["Sentiment"] = df["Sentiment"].astype(int)
    
    return df

In [2]:
# Loading training data
train_data = load_txt_data("train.txt")

4


NameError: name 'np' is not defined

In [15]:
#Loading test datasets
test_data = load_txt_data("test.txt") 

In [9]:
num_train_reviews = len(train_data)
print(f"Number of reviews in the training set: {num_train_reviews}")

Number of reviews in the training set: 1181


In [10]:
num_test_reviews = len(test_data)
print(f"Number of reviews in the testing set: {num_test_reviews}")

Number of reviews in the testing set: 937


In [16]:
train_data[:5]

Unnamed: 0,Tweet,Sentiment
0,@liamch88 yeah! :) playing well valence,0
1,At least I don't have a guy trying to discoura...,0
2,UPLIFT: If you're still discouraged it means y...,0
3,"...at your age, the heyday in the blood is tam...",0
4,i was so embarrassed when she saw us i was lik...,-2


## POS Tagging

In [11]:
import nltk

# # Download the NLTK tagger model, if necessary. You can comment this out once the tagger is downloaded.
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')

# class for tokenization
class Splitter(object):
    # load the tokenizer
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
    #split input 
    def split(self, text):
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences

# class for POS tagging
class POSTagger(object):
    def __init__(self):
        pass
    def pos_tag(self, sentences):
        pos = [nltk.pos_tag(sent) for sent in sentences]
        return pos
    
splitter = Splitter()
postagger = POSTagger()

In [12]:
train_pos_data = train_data

In [18]:
print(train_pos_data.Tweet.tolist()[3])
print("\n")

tweet = train_pos_data.Tweet.tolist()[3]
splitted_sentences = splitter.split(tweet)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
for sentence in pos_tagged_sentences:
    for words in sentence:
        print(words)
    print("\n")

...at your age, the heyday in the blood is tame...' @TheArtofCharm #shakespeareaninsults #hamlet #elizabethan #williamshakespeare valence


('...', ':')
('at', 'IN')
('your', 'PRP$')
('age', 'NN')
(',', ',')
('the', 'DT')
('heyday', 'NN')
('in', 'IN')
('the', 'DT')
('blood', 'NN')
('is', 'VBZ')
('tame', 'JJ')
('...', ':')
("'", 'POS')
('@', 'JJ')
('TheArtofCharm', 'NNP')
('#', '#')
('shakespeareaninsults', 'NNS')
('#', '#')
('hamlet', 'NN')
('#', '#')
('elizabethan', 'JJ')
('#', '#')
('williamshakespeare', 'NN')
('valence', 'NN')




In [21]:
print(train_pos_data.Tweet.tolist()[9])
print("\n")

tweet = train_pos_data.Tweet.tolist()[9]
splitted_sentences = splitter.split(tweet)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
for sentence in pos_tagged_sentences:
    for words in sentence:
        print(words)
    print("\n")

Just because I'm hurting \nDoesn't mean I'm hurt \nDoesn't mean I didn't get \nWhat I deserved \nNo better and no worse #lost  @coldplay valence


('Just', 'RB')
('because', 'IN')
('I', 'PRP')
("'m", 'VBP')
('hurting', 'VBG')
('\\nDoes', 'VBP')
("n't", 'RB')
('mean', 'VB')
('I', 'PRP')
("'m", 'VBP')
('hurt', 'JJ')
('\\nDoes', 'VBP')
("n't", 'RB')
('mean', 'VB')
('I', 'PRP')
('did', 'VBD')
("n't", 'RB')
('get', 'VB')
('\\nWhat', 'RB')
('I', 'PRP')
('deserved', 'VBD')
('\\nNo', 'NNP')
('better', 'RBR')
('and', 'CC')
('no', 'DT')
('worse', 'JJR')
('#', '#')
('lost', 'VBN')
('@', 'JJ')
('coldplay', 'NN')
('valence', 'NN')




In [22]:
print(train_pos_data.Tweet.tolist()[12])
print("\n")

tweet = train_pos_data.Tweet.tolist()[12]
splitted_sentences = splitter.split(tweet)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
for sentence in pos_tagged_sentences:
    for words in sentence:
        print(words)
    print("\n")

We have been a better second half team this season. So there's that.  valence


('We', 'PRP')
('have', 'VBP')
('been', 'VBN')
('a', 'DT')
('better', 'JJR')
('second', 'JJ')
('half', 'NN')
('team', 'NN')
('this', 'DT')
('season', 'NN')
('.', '.')


('So', 'IN')
('there', 'EX')
("'s", 'VBZ')
('that', 'DT')
('.', '.')


('valence', 'NN')




In [33]:
### TASK 3: Extract Unigram Features ###

X_train, y_train = train_data["Tweet"], train_data["Sentiment"]
X_test, y_test = test_data["Tweet"], test_data["Sentiment"]

vectorizer = CountVectorizer(ngram_range=(1, 1))  # Unigrams
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)


print(f"Number of features in training set: {X_train_features.shape[1]}")
print(f"Number of features in test set: {X_test_features.shape[1]}")

Number of features in training set: 5039
Number of features in test set: 5039


In [35]:
### TASK 4: Train and Evaluate Naïve Bayes Classifier ###

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_features, y_train)


y_pred = nb_classifier.predict(X_test_features)


accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


class_report = classification_report(y_test, y_pred, digits=4, zero_division=0, target_names=[
    "Very Negative (-3)", "Moderately Negative (-2)", "Slightly Negative (-1)",
    "Neutral (0)", "Slightly Positive (1)", "Moderately Positive (2)", "Very Positive (3)"
])

print("Classification Report:\n", class_report)

Model Accuracy: 0.3127
Classification Report:
                           precision    recall  f1-score   support

      Very Negative (-3)     0.5000    0.0645    0.1143        93
Moderately Negative (-2)     0.3179    0.2874    0.3019       167
  Slightly Negative (-1)     0.0000    0.0000    0.0000        80
             Neutral (0)     0.2986    0.8206    0.4379       262
   Slightly Positive (1)     0.1304    0.0280    0.0462       107
 Moderately Positive (2)     0.0000    0.0000    0.0000        91
       Very Positive (3)     0.6774    0.1533    0.2500       137

                accuracy                         0.3127       937
               macro avg     0.2749    0.1934    0.1643       937
            weighted avg     0.3037    0.3127    0.2294       937



In [36]:
### TASK 5: Unigram + Bigram Features ###
vectorizer_bigram = CountVectorizer(ngram_range=(1, 2))  # Unigram + Bigram
X_train_bigram = vectorizer_bigram.fit_transform(X_train)
X_test_bigram = vectorizer_bigram.transform(X_test)


print(f"Number of features in training set (Unigram + Bigram): {X_train_bigram.shape[1]}")
print(f"Number of features in test set (Unigram + Bigram): {X_test_bigram.shape[1]}")


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bigram, y_train)


y_pred_bigram = nb_classifier.predict(X_test_bigram)


accuracy_bigram = accuracy_score(y_test, y_pred_bigram)
print(f"Model Accuracy (Unigram + Bigram): {accuracy_bigram:.4f}")


class_report_bigram = classification_report(y_test, y_pred_bigram, digits=4, zero_division=0, target_names=[
    "Very Negative (-3)", "Moderately Negative (-2)", "Slightly Negative (-1)",
    "Neutral (0)", "Slightly Positive (1)", "Moderately Positive (2)", "Very Positive (3)"
])
print("Classification Report (Unigram + Bigram):\n", class_report_bigram)


Number of features in training set (Unigram + Bigram): 19390
Number of features in test set (Unigram + Bigram): 19390
Model Accuracy (Unigram + Bigram): 0.3020
Classification Report (Unigram + Bigram):
                           precision    recall  f1-score   support

      Very Negative (-3)     0.5000    0.0645    0.1143        93
Moderately Negative (-2)     0.2734    0.2275    0.2484       167
  Slightly Negative (-1)     0.0000    0.0000    0.0000        80
             Neutral (0)     0.2915    0.8244    0.4307       262
   Slightly Positive (1)     0.0000    0.0000    0.0000       107
 Moderately Positive (2)     0.0000    0.0000    0.0000        91
       Very Positive (3)     0.7419    0.1679    0.2738       137

                accuracy                         0.3020       937
               macro avg     0.2581    0.1835    0.1525       937
            weighted avg     0.2883    0.3020    0.2161       937



In [37]:
### TASK 6: Unigram + Bigram + Trigram Features ###
vectorizer_trigram = CountVectorizer(ngram_range=(1, 3))  # Unigram + Bigram + Trigram
X_train_trigram = vectorizer_trigram.fit_transform(X_train)
X_test_trigram = vectorizer_trigram.transform(X_test)


print(f"Number of features in training set (Unigram + Bigram + Trigram): {X_train_trigram.shape[1]}")
print(f"Number of features in test set (Unigram + Bigram + Trigram): {X_test_trigram.shape[1]}")


nb_classifier.fit(X_train_trigram, y_train)


y_pred_trigram = nb_classifier.predict(X_test_trigram)


accuracy_trigram = accuracy_score(y_test, y_pred_trigram)
print(f"Model Accuracy (Unigram + Bigram + Trigram): {accuracy_trigram:.4f}")


class_report_trigram = classification_report(y_test, y_pred_trigram, digits=4, zero_division=0, target_names=[
    "Very Negative (-3)", "Moderately Negative (-2)", "Slightly Negative (-1)",
    "Neutral (0)", "Slightly Positive (1)", "Moderately Positive (2)", "Very Positive (3)"
])
print("Classification Report (Unigram + Bigram + Trigram):\n", class_report_trigram)

Number of features in training set (Unigram + Bigram + Trigram): 35185
Number of features in test set (Unigram + Bigram + Trigram): 35185
Model Accuracy (Unigram + Bigram + Trigram): 0.2999
Classification Report (Unigram + Bigram + Trigram):
                           precision    recall  f1-score   support

      Very Negative (-3)     0.6667    0.0645    0.1176        93
Moderately Negative (-2)     0.2698    0.2036    0.2321       167
  Slightly Negative (-1)     0.0000    0.0000    0.0000        80
             Neutral (0)     0.2855    0.8282    0.4247       262
   Slightly Positive (1)     0.0000    0.0000    0.0000       107
 Moderately Positive (2)     1.0000    0.0110    0.0217        91
       Very Positive (3)     0.7667    0.1679    0.2754       137

                accuracy                         0.2999       937
               macro avg     0.4270    0.1822    0.1531       937
            weighted avg     0.4033    0.2999    0.2142       937



In [40]:
### TASK 7: TF-IDF Feature Extraction for Best Model ###
vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 1))  # TF-IDF for Unigram
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)


y_pred_tfidf = nb_classifier.predict(X_test_tfidf)


accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"Model Accuracy (TF-IDF): {accuracy_tfidf:.4f}")


class_report_tfidf = classification_report(y_test, y_pred_tfidf, digits=4, zero_division=0, target_names=[
    "Very Negative (-3)", "Moderately Negative (-2)", "Slightly Negative (-1)",
    "Neutral (0)", "Slightly Positive (1)", "Moderately Positive (2)", "Very Positive (3)"
])
print("Classification Report (TF-IDF):\n", class_report_tfidf)

Model Accuracy (TF-IDF): 0.2828
Classification Report (TF-IDF):
                           precision    recall  f1-score   support

      Very Negative (-3)     0.0000    0.0000    0.0000        93
Moderately Negative (-2)     0.2647    0.0539    0.0896       167
  Slightly Negative (-1)     0.0000    0.0000    0.0000        80
             Neutral (0)     0.2838    0.9771    0.4399       262
   Slightly Positive (1)     0.0000    0.0000    0.0000       107
 Moderately Positive (2)     0.0000    0.0000    0.0000        91
       Very Positive (3)     0.0000    0.0000    0.0000       137

                accuracy                         0.2828       937
               macro avg     0.0784    0.1473    0.0756       937
            weighted avg     0.1265    0.2828    0.1390       937



In [53]:
### TASK 8: Preprocessing + Apply Best Model ###
from nltk.stem import PorterStemmer 
# Function to preprocess text
def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r'\W', ' ', text)  
    text = re.sub(r'\s+', ' ', text)  
    tokens = word_tokenize(text) 
    tokens = [word for word in tokens if word not in stopwords.words('english')]  
    stemmer = PorterStemmer()  # Initialize stemmer
    tokens = [stemmer.stem(word) for word in tokens]  # Apply stemming
    return " ".join(tokens)



X_train_preprocessed = X_train.apply(preprocess_text)
X_test_preprocessed = X_test.apply(preprocess_text)


vectorizer_unigram = CountVectorizer(ngram_range=(1, 1))  # Unigrams only
X_train_unigram_preprocessed = vectorizer_unigram.fit_transform(X_train_preprocessed)
X_test_unigram_preprocessed = vectorizer_unigram.transform(X_test_preprocessed)


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_unigram_preprocessed, y_train)
y_pred_unigram_preprocessed = nb_classifier.predict(X_test_unigram_preprocessed)


accuracy_unigram_preprocessed = accuracy_score(y_test, y_pred_unigram_preprocessed)
print(f"Model Accuracy (Unigram with Preprocessing): {accuracy_unigram_preprocessed:.4f}")


class_report_unigram_preprocessed = classification_report(y_test, y_pred_unigram_preprocessed, digits=4, zero_division=0, target_names=[
    "Very Negative (-3)", "Moderately Negative (-2)", "Slightly Negative (-1)",
    "Neutral (0)", "Slightly Positive (1)", "Moderately Positive (2)", "Very Positive (3)"
])
print("Classification Report (Unigram with Preprocessing):\n", class_report_unigram_preprocessed)

Model Accuracy (Unigram with Preprocessing): 0.3298
Classification Report (Unigram with Preprocessing):
                           precision    recall  f1-score   support

      Very Negative (-3)     0.3939    0.1398    0.2063        93
Moderately Negative (-2)     0.3096    0.3653    0.3352       167
  Slightly Negative (-1)     0.0000    0.0000    0.0000        80
             Neutral (0)     0.3129    0.7023    0.4329       262
   Slightly Positive (1)     0.1463    0.0561    0.0811       107
 Moderately Positive (2)     0.3333    0.0110    0.0213        91
       Very Positive (3)     0.5867    0.3212    0.4151       137

                accuracy                         0.3298       937
               macro avg     0.2976    0.2279    0.2131       937
            weighted avg     0.3166    0.3298    0.2733       937



In [50]:
### TASK 10: Error Analysis  ###

print("\nError Analysis (Misclassified Examples):\n")


misclassified = test_data[y_test != y_pred_unigram_preprocessed].copy()
misclassified["Original Sentiment"] = y_test[y_test != y_pred_unigram_preprocessed]
misclassified["Predicted Sentiment"] = y_pred_unigram_preprocessed[y_test != y_pred_unigram_preprocessed]


for label in sorted(y_test.unique()):  
    print(f"\nMisclassified examples for sentiment label {label}:\n")
    
    
    samples = misclassified[misclassified["Original Sentiment"] == label].sample(3, replace=True)
    
    
    for index, row in samples.iterrows():
        print(f"Tweet: {row['Tweet']}")
        print(f"Original Sentiment: {row['Original Sentiment']}, Predicted Sentiment: {row['Predicted Sentiment']}")
        print("-" * 80)



Error Analysis (Misclassified Examples):


Misclassified examples for sentiment label -3:

Tweet: @DPD_UK apparently u left a calling card... @ which address cos it certainly wasn't the address u were supposed to be delivering 2!!! #awful valence
Original Sentiment: -3, Predicted Sentiment: 0
--------------------------------------------------------------------------------
Tweet: @DPD_UK apparently u left a calling card... @ which address cos it certainly wasn't the address u were supposed to be delivering 2!!! #awful valence
Original Sentiment: -3, Predicted Sentiment: 0
--------------------------------------------------------------------------------
Tweet: discouraged valence
Original Sentiment: -3, Predicted Sentiment: 0
--------------------------------------------------------------------------------

Misclassified examples for sentiment label -2:

Tweet: I'm still feeling some type of way about Viserion. #GameOfThrones #crying #stresseating valence
Original Sentiment: -2, Predicted

In [46]:
### TASK 11: Three-Way vs. Seven-Way Classification ###

sentiment_mapping = {
    -3: "negative", -2: "negative",
    -1: "neutral", 0: "neutral", 1: "neutral",
    2: "positive", 3: "positive"
}


y_train_3way = y_train.map(sentiment_mapping)
y_test_3way = y_test.map(sentiment_mapping)

nb_classifier.fit(X_train_unigram_preprocessed, y_train_3way)

y_pred_3way = nb_classifier.predict(X_test_unigram_preprocessed)


accuracy_3way = accuracy_score(y_test_3way, y_pred_3way)
print(f"\nModel Accuracy (Three-way classification): {accuracy_3way:.4f}")


class_report_3way = classification_report(y_test_3way, y_pred_3way, digits=4, zero_division=0, target_names=["Negative", "Neutral", "Positive"])
print("Classification Report (Three-way Classification):\n", class_report_3way)


Model Accuracy (Three-way classification): 0.5816
Classification Report (Three-way Classification):
               precision    recall  f1-score   support

    Negative     0.6087    0.4846    0.5396       260
     Neutral     0.5545    0.8040    0.6564       449
    Positive     0.7342    0.2544    0.3779       228

    accuracy                         0.5816       937
   macro avg     0.6325    0.5143    0.5246       937
weighted avg     0.6133    0.5816    0.5562       937

