# LAB 4

    SHWETANK SHEKHAR
    22BEC1204

## TASK:
    Implement sentiment classifier using both BoW and N-gram language 
    models, observe the difference in accuracies.


In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from datasets import load_dataset
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix
from collections import Counter


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
dataset = pd.read_csv(r"C:\Users\shwet\Downloads\archive\movie.csv")
sentences = dataset['text'].tolist()
sentiment = dataset['label'].tolist()

In [3]:

def preprocess(text, stop_words):
    clean = re.sub(r'<[^>]+>', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean.lower())
    words = word_tokenize(clean)
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

stop_words = set(stopwords.words('english'))
processed_sentences = [preprocess(s, stop_words) for s in sentences]


In [4]:
# Create a vocabulary and index mapping for BoW
def create_vocab(sentences):
    vocab = set()
    for s in sentences:
        words = s.split()
        vocab.update(words)
    vocab = sorted(list(vocab))
    return {word: i for i, word in enumerate(vocab)}

word_to_index = create_vocab(processed_sentences)

BOW-VECTORIZATION

In [5]:
# Fixed BoW vectorization using sparse matrix
def create_bow_vectors_sparse(sentences, word_to_index):
    num_sentences = len(sentences)
    vocab_size = len(word_to_index)
    vectors = lil_matrix((num_sentences, vocab_size), dtype=np.int32)
    
    for i, s in enumerate(sentences):
        words = s.split()
        for word in words:
            if word in word_to_index:
                idx = word_to_index[word]
                vectors[i, idx] += 1
    
    return vectors.tocsr()

X_bow = create_bow_vectors_sparse(processed_sentences, word_to_index)
y = np.array(sentiment)


In [6]:
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train_bow, y_train)

y_pred_bow = clf_bow.predict(X_test_bow)
print("BoW Model Results")
print("Accuracy:", accuracy_score(y_test, y_pred_bow))
print(classification_report(y_test, y_pred_bow))


BoW Model Results
Accuracy: 0.87875
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3966
           1       0.88      0.88      0.88      4034

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000



In [7]:

def predict_bow(text, model, word_to_index, stop_words):
    vector = lil_matrix((1, len(word_to_index)), dtype=np.int32)
    clean = re.sub(r'<[^>]+>', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean.lower())
    words = word_tokenize(clean)
    words = [word for word in words if word not in stop_words]
    for word in words:
        if word in word_to_index:
            vector[0, word_to_index[word]] += 1
    prediction = model.predict(vector)
    return "Positive" if prediction[0] == 1 else "Negative"


In [8]:
test_sentence_pos = "This movie was absolutely wonderful!"
test_sentence_neg = "I hated this movie, it was so boring."
print("Prediction for '", test_sentence_pos, "':", predict_bow(test_sentence_pos, clf_bow, word_to_index, stop_words))
print("Prediction for '", test_sentence_neg, "':", predict_bow(test_sentence_neg, clf_bow, word_to_index, stop_words))

Prediction for ' This movie was absolutely wonderful! ': Positive
Prediction for ' I hated this movie, it was so boring. ': Negative


N-GRAM

In [9]:
def generate_ngrams(tokens, n):
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngrams.append(ngram)
    return ngrams

def create_ngram_vectors_sparse(sentences, ngram_to_index):
    num_sentences = len(sentences)
    vocab_size = len(ngram_to_index)
    vectors = lil_matrix((num_sentences, vocab_size), dtype=np.int32)
    
    for i, s in enumerate(sentences):
        tokens = s.split()
        ngrams = generate_ngrams(tokens, 1) + generate_ngrams(tokens, 2)
        for ng in ngrams:
            if ng in ngram_to_index:
                vectors[i, ngram_to_index[ng]] += 1
    
    return vectors.tocsr()


In [10]:
# Create N-gram vocabulary
all_ngrams = []
for sentence in processed_sentences:
    tokens = sentence.split()
    unigrams = generate_ngrams(tokens, 1)
    bigrams = generate_ngrams(tokens, 2)
    all_ngrams.extend(unigrams + bigrams)
    
ngram_counter = Counter(all_ngrams)
top_k = 20000
vocab = [ng for ng, _ in ngram_counter.most_common(top_k)]
ngram_to_index = {ng: i for i, ng in enumerate(vocab)}


In [11]:
# Generate N-gram vectors
X_ngram = create_ngram_vectors_sparse(processed_sentences, ngram_to_index)

In [12]:
X_train_ngram, X_test_ngram, y_train, y_test = train_test_split(X_ngram, y, test_size=0.2, random_state=42)
clf_ngram = LogisticRegression(max_iter=1000)
clf_ngram.fit(X_train_ngram, y_train)

y_pred_ngram = clf_ngram.predict(X_test_ngram)
print("\n N-gram Model Results")
print("Accuracy:", accuracy_score(y_test, y_pred_ngram))
print(classification_report(y_test, y_pred_ngram))


 N-gram Model Results
Accuracy: 0.879125
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3966
           1       0.88      0.88      0.88      4034

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000



In [13]:
def predict_ngram(text, model, ngram_to_index, stop_words):
    clean = re.sub(r'<[^>]+>', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean.lower())
    tokens = word_tokenize(clean)
    tokens = [word for word in tokens if word not in stop_words]
    
    vector = lil_matrix((1, len(ngram_to_index)), dtype=np.int32)
    ngrams = generate_ngrams(tokens, 1) + generate_ngrams(tokens, 2)
    for ng in ngrams:
        if ng in ngram_to_index:
            vector[0, ngram_to_index[ng]] += 1
    
    prediction = model.predict(vector)
    return "Positive" if prediction[0] == 1 else "Negative"

In [14]:
test_sentence_pos = "This movie was absolutely wonderful!"
test_sentence_neg = "I hated this movie, it was so boring."
print("Prediction for '", test_sentence_pos, "':", predict_ngram(test_sentence_pos, clf_ngram, ngram_to_index, stop_words))
print("Prediction for '", test_sentence_neg, "':", predict_ngram(test_sentence_neg, clf_ngram, ngram_to_index, stop_words))

Prediction for ' This movie was absolutely wonderful! ': Positive
Prediction for ' I hated this movie, it was so boring. ': Negative
