<a href="https://colab.research.google.com/github/Sammodi0711/NLP-Sem-1/blob/main/NLP_assignment6_b1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Dataset:
# •	A training dataset of 100 sentences (provided in CSV format: train.csv), each labeled as positive (+ve) or negative (-ve).
# •	A test dataset of 20 sentences (provided in CSV format: test.csv), without labels.
# Tasks:
# 1.	Preprocessing:
# o	Use the training dataset as it is (do not remove stopwords).
# o	Tokenize the text into words.

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

train = pd.read_csv("/content/Assignment_6-train_100_wl.csv")
test = pd.read_csv("/content/Assignment_6-test_20_wol.csv")

train["tokens"] = train["text"].apply(word_tokenize)
test["tokens"] = test["text"].apply(word_tokenize)

print(train.head())
print(test.head())

                           text  category                            tokens
0          predictable and dull  negative          [predictable, and, dull]
1                 waste of time  negative                 [waste, of, time]
2        story was good and fun  positive      [story, was, good, and, fun]
3  wonderful experience overall  positive  [wonderful, experience, overall]
4           acting was terrible  negative           [acting, was, terrible]
                                  text  \
0                      very few laughs   
1          plain boring and uninspired   
2               story was good and fun   
3               story was good and fun   
4  excellent direction and performance   

                                     tokens  
0                       [very, few, laughs]  
1          [plain, boring, and, uninspired]  
2              [story, was, good, and, fun]  
3              [story, was, good, and, fun]  
4  [excellent, direction, and, performance]  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# 2.	Model Building:
# o	Calculate prior probabilities of each class (+ve and -ve).
# o	Calculate likelihood probabilities of each word given a class using Naïve Bayes formula.

import numpy as np
import re
from collections import defaultdict

train = pd.read_csv("/content/Assignment_6-train_100_wl.csv")
test = pd.read_csv("/content/Assignment_6-test_20_wol.csv")
def tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())
train["tokens"] = train["text"].apply(tokenize)
class_counts = train["category"].value_counts()
total_docs = len(train)
priors = {c: class_counts[c]/total_docs for c in class_counts.index}
vocab = set([word for tokens in train["tokens"] for word in tokens])
word_counts = {c: defaultdict(int) for c in class_counts.index}
class_word_totals = defaultdict(int)

for _, row in train.iterrows():
    label = row["category"]
    for word in row["tokens"]:
        word_counts[label][word] += 1
        class_word_totals[label] += 1
likelihoods = {c: {} for c in class_counts.index}
for c in class_counts.index:
    for word in vocab:
        likelihoods[c][word] = (word_counts[c][word] + 1) / (class_word_totals[c] + len(vocab))

print("Prior Probabilities:")
print(priors)

print("\nLikelihood Probabilities (sample):")
for c in likelihoods:
    print(c, dict(list(likelihoods[c].items())[:10]))

Prior Probabilities:
{'negative': np.float64(0.5), 'positive': np.float64(0.5)}

Likelihood Probabilities (sample):
negative {'great': 0.004484304932735426, 'a': 0.004484304932735426, 'powerful': 0.004484304932735426, 'performance': 0.004484304932735426, 'inspiring': 0.004484304932735426, 'amazing': 0.004484304932735426, 'with': 0.004484304932735426, 'enjoyed': 0.004484304932735426, 'movie': 0.017937219730941704, 'wonderful': 0.004484304932735426}
positive {'great': 0.01486988847583643, 'a': 0.02973977695167286, 'powerful': 0.02973977695167286, 'performance': 0.01486988847583643, 'inspiring': 0.011152416356877323, 'amazing': 0.01486988847583643, 'with': 0.01486988847583643, 'enjoyed': 0.01858736059479554, 'movie': 0.03345724907063197, 'wonderful': 0.03345724907063197}


In [6]:
# 3.	Prediction:
# o	Apply your Naïve Bayes model on the test dataset.
# o	Assign each test sentence either +ve or -ve sentiment.

import math
import re
train = pd.read_csv("/content/Assignment_6-train_100_wl.csv")
test = pd.read_csv("/content/Assignment_6-test_20_wol.csv")
pos_count = sum(train['category'] == 'positive')
neg_count = sum(train['category'] == 'negative')
total_count = len(train)

prior_pos = pos_count / total_count
prior_neg = neg_count / total_count
def tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())

word_counts_pos = {}
word_counts_neg = {}
total_words_pos = 0
total_words_neg = 0

for _, row in train.iterrows():
    words = tokenize(row['text'])
    if row['category'] == 'positive':
        for w in words:
            word_counts_pos[w] = word_counts_pos.get(w, 0) + 1
            total_words_pos += 1
    else:
        for w in words:
            word_counts_neg[w] = word_counts_neg.get(w, 0) + 1
            total_words_neg += 1

vocab = set(list(word_counts_pos.keys()) + list(word_counts_neg.keys()))
vocab_size = len(vocab)

def predict(sentence):
    words = tokenize(sentence)
    log_prob_pos = math.log(prior_pos)
    log_prob_neg = math.log(prior_neg)
    for w in words:
        log_prob_pos += math.log((word_counts_pos.get(w, 0) + 1) / (total_words_pos + vocab_size))
        log_prob_neg += math.log((word_counts_neg.get(w, 0) + 1) / (total_words_neg + vocab_size))
    return 'positive' if log_prob_pos > log_prob_neg else 'negative'
test['prediction'] = test['text'].apply(predict)
print(test[['text','prediction']])

                                   text prediction
0                       very few laughs   negative
1           plain boring and uninspired   negative
2                story was good and fun   positive
3                story was good and fun   positive
4   excellent direction and performance   positive
5        amazing film with great acting   positive
6                      not a good story   positive
7           plain boring and uninspired   negative
8           plain boring and uninspired   negative
9   the film was touching and inspiring   positive
10       good storyline and nice acting   positive
11            movie was boring and slow   negative
12             really enjoyed the movie   positive
13                        waste of time   negative
14                     not a good story   positive
15  the film was touching and inspiring   positive
16       poor direction and weak script   negative
17       loved the characters and story   positive
18  excellent direction and per

In [7]:
# 4.	Evaluation:
# o	Calculate Precision, Recall, Accuracy and F1-score without any package.
# o	Calculate Precision, Recall, Accuracy and F1-score with package.
# o	Is there any difference between these two ways of evaluation? If yes, please mention that.

y_true = ['+ve', '+ve', '-ve', '+ve', '-ve', '-ve', '+ve', '-ve']
y_pred = ['+ve', '-ve', '-ve', '+ve', '-ve', '+ve', '+ve', '-ve']
TP = TN = FP = FN = 0

for true, pred in zip(y_true, y_pred):
    if true == '+ve' and pred == '+ve':
        TP += 1
    elif true == '-ve' and pred == '-ve':
        TN += 1
    elif true == '-ve' and pred == '+ve':
        FP += 1
    elif true == '+ve' and pred == '-ve':
        FN += 1

precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
accuracy = (TP + TN) / (TP + TN + FP + FN)
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1-score:", f1_score)

#with package

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
y_true_bin = [1 if label == '+ve' else 0 for label in y_true]
y_pred_bin = [1 if label == '+ve' else 0 for label in y_pred]

precision = precision_score(y_true_bin, y_pred_bin)
recall = recall_score(y_true_bin, y_pred_bin)
accuracy = accuracy_score(y_true_bin, y_pred_bin)
f1 = f1_score(y_true_bin, y_pred_bin)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1-score:", f1)

Precision: 0.75
Recall: 0.75
Accuracy: 0.75
F1-score: 0.75
Precision: 0.75
Recall: 0.75
Accuracy: 0.75
F1-score: 0.75
