In [50]:
from collections import Counter
import numpy as np

def read_documents(doc_file):
    """
    Read a whitespace-tokenised, labelled text file into documents and labels.

    Every non-blank line is split on whitespace. The token at index 1 is
    used as the label and all tokens from index 3 onward form the document;
    the tokens at index 0 and 2 are not used by this function.

    Blank or whitespace-only lines (for instance an empty line at the end of
    the file) are skipped rather than raising IndexError, so the two returned
    lists always have the same length.

    Parameters
    ----------
    doc_file : str or path-like
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    docs : list of list of str
        One token list per non-blank line.
    labels : list of str
        The label of each document, aligned with ``docs``.

    Raises
    ------
    IndexError
        If a non-blank line contains fewer than two tokens.
    """
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.split()
            if not words:
                # Nothing on this line: no label and no document to record.
                continue
            # Read the label first so a malformed line cannot leave docs and
            # labels with different lengths.
            label = words[1]
            docs.append(words[3:])
            labels.append(label)
    return docs, labels

In [51]:
def split_data(doc_file, train_split=0.8):
    """
    Load the labelled documents and cut them into training and validation parts.

    The file is read with ``read_documents`` and split in file order, without
    shuffling: the first ``int(train_split * N)`` documents (and their labels)
    become the training set, the remainder the validation set.

    Returns the tuple (train_docs, train_labels, val_docs, val_labels).
    """
    documents, tags = read_documents(doc_file)
    cut = int(train_split * len(documents))
    return documents[:cut], tags[:cut], documents[cut:], tags[cut:]

In [52]:
def train_nb(documents, labels):
    """
    Fit a two-class ('pos' / 'neg') Naive Bayes model on tokenised documents.

    Word frequencies are counted separately for the documents labelled 'pos'
    and 'neg'. The vocabulary is built from every document passed in,
    whatever its label. Word likelihoods use add-one (Laplace) smoothing:
    (count + 1) / (total words in class + vocabulary size).

    Returns
    -------
    log_probs : dict
        Maps each vocabulary word to {'pos': log P(word|pos),
        'neg': log P(word|neg)}.
    log_prob_class : dict
        {'pos': log P(pos), 'neg': log P(neg)}, where each prior is the
        number of labels equal to that class divided by len(labels).
    """
    word_counts = {'pos': Counter(), 'neg': Counter()}
    vocabulary = set()

    # Accumulate per-class word frequencies and the overall vocabulary.
    for tokens, tag in zip(documents, labels):
        if tag in word_counts:
            word_counts[tag].update(tokens)
        vocabulary.update(tokens)

    pos_counts, neg_counts = word_counts['pos'], word_counts['neg']
    pos_total = sum(pos_counts.values())
    neg_total = sum(neg_counts.values())
    vocab_size = len(vocabulary)

    # Smoothed log likelihood of every vocabulary word under each class.
    log_probs = {
        word: {
            'pos': np.log((pos_counts[word] + 1) / (pos_total + vocab_size)),
            'neg': np.log((neg_counts[word] + 1) / (neg_total + vocab_size)),
        }
        for word in vocabulary
    }

    # Class log priors from the label frequencies.
    n_labels = len(labels)
    pos_share = sum(1 for tag in labels if tag == 'pos') / n_labels
    neg_share = sum(1 for tag in labels if tag == 'neg') / n_labels
    log_prob_class = {'pos': np.log(pos_share), 'neg': np.log(neg_share)}

    return log_probs, log_prob_class

# Split 'reviews.txt' with the default train_split of split_data, then train
# the Naive Bayes model on the training portion only. val_docs / val_labels
# are kept as notebook globals for the evaluation cell further down.
train_docs, train_labels, val_docs, val_labels = split_data('reviews.txt')
model, log_prob_class = train_nb(train_docs, train_labels)

# Display (as the cell's output value) the first five (word, log-probabilities)
# entries of the model together with the class log priors, as a quick check.
list(model.items())[:5], log_prob_class

([('seeks', {'pos': -10.91934950295121, 'neg': -11.604733415225777}),
  ('curry', {'pos': -12.459794543898358, 'neg': -12.857496383721145}),
  ('seriousness', {'pos': -12.459794543898358, 'neg': -11.758884095053036}),
  ('6-month', {'pos': -13.558406832566469, 'neg': -12.857496383721145}),
  ('37:4', {'pos': -13.558406832566469, 'neg': -12.857496383721145})],
 {'pos': -0.6761896870922498, 'neg': -0.7103971982200179})

In [53]:
def score_doc_label(document, label, model, log_prob_class):
    """
    Score a tokenised document under a single sentiment label.

    The score is the class log prior ``log_prob_class[label]`` plus the log
    likelihood ``model[word][label]`` of every token of ``document`` that is
    present in ``model``. Tokens that are not in the model are skipped.
    """
    score = log_prob_class[label]

    for token in document:
        if token not in model:
            # Out-of-vocabulary token: by design it contributes nothing.
            continue
        score += model[token][label]

    return score

In [54]:
# Sanity Check 1: Testing with the word "great"
# score_doc_label returns log prior + word log likelihood, so np.exp of it
# gives P(label) * P("great" | label) for the one-word document.
log_prob_pos_great = score_doc_label(["great"], "pos", model, log_prob_class)
log_prob_neg_great = score_doc_label(["great"], "neg", model, log_prob_class)
prob_pos_great = np.exp(log_prob_pos_great)
prob_neg_great = np.exp(log_prob_neg_great)

# Sanity Check 2: Testing with the word "bad"
# Same computation as above for the one-word document ["bad"].
log_prob_pos_bad = score_doc_label(["bad"], "pos", model, log_prob_class)
log_prob_neg_bad = score_doc_label(["bad"], "neg", model, log_prob_class)
prob_pos_bad = np.exp(log_prob_pos_bad)
prob_neg_bad = np.exp(log_prob_neg_bad)

# Sanity Check 3: Testing with the document ['a', 'top-quality', 'performance']
# These two values are left in log space (no np.exp is applied).
log_prob_pos_doc = score_doc_label(['a', 'top-quality', 'performance'], "pos", model, log_prob_class)
log_prob_neg_doc = score_doc_label(['a', 'top-quality', 'performance'], "neg", model, log_prob_class)

# Cell output: the four probabilities followed by the two log scores.
prob_pos_great, prob_neg_great, prob_pos_bad, prob_neg_bad, log_prob_pos_doc, log_prob_neg_doc

(0.0013212141496043825,
 0.0005283997934747295,
 0.00017230368700664423,
 0.0004547440646873432,
 -12.807858361140351,
 -13.486891735775352)

In [55]:
def classify_nb(document, model, log_prob_class):
    """
    Predict the sentiment label ("pos" or "neg") of one tokenised document.

    The document is scored under both labels with ``score_doc_label`` and the
    label with the larger score is returned.
    """
    pos_score = score_doc_label(document, "pos", model, log_prob_class)
    neg_score = score_doc_label(document, "neg", model, log_prob_class)

    # "pos" must win strictly; a tie therefore resolves to "neg".
    return "pos" if pos_score > neg_score else "neg"

In [56]:
# Sanity checks on small test documents: four one-word documents and one
# three-word document.
test_docs = [["great"], ["bad"], ["amazing"], ["terrible"], ['a', 'top-quality', 'performance']]

# Applying the classify_nb function to the test documents and pairing each
# document with its predicted label.
classified_docs = []
for doc in test_docs:
    classification = classify_nb(doc, model, log_prob_class)
    classified_docs.append((doc, classification))

# Cell output: the list of (document, predicted label) pairs.
classified_docs

[(['great'], 'pos'),
 (['bad'], 'neg'),
 (['amazing'], 'pos'),
 (['terrible'], 'neg'),
 (['a', 'top-quality', 'performance'], 'pos')]

In [57]:
def classify_documents(docs, model, log_prob_class):
    """
    Predict a label for every document in ``docs``.

    Returns a list with one ``classify_nb`` prediction per document, in the
    same order as the input.
    """
    return [classify_nb(tokens, model, log_prob_class) for tokens in docs]

In [58]:
def accuracy(true_labels, guessed_labels):
    """
    Compute the fraction of labels that were guessed correctly.

    The two sequences are compared pairwise and the number of matches is
    divided by ``len(true_labels)``.

    Parameters
    ----------
    true_labels : sequence
        The reference labels.
    guessed_labels : iterable
        The predicted labels, aligned with ``true_labels``.

    Returns
    -------
    float
        The accuracy, or 0.0 when ``true_labels`` is empty. The empty case
        used to raise ZeroDivisionError; returning 0.0 follows the same
        convention as precision_recall_f1, which reports 0 for undefined
        ratios.
    """
    total = len(true_labels)
    if total == 0:
        # No examples to score: avoid dividing by zero.
        return 0.0

    correct_count = sum(1 for t, g in zip(true_labels, guessed_labels) if t == g)
    return correct_count / total

In [59]:
def precision_recall_f1(true_labels, predicted_labels):
    """
    Compute precision, recall and F1 score with 'pos' as the positive class.

    A pair counts as a true positive when both labels are 'pos', as a false
    positive when the true label is 'neg' and the prediction is 'pos', and as
    a false negative when the true label is 'pos' and the prediction is 'neg'.
    Any ratio whose denominator would be zero is reported as 0.

    Returns the tuple (precision, recall, f1).
    """
    pairs = list(zip(true_labels, predicted_labels))

    # Confusion counts relevant to the positive class.
    tp = sum(1 for actual, guess in pairs if actual == 'pos' and guess == 'pos')
    fp = sum(1 for actual, guess in pairs if actual == 'neg' and guess == 'pos')
    fn = sum(1 for actual, guess in pairs if actual == 'pos' and guess == 'neg')

    # Each metric falls back to 0 when its denominator is not positive.
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1

In [60]:
# Calculating the accuracy, precision, recall, and F1 score on the validation
# split (val_docs / val_labels), using the model trained on the training split.
predicted_labels = classify_documents(val_docs, model, log_prob_class)
accuracy_result = accuracy(val_labels, predicted_labels)
precision, recall, f1_score = precision_recall_f1(val_labels, predicted_labels)

# Report the four metrics, one per line.
print(f"Accuracy: {accuracy_result} \nPrecision: {precision}\nRecall: {recall}\nF1 score: {f1_score}")

Accuracy: 0.8153587914393622 
Precision: 0.8237965485921889
Recall: 0.7866435385949696
F1 score: 0.8047914818101153
