In [None]:
#                      TEXT PREDICTION
import re
from collections import defaultdict, Counter
import random

# Preprocessing function to clean the text
def preprocess_text(text):
    """Lower-case *text* and drop every character that is not a-z or whitespace."""
    lowered = text.lower()
    # Digits, punctuation and any non-ASCII letters are removed outright
    # (not replaced by a space), so "didn't" becomes "didnt".
    return re.sub(r'[^a-z\s]', '', lowered)

# Tokenize the text into words
def tokenize(text):
    """Split *text* on runs of whitespace and return the list of words."""
    words = text.split()
    return words

# Stopword Removal
stopwords = {'the', 'is', 'in', 'it', 'of', 'and', 'to', 'a'}


def remove_stopwords(tokens):
    """Return the tokens, in their original order, that are not in ``stopwords``."""
    kept = []
    for word in tokens:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

# Function to build n-gram models
def build_ngram_model(tokens, n):
    """Count which token follows each run of *n* consecutive tokens.

    Returns a ``defaultdict`` that maps every n-token context (as a tuple)
    to a ``Counter`` of the tokens seen immediately after that context.
    """
    model = defaultdict(Counter)
    # The last usable start index leaves room for the context plus one follower.
    for start in range(len(tokens) - n):
        end = start + n
        context = tuple(tokens[start:end])
        follower = tokens[end]
        model[context][follower] += 1
    return model

# Add-one Smoothing
def add_one_smoothing(ngram_model, vocab_size):
    """Convert follower counts into add-one (Laplace) smoothed scores.

    For every context, each observed follower gets
    ``(count + 1) / (context_total + vocab_size)`` and a ``'<UNK>'`` entry
    gets ``1 / (context_total + vocab_size)`` to stand in for unseen words.
    Returns a new ``defaultdict(Counter)``; the input model is not modified.
    """
    smoothed = defaultdict(Counter)
    for context in ngram_model:
        followers = ngram_model[context]
        denominator = sum(followers.values()) + vocab_size
        table = smoothed[context]
        for word in followers:
            table[word] = (followers[word] + 1) / denominator
        table['<UNK>'] = 1 / denominator
    return smoothed

# Backoff Model for Prediction
def backoff_ngram_predict(unigram_model, bigram_model, trigram_model, context):
    """Predict the next word, backing off trigram -> bigram -> unigram.

    Args:
        unigram_model: mapping of word -> score.
        bigram_model: mapping of 1-word context tuple -> {next word: score}.
        trigram_model: mapping of 2-word context tuple -> {next word: score}.
        context: sequence of preceding words. Only the trailing two words
            are consulted, so a longer history may be passed as-is.

    Returns:
        The highest-scoring next word from the most specific model that has
        a non-empty entry for the context; when scores tie, the entry that
        comes first in the table's iteration order wins.

    Raises:
        ValueError: if no higher-order model matches and ``unigram_model``
            is empty (raised by ``max``).
    """
    history = tuple(context)
    # Try the longest usable suffix of the history first, then shorter ones.
    for order, model in ((2, trigram_model), (1, bigram_model)):
        if len(history) < order:
            continue
        # .get() avoids inserting a key when the model is a defaultdict.
        followers = model.get(history[-order:])
        # An empty table carries no information, so keep backing off.
        if followers:
            return max(followers, key=followers.get)
    return max(unigram_model, key=unigram_model.get)

# Model Accuracy Evaluation
def evaluate_and_display_model(test_sentences, unigram_model, bigram_model, trigram_model):
    """Print a word-by-word prediction trace for every sentence.

    Each sentence is cleaned, tokenized and stripped of stopwords. Its first
    two tokens seed the output; every later position is filled with the
    backoff model's prediction, using the previously *predicted* words (not
    the actual ones) as context. Nothing is returned.

    The printed accuracy compares the whole predicted sequence with the
    actual tokens, so the copied seed words are included in the count.
    """
    for sentence in test_sentences:
        print(f"\nOriginal sentence: {sentence}")
        tokens = remove_stopwords(tokenize(preprocess_text(sentence)))
        predicted_sentence = tokens[:2]

        # Generate one word for every remaining position of the sentence.
        for actual_word in tokens[2:]:
            context = predicted_sentence[-2:]
            predicted_word = backoff_ngram_predict(unigram_model, bigram_model, trigram_model, context)
            predicted_sentence.append(predicted_word)
            print(f"Context: {context} -> Predicted next word: '{predicted_word}', Actual word: '{actual_word}'")

        # Summarize the generated sentence and how many positions match.
        predicted_sentence_str = ' '.join(predicted_sentence)
        print(f"Predicted sentence: {predicted_sentence_str}")
        matches = sum(1 for guess, truth in zip(predicted_sentence, tokens) if guess == truth)
        print(f"Accuracy: {matches}/{len(tokens)} words correct")


# pandas provides read_csv below; nothing else in this cell imports it.
import pandas as pd

# Load the dataset and derive a cleaned, tokenized, stopword-free token list
# from the 'review' column of every row.
file_path = '/IMDB Dataset.csv'
df = pd.read_csv(file_path)
df['cleaned_review'] = df['review'].apply(preprocess_text)
df['tokens'] = df['cleaned_review'].apply(tokenize)
df['tokens'] = df['tokens'].apply(remove_stopwords)

# Combine all tokens for the n-gram model training.
# NOTE: rows are concatenated into one stream, so some n-grams span the
# boundary between two consecutive reviews.
all_tokens = [token for sublist in df['tokens'] for token in sublist]


# Raw count models: word frequencies, 1-word contexts and 2-word contexts.
unigram_model = Counter(all_tokens)
bigram_model = build_ngram_model(all_tokens, 1)
trigram_model = build_ngram_model(all_tokens, 2)

# Smoothing
vocab_size = len(set(all_tokens))
bigram_model_smoothed = add_one_smoothing(bigram_model, vocab_size)
trigram_model_smoothed = add_one_smoothing(trigram_model, vocab_size)

# Test data for evaluation
test_data = [
    "this movie is great",
    "the acting was terrible",
    "the story was very interesting",
    "I didn't like the direction"
]


# The raw count models are passed here; the smoothed tables built above are
# not used by this evaluation.
evaluate_and_display_model(test_data, unigram_model, bigram_model, trigram_model)



Original sentence: this movie is great
Context: ['this', 'movie'] -> Predicted next word: 'was', Actual word: 'great'
Predicted sentence: this movie was
Accuracy: 2/3 words correct

Original sentence: the acting was terrible
Context: ['acting', 'was'] -> Predicted next word: 'terrible', Actual word: 'terrible'
Predicted sentence: acting was terrible
Accuracy: 3/3 words correct

Original sentence: the story was very interesting
Context: ['story', 'was'] -> Predicted next word: 'so', Actual word: 'very'
Context: ['was', 'so'] -> Predicted next word: 'bad', Actual word: 'interesting'
Predicted sentence: story was so bad
Accuracy: 2/4 words correct

Original sentence: I didn't like the direction
Context: ['i', 'didnt'] -> Predicted next word: 'like', Actual word: 'like'
Context: ['didnt', 'like'] -> Predicted next word: 'this', Actual word: 'direction'
Predicted sentence: i didnt like this
Accuracy: 3/4 words correct


In [None]:
#                                TEXT CLASSIFICATION

import re
from collections import defaultdict, Counter
import math

# Preprocessing function to clean the text
def preprocess_text(text):
    """Normalize *text*: lower-case it, then strip everything except a-z and whitespace."""
    return re.sub(r'[^a-z\s]', '', text.lower())

# Tokenize the text into words
def tokenize(text):
    """Return the whitespace-separated words of *text* as a list."""
    pieces = text.split()
    return pieces

# Function to build n-gram models for each class
def build_ngram_model_by_class(reviews, n):
    """Build one follower-count model from a collection of raw review texts.

    Every review is cleaned and tokenized on its own, so no n-gram spans two
    reviews. Returns a ``defaultdict`` mapping each n-token context tuple to
    a ``Counter`` of the tokens observed right after it.
    """
    model = defaultdict(Counter)
    for text in reviews:
        words = tokenize(preprocess_text(text))
        # Stop early enough that each context still has a following word.
        for start in range(len(words) - n):
            stop = start + n
            context = tuple(words[start:stop])
            follower = words[stop]
            model[context][follower] += 1
    return model

import math
from collections import defaultdict, Counter

# Bayesian Classification
def bayesian_classification(review, pos_model, neg_model, pos_prior, neg_prior, n):
    """Label *review* 'positive' or 'negative' by comparing log scores.

    The review is cleaned and tokenized, then every window of *n* consecutive
    tokens is looked up as a context key in both models. Each window
    contributes ``log((count + 1) / (total + len(model)))`` to its class,
    where ``count`` is the sum of follower counts stored under that key and
    ``total`` is the sum of all follower counts in the model. The per-window
    and final log scores are printed as they are computed.

    Args:
        review: raw review text.
        pos_model: mapping of context tuple -> Counter for the positive class.
        neg_model: mapping of context tuple -> Counter for the negative class.
        pos_prior: prior probability of the positive class (must be > 0).
        neg_prior: prior probability of the negative class (must be > 0).
        n: window length, in tokens, used for the lookups.

    Returns:
        'positive' if the positive score is strictly greater, else 'negative'.
    """
    tokens = tokenize(preprocess_text(review))

    pos_prob = math.log(pos_prior)
    neg_prob = math.log(neg_prior)

    print(f"\nReview: '{review}'")
    print(f"Tokens: {tokens}")

    # The smoothing denominators depend only on the models, so compute them
    # once here instead of re-scanning both models for every window.
    pos_denominator = sum(sum(counter.values()) for counter in pos_model.values()) + len(pos_model)
    neg_denominator = sum(sum(counter.values()) for counter in neg_model.values()) + len(neg_model)

    # Shared stand-in for contexts a model has never seen; .get() is used so
    # a defaultdict model is not grown by lookups.
    unseen = Counter()

    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i + n])

        # Count of the n-gram in the positive and negative models
        pos_ngram_count = sum(pos_model.get(ngram, unseen).values())
        neg_ngram_count = sum(neg_model.get(ngram, unseen).values())

        # Adding 1 for smoothing
        pos_ngram_prob = math.log((pos_ngram_count + 1) / pos_denominator)
        neg_ngram_prob = math.log((neg_ngram_count + 1) / neg_denominator)

        # Accumulate log probabilities
        pos_prob += pos_ngram_prob
        neg_prob += neg_ngram_prob

        # Display calculations
        print(f"Context: {ngram}")
        print(f"  Positive class log probability: {pos_ngram_prob}")
        print(f"  Negative class log probability: {neg_ngram_prob}")

    print("\nFinal log probabilities:")
    print(f"  Positive class: {pos_prob}")
    print(f"  Negative class: {neg_prob}")

    # Ties resolve to 'negative' because the comparison is strict.
    predicted_label = 'positive' if pos_prob > neg_prob else 'negative'
    print(f"Predicted label: {predicted_label}")

    return predicted_label


def evaluate_model(test_data, pos_model, neg_model, pos_prior, neg_prior, n):
    """Classify every labelled review and print a per-item and overall report.

    Args:
        test_data: sized collection of ``(review, actual_label)`` pairs.
        pos_model, neg_model, pos_prior, neg_prior, n: forwarded unchanged to
            ``bayesian_classification`` for each review.

    Prints each prediction with a Correct/Incorrect verdict, then the total
    and the accuracy percentage. An empty ``test_data`` reports 0.00%
    instead of failing on a division by zero. Nothing is returned.
    """
    correct = 0
    total = len(test_data)

    print("\n==== Model Evaluation Start ====\n")

    for review, actual_label in test_data:
        print(f"Actual label: {actual_label}")
        predicted_label = bayesian_classification(review, pos_model, neg_model, pos_prior, neg_prior, n)
        print(f"  ==> Predicted label: {predicted_label}, Actual label: {actual_label}\n")

        if predicted_label == actual_label:
            print("  ==> Correct!\n")
            correct += 1
        else:
            print("  ==> Incorrect!\n")

    # Guard the ratio: with no test items there is nothing to divide by.
    accuracy = correct / total if total else 0.0
    print("==== Model Evaluation Complete ====")
    print(f"Total Correct: {correct} / {total}")
    print(f"Accuracy: {accuracy * 100:.2f}%\n")

# Sample test data for evaluation: (review text, expected label) pairs.
test_data = [
    ("I loved the acting and the storyline", 'positive'),
    ("The movie was terrible and boring", 'negative'),
    ("An excellent performance by the cast", 'positive'),
    ("I hated the film, it was the worst", 'negative')
]


# NOTE(review): pos_ngram_model, neg_ngram_model, pos_prior, neg_prior and n
# are not defined anywhere in this cell — presumably they come from an
# earlier cell (e.g. built with build_ngram_model_by_class); confirm before
# running this cell on its own.
evaluate_model(test_data, pos_ngram_model, neg_ngram_model, pos_prior, neg_prior, n)



==== Model Evaluation Start ====

Actual label: positive

Review: 'I loved the acting and the storyline'
Tokens: ['i', 'loved', 'the', 'acting', 'and', 'the', 'storyline']
Context: ('i', 'loved')
  Positive class log probability: -8.865629254239504
  Negative class log probability: -10.110322130841297
Context: ('loved', 'the')
  Positive class log probability: -9.652902715491539
  Negative class log probability: -10.531702694255362
Context: ('the', 'acting')
  Positive class log probability: -8.295771356427476
  Negative class log probability: -7.684123081880075
Context: ('acting', 'and')
  Positive class log probability: -9.677053175723822
  Negative class log probability: -9.309186312217257
Context: ('and', 'the')
  Positive class log probability: -6.218146091879847
  Negative class log probability: -6.359374309171852
Context: ('the', 'storyline')
  Positive class log probability: -10.109546956219907
  Negative class log probability: -9.846609248502512

Final log probabilities:
  Po