### Task 1: Code (mostly)

In [5]:
import csv
import math

from nltk import FreqDist, bigrams, trigrams
from nltk.tokenize import word_tokenize

# nltk.download('punkt')

# Load the review texts (first CSV column) that the Task 1 n-gram models are built from.
reviews = []
# newline='' is what the csv module expects, so that line breaks inside quoted
# review fields are parsed by the csv reader rather than by the file object.
with open("IMDB Dataset.csv", encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    first_row_seen = False
    for row in reader:
        if not row:
            continue  # blank line
        if not first_row_seen:
            first_row_seen = True
            # Skip the header line so the column title is not counted as a review.
            if row[0].strip().lower() == "review":
                continue
        reviews.append(row[0])

# NOTE: the corpus is tokenized as one long string, so the n-grams built below
# can span the boundary between two consecutive reviews.
reviews_text = ' '.join(reviews)
tokens = word_tokenize(reviews_text)
lowercase_tokens = [token.lower() for token in tokens]
# Keep purely alphabetic tokens only (drops punctuation, numbers and mixed tokens).
cleaned_list = [word for word in lowercase_tokens if word.isalpha()]
# Remove 'br' tokens (presumably remnants of HTML <br /> line-break tags in the reviews).
removed_br = [word for word in cleaned_list if word != "br"]

# Frequency of each single word.
unigram_freqdist = FreqDist(removed_br)

# Frequency of each pair of adjacent words.
bigrams_list = list(bigrams(removed_br))
bigram_freqdist = FreqDist(bigrams_list)

# Frequency of each triple of adjacent words.
trigrams_list = list(trigrams(removed_br))
trigram_freqdist = FreqDist(trigrams_list)

def predict_unigram():
    """Return the most frequent word recorded in ``unigram_freqdist``.

    The prediction takes no context into account: it is whatever sample
    the distribution's ``max()`` reports.
    """
    most_frequent_word = unigram_freqdist.max()
    return most_frequent_word

def predict_bigram(prev_word):
    """Predict the word that most often follows ``prev_word``.

    Every bigram in ``bigram_freqdist`` whose first element equals
    ``prev_word`` is considered; the second element of the one with the
    highest count is returned. On equal counts the bigram met first while
    iterating the distribution wins.

    Returns ``None`` when no bigram starts with ``prev_word``.
    """
    best_pair = None
    best_count = None
    for pair in bigram_freqdist:
        if pair[0] == prev_word:
            count = bigram_freqdist[pair]
            # Strict '>' keeps the earliest candidate when counts tie.
            if best_pair is None or count > best_count:
                best_pair, best_count = pair, count

    if best_pair is None:
        return None
    return best_pair[1]

def predict_trigram(prev_two_words):
    """Predict the word that most often follows the pair ``prev_two_words``.

    ``prev_two_words`` is compared against the tuple made of the first two
    elements of each trigram in ``trigram_freqdist``, so it has to be a
    2-tuple to match anything. Among the matching trigrams the third
    element of the one with the highest count is returned; on equal counts
    the trigram met first while iterating the distribution wins.

    Returns ``None`` when no trigram starts with ``prev_two_words``.
    """
    best_triple = None
    best_count = None
    for triple in trigram_freqdist:
        if (triple[0], triple[1]) == prev_two_words:
            count = trigram_freqdist[triple]
            # Strict '>' keeps the earliest candidate when counts tie.
            if best_triple is None or count > best_count:
                best_triple, best_count = triple, count

    if best_triple is None:
        return None
    return best_triple[2]


### Task 2: Code starts here

Below is the preprocessing required for Task 2,
which builds the word-frequency distributions for positive and negative reviews.

In [6]:
# Per-class word counts used by the sentiment classifier.
positive_word_freq = FreqDist()
negative_word_freq = FreqDist()

# (review text, sentiment label) pairs taken from the first two CSV columns.
reviews_with_sentiment = []
# newline='' is what the csv module expects, so that line breaks inside quoted
# review fields are parsed by the csv reader rather than by the file object.
with open("IMDB Dataset.csv", encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row) < 2:
            continue  # blank or incomplete line: there is no label to read
        review, sentiment = row[0], row[1]
        # Keep only rows carrying a known label. This also drops a header row,
        # which would otherwise be counted as a negative review below.
        if sentiment not in ('positive', 'negative'):
            continue
        reviews_with_sentiment.append((review, sentiment))

for review, sentiment in reviews_with_sentiment:
    # Lowercase first, then keep purely alphabetic tokens. Local names are
    # distinct from Task 1's `tokens` / `lowercase_tokens` so those corpus-wide
    # lists are not overwritten by the last review processed here.
    review_tokens = word_tokenize(review)
    cleaned_tokens = [
        word
        for word in (token.lower() for token in review_tokens)
        if word.isalpha()
    ]

    if sentiment == 'positive':
        positive_word_freq.update(cleaned_tokens)
    else:
        negative_word_freq.update(cleaned_tokens)

### Function for generating a review for us

We give it a start word and the maximum length (in words) that we want the generated review to have.

In [7]:
def generate_review(start_word, max_length=5):
    """Greedily grow a review from ``start_word``, one predicted word at a time.

    At each step the next word comes from ``predict_trigram`` on the last
    two words when at least two words exist; if that yields nothing (or
    only one word exists so far) ``predict_bigram`` on the last word is
    used instead. Generation stops once ``max_length`` words are reached
    or when neither predictor returns a word.

    Returns the words joined by single spaces. ``start_word`` is always
    included, even when ``max_length`` is less than 1.
    """
    words = [start_word]

    for _ in range(max_length - 1):
        following = None
        if len(words) >= 2:
            following = predict_trigram((words[-2], words[-1]))
        if not following:
            # Back off to the bigram model (also the only option for the first step).
            following = predict_bigram(words[-1])
        if not following:
            break
        words.append(following)

    return ' '.join(words)


### Function for classifying a review

In [8]:
def classify_review(review):
    """Label ``review`` as "positive" or "negative" with a unigram Naive Bayes score.

    The review is tokenized, non-alphabetic tokens are dropped and the rest
    lowercased. For each class the score is the sum over tokens of the log
    of the add-one (Laplace) smoothed word probability taken from
    ``positive_word_freq`` / ``negative_word_freq``. No class prior is
    applied, i.e. both classes are treated as equally likely beforehand.

    Returns "positive" only when the positive score is strictly greater;
    ties (including a review with no usable tokens) return "negative".
    """
    tokens = word_tokenize(review)
    cleaned_tokens = [word.lower() for word in tokens if word.isalpha()]

    total_positive_words = sum(positive_word_freq.values())
    total_negative_words = sum(negative_word_freq.values())

    # Size of the combined vocabulary, used as the add-one smoothing term.
    vocab_size = len(set(positive_word_freq.keys()).union(set(negative_word_freq.keys())))

    positive_denominator = total_positive_words + vocab_size
    negative_denominator = total_negative_words + vocab_size

    # Sum log probabilities: adding logs corresponds to multiplying the
    # per-word probabilities (the Naive Bayes likelihood) without underflow.
    # Summing the raw probabilities instead would let one frequent word
    # outweigh a word the class has never seen.
    positive_prob = 0.0
    negative_prob = 0.0
    for token in cleaned_tokens:
        positive_prob += math.log((positive_word_freq[token] + 1) / positive_denominator)
        negative_prob += math.log((negative_word_freq[token] + 1) / negative_denominator)

    if positive_prob > negative_prob:
        return "positive"
    else:
        return "negative"

### Generating a review and classifying it by calling the functions

In [9]:
# Generate a review of at most 10 words, seeded with the word 'action'.
sample_review = generate_review('action', max_length=10)
print("Generated Review:", sample_review)

# Classify the generated text with the word-frequency sentiment classifier.
generated_review_sentiment = classify_review(sample_review)
print("Generated Review Sentiment:", generated_review_sentiment)


Generated Review: action and the film is a very good and the
Generated Review Sentiment: positive
