### Importing Libraries

In [2]:
import csv
from nltk.tokenize import word_tokenize
from nltk import FreqDist, bigrams, trigrams

# word_tokenize needs the NLTK 'punkt' tokenizer data. If it is missing, run once:
# import nltk; nltk.download('punkt')

### Preprocessing the data

- Cleaning the text

- Converting Reviews into tokens
- Lowercase all the words
- Removing Punctuations
- Removing the leftover 'br' tokens (remnants of line-break markup in the reviews)


In [3]:
# Collect the first CSV column (the review text) from every non-empty row.
reviews = []
# newline='' is what the csv module asks for, so that line breaks inside quoted
# fields are interpreted by the csv reader rather than by the file object.
with open("IMDB Dataset.csv", encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row_index, row in enumerate(reader):
        if not row:
            continue
        # Skip a leading header row (first cell "review") if the file has one,
        # so the column name is not counted as review text.
        if row_index == 0 and row[0].strip().lower() == "review":
            continue
        reviews.append(row[0])

# NOTE: all reviews are joined into one string before tokenizing, so the flat
# token list below (and any n-grams built from it) runs across review boundaries.
reviews_text = ' '.join(reviews)
tokens = word_tokenize(reviews_text)
lowercase_tokens = [token.lower() for token in tokens]
# Keep purely alphabetic tokens only; this drops punctuation and numbers.
cleaned_list = [word for word in lowercase_tokens if word.isalpha()]
# Remove 'br' tokens, which are line-break markup remnants rather than real words.
removed_br = [word for word in cleaned_list if word != "br"]

### Calculating Frequencies of Unigrams, Bigrams, Trigrams

Also display the 10 most common unigrams, bigrams, and trigrams.

In [4]:
# Materialise the n-gram sequences from the cleaned token list.
bigrams_list = list(bigrams(removed_br))
trigrams_list = list(trigrams(removed_br))

# One frequency distribution per n-gram order.
unigram_freqdist = FreqDist(removed_br)
bigram_freqdist = FreqDist(bigrams_list)
trigram_freqdist = FreqDist(trigrams_list)

# Report the ten most frequent entries of each distribution, in order.
for label, dist in (
    ("Top 10 most common unigrams:", unigram_freqdist),
    ("Top 10 most common bigrams:", bigram_freqdist),
    ("Top 10 most common trigrams:", trigram_freqdist),
):
    print(label, dist.most_common(10))

Top 10 most common unigrams: [('the', 664213), ('and', 323076), ('a', 321274), ('of', 288526), ('to', 267033), ('is', 216900), ('it', 187987), ('in', 185215), ('i', 173731), ('this', 150142)]
Top 10 most common bigrams: [(('of', 'the'), 77658), (('in', 'the'), 50472), (('this', 'movie'), 30954), (('and', 'the'), 27173), (('is', 'a'), 26752), (('the', 'film'), 26269), (('to', 'the'), 24139), (('the', 'movie'), 23408), (('to', 'be'), 23269), (('this', 'film'), 21439)]
Top 10 most common trigrams: [(('one', 'of', 'the'), 9808), (('this', 'movie', 'is'), 5388), (('of', 'the', 'film'), 5029), (('this', 'is', 'a'), 4939), (('a', 'lot', 'of'), 4683), (('of', 'the', 'movie'), 4206), (('some', 'of', 'the'), 3770), (('the', 'film', 'is'), 3759), (('is', 'one', 'of'), 3626), (('this', 'film', 'is'), 3572)]


### Creating the predictor functions for my N-grams

In [5]:
def predict_unigram():
    """Return the most frequent token according to ``unigram_freqdist``.

    The unigram model takes no context, so the prediction is always whatever
    ``unigram_freqdist.max()`` reports.
    """
    most_frequent_word = unigram_freqdist.max()
    return most_frequent_word

def predict_bigram(prev_word):
    """Return the word that most often follows ``prev_word``.

    Scans every bigram in ``bigram_freqdist`` whose first element equals
    ``prev_word`` and picks the one with the highest count (the first one
    encountered wins on ties).

    Returns:
        The second element of the winning bigram, or ``None`` when no bigram
        starts with ``prev_word``.
    """
    best_pair = max(
        (pair for pair in bigram_freqdist if pair[0] == prev_word),
        key=bigram_freqdist.__getitem__,
        default=None,
    )
    return None if best_pair is None else best_pair[1]

def predict_trigram(prev_two_words):
    """Return the word that most often follows the two-word context.

    Args:
        prev_two_words: The two preceding words, as a tuple or a list,
            e.g. ``('some', 'of')`` or ``['some', 'of']``.

    Returns:
        The third element of the most frequent trigram in ``trigram_freqdist``
        whose first two elements equal the context (the first one encountered
        wins on ties), or ``None`` when no trigram matches.
    """
    # A list never compares equal to a tuple in Python, so a list context used
    # to fall through to None silently. Normalise lists to tuples up front.
    if isinstance(prev_two_words, list):
        context = tuple(prev_two_words)
    else:
        context = prev_two_words

    candidates = [
        trigram for trigram in trigram_freqdist
        if (trigram[0], trigram[1]) == context
    ]

    if not candidates:
        return None
    return max(candidates, key=lambda x: trigram_freqdist[x])[2]


### Printing the most common predicted word for my models

* Note: the unigram model uses no context, so its prediction is always the single most frequent word in the corpus.

* For the bigram and trigram models, the prediction depends on the previous word(s) we pass in, so we can change that input.

In [6]:
# Unigram: context-free, always the most frequent word.
unigram_guess = predict_unigram()
print("Unigram Prediction (most frequent word):", unigram_guess)

# Bigram: most likely word after 'film'.
bigram_guess = predict_bigram('film')
print("Bigram Prediction: ", bigram_guess)

# Trigram: most likely word after the pair ('some', 'of').
trigram_guess = predict_trigram(('some', 'of'))
print("Trigram Prediction: ", trigram_guess)

Unigram Prediction (most frequent word): the
Bigram Prediction:  is
Trigram Prediction:  the
