# MUHAMMAD TAYYAB SOHAIL
# 21i2478
# CS B
# NLP ASSIGNMENT :1


# In [30]:
import random
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk import ngrams

# Fetch the NLTK resources requested by this script
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Read the reviews and drop rows whose review text repeats an earlier row
dataset = pd.read_csv('IMDB Dataset.csv').drop_duplicates(subset=['review'])

# Normalise each review: lowercase, turn HTML line breaks into spaces,
# then strip every character listed in string.punctuation
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned_reviews = []
for review in dataset['review']:
    review = str(review).lower().replace('<br />', ' ').translate(punctuation_table)
    cleaned_reviews.append(review)

# Treat the whole corpus as one space-joined string and tokenize it in one go
processed_reviews = ' '.join(cleaned_reviews)
tokens = word_tokenize(processed_reviews)

# Show a small sample of the token stream
print("First 10 tokens from the reviews:")
print(tokens[:10])

# Build the list of n-grams for a token sequence
def create_ngrams(tokens, n):
    """Return the n-grams of `tokens` as a list of tuples.

    Args:
        tokens: Token sequence handed to NLTK's ``ngrams`` helper.
        n: Number of tokens per n-gram.

    Returns:
        A list containing every tuple yielded by ``ngrams(tokens, n)``.
    """
    gram_iterator = ngrams(tokens, n)
    return [*gram_iterator]

# Materialise the 1-, 2- and 3-gram lists over the corpus tokens
unigrams, bigrams, trigrams = (create_ngrams(tokens, size) for size in (1, 2, 3))

# Count how often each n-gram occurs
def count_frequencies(ngram_list):
    """Tally the occurrences of each item in `ngram_list`.

    Args:
        ngram_list: Iterable of hashable items (here, n-gram tuples).

    Returns:
        A plain ``dict`` mapping each distinct item to the number of times
        it appears, with keys in order of first appearance.
    """
    # Imported locally so the block does not depend on the file's import header
    from collections import Counter

    # Counter tallies in a single pass; converting back to dict keeps the
    # return type callers already rely on.
    return dict(Counter(ngram_list))

# Tally each n-gram order into its own frequency table
unigram_frequencies, bigram_frequencies, trigram_frequencies = map(
    count_frequencies, (unigrams, bigrams, trigrams)
)

# Print the most common n-grams
def display_top_ngrams(frequency_count, top_n=10):
    """Print the `top_n` highest-count entries of `frequency_count`.

    Args:
        frequency_count: Mapping from n-gram to its count.
        top_n: Number of entries to print (default 10).

    Each entry is written as ``ngram: count`` on its own line, highest count
    first; entries with equal counts keep the mapping's iteration order.
    """
    def by_count(entry):
        return entry[1]

    ranked = list(frequency_count.items())
    ranked.sort(key=by_count, reverse=True)

    for gram, occurrences in ranked[:top_n]:
        print(f"{gram}: {occurrences}")

# Report the ten most frequent n-grams of each order
for heading, table in (
    ("\nTop 10 most frequent unigrams:", unigram_frequencies),
    ("\nTop 10 most frequent bigrams:", bigram_frequencies),
    ("\nTop 10 most frequent trigrams:", trigram_frequencies),
):
    print(heading)
    display_top_ngrams(table)

# Pick the next word from frequency-weighted candidates
def select_word(candidates):
    """Choose one word from `candidates`, favouring the most frequent.

    Args:
        candidates: Non-empty sequence of ``(word, count)`` pairs.

    Returns:
        The word of the highest-count pair when the random draw (1-5) is
        1 or 2; otherwise the word of a uniformly chosen pair.

    Raises:
        IndexError: If `candidates` is empty.
    """
    if not candidates:
        # Keep the exception type that indexing / random.choice raise on
        # empty input, since max() would raise ValueError instead.
        raise IndexError("select_word() requires at least one candidate")

    rand_num = random.randint(1, 5)

    if rand_num in (1, 2):
        # max() finds the top pair in one pass instead of sorting everything;
        # on ties the earliest pair wins, matching a stable descending sort.
        return max(candidates, key=lambda pair: pair[1])[0]

    # Otherwise any candidate is equally likely, regardless of its count
    return random.choice(candidates)[0]

# Next-word prediction from the bigram table
def predict_bigram(previous_word):
    """Suggest a word to follow `previous_word`.

    Collects every bigram in ``bigram_frequencies`` whose first word equals
    `previous_word` and lets ``select_word`` pick among their second words.

    Args:
        previous_word: The word the prediction is conditioned on.

    Returns:
        The selected follower, or a random corpus token when no bigram
        starts with `previous_word`.
    """
    candidates = []
    for (first, second), count in bigram_frequencies.items():
        if first == previous_word:
            candidates.append((second, count))

    if not candidates:
        # Unseen context: fall back to any token from the corpus
        return random.choice(tokens)

    return select_word(candidates)

# Next-word prediction from the trigram table
def predict_trigram(word1, word2):
    """Suggest a word to follow the pair (`word1`, `word2`).

    Collects every trigram in ``trigram_frequencies`` that begins with the
    given pair and lets ``select_word`` pick among their third words.

    Args:
        word1: First word of the context.
        word2: Second word of the context.

    Returns:
        The selected follower, or a random corpus token when no trigram
        starts with the pair.
    """
    context = (word1, word2)
    candidates = [
        (gram[2], count)
        for gram, count in trigram_frequencies.items()
        if gram[:2] == context
    ]

    if not candidates:
        # Unseen context: fall back to any token from the corpus
        return random.choice(tokens)

    return select_word(candidates)

# Build a sentence word by word with the bigram predictor
def generate_bigram_sentence(length=30):
    """Generate a space-joined sentence using ``predict_bigram``.

    Args:
        length: Target number of words (default 30). The random starting
            word is always emitted, so the result has at least one word.

    Returns:
        The generated words joined by single spaces.
    """
    words = [random.choice(tokens)]  # random corpus token as the opener
    for _ in range(1, length):
        previous = words[-1]
        words.append(predict_bigram(previous))
    return ' '.join(words)

# Build a sentence word by word with the trigram predictor
def generate_trigram_sentence(length=30):
    """Generate a space-joined sentence using ``predict_trigram``.

    Args:
        length: Target number of words (default 30). The two seed words are
            always emitted, so the result has at least two words.

    Returns:
        The generated words joined by single spaces.
    """
    if len(tokens) >= 2:
        # Seed with two words that are adjacent in the corpus. Two
        # independently drawn words rarely form a context the trigram table
        # has seen, which would make the first prediction a random fallback.
        start = random.randrange(len(tokens) - 1)
        sentence = [tokens[start], tokens[start + 1]]
    else:
        # Too few tokens for an adjacent pair; draw the seeds independently
        sentence = [random.choice(tokens), random.choice(tokens)]

    for _ in range(length - 2):
        next_word = predict_trigram(sentence[-2], sentence[-1])
        sentence.append(next_word)
    return ' '.join(sentence)

# Classifier function

# Split the review texts by sentiment label. Passing each group through a
# set drops repeated texts; the order of the resulting lists is arbitrary.
is_positive = dataset['sentiment'] == 'positive'
is_negative = dataset['sentiment'] == 'negative'

positive_reviews = list({*dataset.loc[is_positive, 'review'].tolist()})
negative_reviews = list({*dataset.loc[is_negative, 'review'].tolist()})

# Clean and tokenize a collection of reviews
def preprocess_reviews(reviews):
    """Lowercase, strip line-break markup and punctuation, and tokenize.

    Args:
        reviews: Iterable of review texts (each is passed through ``str``).

    Returns:
        A list with one token list per review, in input order.
    """
    # Build the punctuation-removal table once rather than once per review
    punctuation_table = str.maketrans('', '', string.punctuation)

    tokenized_reviews = []
    for review in reviews:
        text = str(review).lower()
        # Replace HTML line breaks with a space BEFORE removing punctuation.
        # Otherwise "<br />" collapses to a stray "br" that glues itself to
        # the preceding word (e.g. "end.<br />" -> "endbr").
        text = text.replace('<br />', ' ')
        text = text.translate(punctuation_table)
        tokenized_reviews.append(word_tokenize(text))
    return tokenized_reviews

# Tokenize each sentiment group, keeping the two sets apart
processed_positive_reviews, processed_negative_reviews = [
    preprocess_reviews(group) for group in (positive_reviews, negative_reviews)
]


def count_occurrences(word, sentiment):
    """Count the reviews labelled `sentiment` that contain `word`.

    NOTE: this file defines ``count_occurrences`` a second time further
    down; that later definition replaces this one when the script runs.

    Args:
        word: Lowercase token to look for.
        sentiment: Label to match against the dataset's ``sentiment`` column.

    Returns:
        Number of matching reviews whose cleaned text contains `word` as a
        whitespace-separated token.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    count = 0
    # Pair each review with its label so the class comes from the dataset,
    # not from whether the text happens to contain the word "positive".
    for review, label in zip(dataset['review'], dataset['sentiment']):
        if label != sentiment:
            continue
        # Line breaks must be replaced before punctuation is stripped,
        # otherwise "<br />" no longer exists to be matched.
        text = str(review).lower().replace('<br />', ' ').translate(punctuation_table)
        if word in text.split():
            count += 1
    return count

# Class priors: the share of reviews carrying each sentiment label.
# Note that positive_reviews / negative_reviews are rebound here from the
# earlier lists of texts to plain counts.
total_reviews = len(dataset)
sentiment_column = dataset['sentiment']
positive_reviews = int((sentiment_column == 'positive').sum())
negative_reviews = int((sentiment_column == 'negative').sum())

positive_prob, negative_prob = (
    positive_reviews / total_reviews,
    negative_reviews / total_reviews,
)
print(f"Probability of positive reviews: {positive_prob}, Probability of negative reviews: {negative_prob}")



# Per-class word tallies, built on first use:
# 'positive' / 'negative' -> (token lists the tally was built from, Counter)
_WORD_COUNT_CACHE = {}


def count_occurrences(word, sentiment):
    """Return how often `word` occurs across one class's tokenized reviews.

    The first lookup for a class tallies all of its tokens once; later
    lookups are dictionary reads instead of a rescan of every review.

    Args:
        word: Token to look up.
        sentiment: ``'positive'`` selects ``processed_positive_reviews``;
            any other value selects ``processed_negative_reviews``.

    Returns:
        Total number of occurrences of `word` (0 if it never appears).
    """
    # Imported locally so the block does not depend on the file's import header
    from collections import Counter

    if sentiment == 'positive':
        key, reviews = 'positive', processed_positive_reviews
    else:
        key, reviews = 'negative', processed_negative_reviews

    cached = _WORD_COUNT_CACHE.get(key)
    # Rebuild when the global has been rebound to a different list.
    # In-place edits of the same list object are NOT detected.
    if cached is None or cached[0] is not reviews:
        tally = Counter()
        for review_tokens in reviews:
            tally.update(review_tokens)
        cached = (reviews, tally)
        _WORD_COUNT_CACHE[key] = cached

    # Counter returns 0 for unseen words, like list.count() summed over reviews
    return cached[1][word]

def classifier(review):
    """Label `review` as 'positive' or 'negative' with a Naive Bayes score.

    The text is lowercased and stripped of ``<br />`` markers and
    punctuation, then tokenized. Each token found in the unigram vocabulary
    contributes an add-one-smoothed likelihood per class. Scores are summed
    in log space so long reviews do not underflow to zero.

    Args:
        review: Review text to classify.

    Returns:
        ``'positive'`` if the positive score is strictly greater than the
        negative one, otherwise ``'negative'``.

    Side effects:
        Prints the cleaned review with its label, followed by a blank line.
    """
    # Imported locally so the block does not depend on the file's import header
    import math

    def log_or_neg_inf(value):
        # log(0) is undefined; a zero prior means the class is impossible
        return math.log(value) if value > 0 else float('-inf')

    review = review.lower()
    # Same cleaning order as the corpus: line breaks first, then punctuation
    review = review.replace('<br />', ' ')
    review = review.translate(str.maketrans('', '', string.punctuation))

    pos_total_reviews = len(dataset[dataset['sentiment'] == 'positive'])
    neg_total_reviews = len(dataset[dataset['sentiment'] == 'negative'])

    total_vocab = len(unigram_frequencies)

    # The smoothing denominator must count tokens, like the numerator does.
    # Dividing a token count by a review count can give a ratio above 1.
    pos_total_tokens = sum(len(doc) for doc in processed_positive_reviews)
    neg_total_tokens = sum(len(doc) for doc in processed_negative_reviews)

    # Start from the class priors (share of reviews per label)
    pos_score = log_or_neg_inf(pos_total_reviews / len(dataset))
    neg_score = log_or_neg_inf(neg_total_reviews / len(dataset))

    for w in word_tokenize(review):
        # Unigram keys are 1-tuples, so membership is a direct hash lookup;
        # words outside the vocabulary are ignored.
        if (w,) in unigram_frequencies:
            pos_word_count = count_occurrences(w, 'positive')
            neg_word_count = count_occurrences(w, 'negative')

            pos_score += math.log((pos_word_count + 1) / (pos_total_tokens + total_vocab))
            neg_score += math.log((neg_word_count + 1) / (neg_total_tokens + total_vocab))

    if pos_score > neg_score:
        print(f"The review '{review}' is classified as POSITIVE.")
        print()
        return 'positive'
    else:
        print(f"The review '{review}' is classified as NEGATIVE.")
        print()
        return 'negative'


# Produce sentences with one generator, echoing each as soon as it is built
def _collect_sentences(make_sentence, how_many=10):
    collected = []
    for _ in range(how_many):
        text = make_sentence()
        collected.append(text)
        print(text)
    return collected


# Ten sentences from the bigram model
print("\nBigram Sentences:")
bigram_sentences = _collect_sentences(generate_bigram_sentence)

# Ten sentences from the trigram model
print("\nTrigram Sentences:")
trigram_sentences = _collect_sentences(generate_trigram_sentence)

# Bigram sentences first, then trigram sentences
all_sentences = [*bigram_sentences, *trigram_sentences]

# Run every generated sentence through the classifier
print("\nSentiment analysis of combined sentences:")
for sentence in all_sentences:
    sentiment = classifier(sentence)
    print(f"Sentence: '{sentence}' | Sentiment: {sentiment}")

# Hand-written reviews paired with their expected labels
sample_reviews = {
    "This movie exceeded all my expectations! The visuals were stunning, and the storyline kept me captivated throughout.": "positive",
    "A brilliant performance by the lead actor! I was emotionally invested in every scene.": "positive",
    "This was a fantastic film! I can't recommend it enough to anyone who loves a good story.": "positive",
    "I was really disappointed by this film. The characters were shallow, and the plot felt rushed.": "negative",
    "It had some good moments, but overall, it was a letdown. I wouldn't watch it again.": "negative",
    "Unfortunately, this film did not deliver. The acting was mediocre, and the storyline lacked depth.": "negative",
    "A dreadful experience! The pacing was off, and I found myself bored throughout.": "negative",
    "The film tried to do too much and ended up being a confusing mess.": "negative"
}

# Confusion-matrix tallies, with 'positive' as the positive class
true_positive = true_negative = false_positive = false_negative = 0

# Classify each sample and file the outcome under the matching tally
for review, label in sample_reviews.items():
    result = classifier(review)
    if result == 'positive':
        if label == 'positive':
            true_positive += 1
            continue
        if label == 'negative':
            false_positive += 1
            continue
    if result == 'negative' and label == 'negative':
        true_negative += 1
    else:
        # Every remaining combination is booked as a false negative
        false_negative += 1

# Derive the metrics; precision and recall are 0 when their denominator is 0
predicted_positive_total = true_positive + false_positive
actual_positive_total = true_positive + false_negative
evaluated_total = true_positive + true_negative + false_positive + false_negative

precision = true_positive / predicted_positive_total if predicted_positive_total > 0 else 0
recall = true_positive / actual_positive_total if actual_positive_total > 0 else 0
accuracy = (true_positive + true_negative) / evaluated_total

print(f"\nEvaluation Metrics:")
print(f"True Positives: {true_positive}, True Negatives: {true_negative}")
print(f"False Positives: {false_positive}, False Negatives: {false_negative}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


First 10 tokens from the reviews:
['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching']

Top 10 most frequent unigrams:
('the',): 658858
('and',): 318397
('a',): 318341
('of',): 286421
('to',): 264932
('is',): 209069
('in',): 183706
('it',): 153747
('i',): 150860
('this',): 148719

Top 10 most frequent bigrams:
('of', 'the'): 76501
('in', 'the'): 49692
('this', 'movie'): 30629
('and', 'the'): 26166
('is', 'a'): 25893
('the', 'film'): 24655
('to', 'the'): 23496
('to', 'be'): 23124
('the', 'movie'): 22739
('this', 'film'): 21166

Top 10 most frequent trigrams:
('one', 'of', 'the'): 9726
('this', 'movie', 'is'): 5170
('of', 'the', 'film'): 4790
('this', 'is', 'a'): 4713
('a', 'lot', 'of'): 4649
('of', 'the', 'movie'): 4147
('some', 'of', 'the'): 3739
('the', 'film', 'is'): 3623
('is', 'one', 'of'): 3530
('this', 'film', 'is'): 3455
Probability of positive reviews: 0.5018756806905732, Probability of negative reviews: 0.4981243193094268

Bigram Sentence