In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.probability import FreqDist
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('popular')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Step 1: Load the data
data = pd.read_csv('/content/drive/MyDrive/NLP_CA1/google_play_store_apps_reviews.csv')

# Step 2: Split the data
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)

In [20]:
# Step 3: Build the n-gram Language Model
def get_ngrams(text, n):
    tokens = nltk.word_tokenize(text)
    return list(ngrams(tokens, n))

def train_ngram(data, n):
    positive_ngrams = []
    negative_ngrams = []

    for index, row in data.iterrows():
        grams = get_ngrams(row['review'], n)
        if row['polarity'] == 1:
            positive_ngrams.extend(grams)
        elif row['polarity'] == 0:
            negative_ngrams.extend(grams)

    positive_freq = FreqDist(positive_ngrams)
    negative_freq = FreqDist(negative_ngrams)

    return positive_freq, negative_freq

# Step 4: Train the Model
n = 5  # Change to the desired n-gram size
positive_freq, negative_freq = train_ngram(train_data ,  n)

precision, recall, f1_score = test_ngram(test_data, positive_freq, negative_freq, n)

print("n : " , n)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

# positive_freq, negative_freq

n :  5
Precision: 0.5714285714285714
Recall: 0.07547169811320754
F1 Score: 0.13333333333333333


In [10]:
from nltk.util import ngrams
from nltk import FreqDist
import nltk

def test_ngram(test_data, positive_freq, negative_freq, n):
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0

    for index, row in test_data.iterrows():
        grams = get_ngrams(row['review'], n)
        pos_score = sum([positive_freq.get(gram, 0) for gram in grams])
        neg_score = sum([negative_freq.get(gram, 0) for gram in grams])

        if pos_score > neg_score:
            predicted_polarity = 1
        else:
            predicted_polarity = 0

        if row['polarity'] == 1 and predicted_polarity == 1:
            true_positives += 1
        elif row['polarity'] == 0 and predicted_polarity == 1:
            false_positives += 1
        elif row['polarity'] == 1 and predicted_polarity == 0:
            false_negatives += 1
        elif row['polarity'] == 0 and predicted_polarity == 0:
            true_negatives += 1

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)

    print("n : " , n)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

    return precision, recall, f1_score

# Test the n-gram model on test data
n = 6  # Change to the desired n-gram size
precision, recall, f1_score = test_ngram(test_data, positive_freq, negative_freq, n)

print("n : " , n)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

n :  5
Precision: 0.5714285714285714
Recall: 0.07547169811320754
F1 Score: 0.13333333333333333
