In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.probability import FreqDist
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load the 'punkt' resource, while NLTK >= 3.8.2 looks for
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Step 1: Load the data
# Later cells read the 'review' (text) and 'polarity' (label) columns of this
# frame.
# NOTE(review): the absolute '/content/...' path looks environment-specific —
# adjust it when running the notebook somewhere else.
data = pd.read_csv('/content/google_play_store_apps_reviews.csv')

# Step 2: Split the data
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Getting N-grams
def get_ngrams(text, n):
    """Tokenize ``text`` and return its word-level n-grams.

    Args:
        text: The review text. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas produces for an empty CSV cell) are
            treated as containing no tokens.
        n: Size of each n-gram (e.g. 2 for bigrams).

    Returns:
        A list of ``n``-tuples of tokens in order of appearance. The list is
        empty when ``text`` is not a string.
    """
    # nltk.word_tokenize raises on non-string input; a missing review should
    # simply contribute no n-grams instead of aborting training/evaluation.
    if not isinstance(text, str):
        return []
    tokens = nltk.word_tokenize(text)
    return list(ngrams(tokens, n))

In [None]:
# Step 3: Build the n-gram Language Model
def train_ngram(data, n):
    """Count the n-grams that occur in positive and in negative reviews.

    Args:
        data: DataFrame with a 'review' text column and a 'polarity' label
            column. Rows whose polarity equals 1 feed the positive counts,
            rows whose polarity equals 0 feed the negative counts, and rows
            with any other polarity are ignored.
        n: Size of the n-grams to extract from each review.

    Returns:
        A ``(positive_freq, negative_freq)`` pair of ``FreqDist`` objects
        mapping each n-gram to how many times it was seen in that class.
    """
    pos_counts = FreqDist()
    neg_counts = FreqDist()

    for _, review_row in data.iterrows():
        # Tokenize every row, even those whose label ends up being ignored.
        review_grams = get_ngrams(review_row['review'], n)
        label = review_row['polarity']
        if label == 1:
            pos_counts.update(review_grams)
        elif label == 0:
            neg_counts.update(review_grams)

    return pos_counts, neg_counts

# Step 4: Train the Model
# n is the n-gram size shared by training and evaluation (2 = bigrams).
n = 2
# Per-class n-gram counts, built from the training split only.
positive_freq, negative_freq = train_ngram(train_data, n)

In [None]:
# Step 5: Test the n-gram model
def test_ngram(test_data, positive_freq, negative_freq, n):
    pred_labels = []
    total_reviews = len(test_data)
    print(f"Total reviews to process: {total_reviews}")

    for index, (i, row) in enumerate(test_data.iterrows(), 1):
        grams = get_ngrams(row['review'], n)
        pos_score = 0
        neg_score = 0

        for gram in grams:
            pos_score += positive_freq[gram]
            neg_score += negative_freq[gram]

        if pos_score > neg_score:
            pred_labels.append(1)  # Positive
        else:
            pred_labels.append(0)  # Negative or neutral


        if index % 100 == 0:
            print(f"Processed {index}/{total_reviews} reviews.")


    if total_reviews % 100 != 0:
        print(f"Processed {total_reviews}/{total_reviews} reviews.")

    return pred_labels

In [None]:
# Step 6: Evaluate the model on the test set
# Predict on the held-out split, then compare with its 'polarity' labels.
pred_labels = test_ngram(test_data, positive_freq, negative_freq, n)
actual_labels = test_data['polarity'].tolist()
# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(actual_labels, pred_labels)
print(f"Accuracy: {accuracy}")

Total reviews to process: 179
Processed 100/179 reviews.
Processed 179/179 reviews.
Accuracy: 0.7653631284916201
