In [1]:
import pandas as pd
import numpy as np
from NaivesBayes import NBSentimentModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

In [2]:
amazon = pd.read_csv("amazon.csv")
movie = pd.read_csv("IMDB Dataset.csv")
restaurant = pd.read_csv("Restaurant_Reviews.csv")

In [3]:
amazon_updated = amazon[['product_name', 'rating', 'review_title', 'review_content']]

In [4]:
print(restaurant.head())

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


In [5]:
print(movie.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
print(f"Number of data in the Amazon dataset: {len(amazon)}")
print(f"Number of data in the Movie dataset: {len(movie)}")
print(f"Number of data in the Restaurant dataset: {len(restaurant)}")

Number of data in the Amazon dataset: 1465
Number of data in the Movie dataset: 50000
Number of data in the Restaurant dataset: 1000


In [7]:
movie['binary_rating'] = movie['sentiment'].map({'positive': 1, 'negative': 0})
print(movie.head())

                                              review sentiment  binary_rating
0  One of the other reviewers has mentioned that ...  positive              1
1  A wonderful little production. <br /><br />The...  positive              1
2  I thought this was a wonderful way to spend ti...  positive              1
3  Basically there's a family where a little boy ...  negative              0
4  Petter Mattei's "Love in the Time of Money" is...  positive              1


In [8]:
movie['binary_rating'] = pd.to_numeric(movie['binary_rating'], errors='coerce')

In [9]:
counts = restaurant['Liked'].value_counts()

count_of_1 = counts.get(1, 0)
count_of_0 = counts.get(0, 3)

print(f"Number of 1s: {count_of_1}")
print(f"Number of 0s: {count_of_0}")

Number of 1s: 500
Number of 0s: 500


In [10]:
counts = movie['binary_rating'].value_counts()

count_of_1 = counts.get(1, 0)
count_of_0 = counts.get(0, 3)

print(f"Number of 1s: {count_of_1}")
print(f"Number of 0s: {count_of_0}")

Number of 1s: 25000
Number of 0s: 25000


In [11]:
amazon_updated['rating'] = pd.to_numeric(amazon_updated['rating'], errors='coerce')

amazon_updated['binary_rating'] = (amazon_updated['rating'] >= 3).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_updated['rating'] = pd.to_numeric(amazon_updated['rating'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_updated['binary_rating'] = (amazon_updated['rating'] >= 3).astype(int)


In [12]:
counts = amazon_updated['binary_rating'].value_counts()

count_of_1 = counts.get(1, 0)
count_of_0 = counts.get(0, 3)

print(f"Number of 1s: {count_of_1}")
print(f"Number of 0s: {count_of_0}")

Number of 1s: 1458
Number of 0s: 7


In [13]:
newdf_0 = movie[movie['binary_rating'] == 0]
newdf_1 = movie[movie['binary_rating'] == 1]

newdf_1_downsampled = newdf_1.sample(n=500, random_state=42)
newdf_0_downsampled = newdf_0.sample(n=500, random_state=42)
movie = pd.concat([newdf_0_downsampled, newdf_1_downsampled])
movie = movie.sample(frac=1, random_state=42).reset_index(drop=True)

print(movie['binary_rating'].value_counts())

binary_rating
1    500
0    500
Name: count, dtype: int64


In [14]:
train_data, test_data = train_test_split(movie, test_size=0.2, random_state=42)
train_data, dev_data = train_test_split(train_data, test_size=0.4, random_state=42)

In [15]:
train_sentences = train_data['review'].fillna("").tolist()
train_labels = train_data['binary_rating'].tolist()
dev_sentences = dev_data['review'].fillna("").tolist()
dev_labels = dev_data['binary_rating'].tolist()
test_sentences = test_data['review'].fillna("").tolist()
test_labels = test_data['binary_rating'].tolist()

In [16]:
best_accuracy = 0
best_ngram_size = 1

# Try n-gram sizes from 1 to 8
for ngram_size in range(1, 12):
    model = NBSentimentModel(ngram_size=ngram_size)
    model.fit(train_sentences, train_labels)
    predicted_dev_ratings = model.predict(dev_sentences)
    accuracy = accuracy_score(dev_labels, predicted_dev_ratings)
    print(f"Model accuracy on dev data with n-gram size {ngram_size}: {accuracy}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_ngram_size = ngram_size

print(f"Best n-gram size based on dev data: {best_ngram_size}")

Model accuracy on dev data with n-gram size 1: 0.65
Model accuracy on dev data with n-gram size 2: 0.7375
Model accuracy on dev data with n-gram size 3: 0.759375
Model accuracy on dev data with n-gram size 4: 0.76875
Model accuracy on dev data with n-gram size 5: 0.790625
Model accuracy on dev data with n-gram size 6: 0.7875
Model accuracy on dev data with n-gram size 7: 0.790625
Model accuracy on dev data with n-gram size 8: 0.809375
Model accuracy on dev data with n-gram size 9: 0.79375
Model accuracy on dev data with n-gram size 10: 0.76875
Model accuracy on dev data with n-gram size 11: 0.75
Best n-gram size based on dev data: 8


In [17]:
#Best ngram-size is 8 after testing on dev set
model = NBSentimentModel(ngram_size=8)
model.fit(train_sentences, train_labels)

In [18]:
predicted_ratings = model.predict(test_sentences)
test_data['predicted_rating'] = predicted_ratings

In [19]:
pd.set_option('display.max_rows', None)
print(test_data[['binary_rating', 'predicted_rating']])

     binary_rating  predicted_rating
521              1                 1
737              0                 0
740              1                 1
660              0                 0
411              1                 1
678              1                 1
626              0                 0
513              0                 0
859              1                 1
136              0                 0
811              0                 0
76               1                 0
636              0                 0
973              1                 1
938              0                 0
899              1                 1
280              1                 1
883              0                 0
761              1                 1
319              0                 0
549              0                 0
174              0                 1
371              1                 1
527              0                 0
210              0                 0
235              0                 0
1

In [20]:
accuracy = accuracy_score(test_labels, predicted_ratings)
print(f"Model accuracy on test data: {accuracy}")

Model accuracy on test data: 0.865


In [23]:
train_data, test_data = train_test_split(restaurant, test_size=0.2, random_state=42)
test_sentences = test_data['Review'].fillna("").tolist()
test_labels = test_data['Liked'].tolist()

In [24]:
predicted_ratings = model.predict(test_sentences)
test_data['predicted_rating'] = predicted_ratings

In [25]:
accuracy = accuracy_score(test_labels, predicted_ratings)
print(f"Model accuracy on test data: {accuracy}")

Model accuracy on test data: 0.7


In [29]:
train_data, test_data = train_test_split(amazon_updated, test_size=0.2, random_state=42)
test_sentences = test_data['review_content'].fillna("").tolist()
test_labels = test_data['binary_rating'].tolist()

In [30]:
predicted_ratings = model.predict(test_sentences)
test_data['predicted_rating'] = predicted_ratings

In [31]:
accuracy = accuracy_score(test_labels, predicted_ratings)
print(f"Model accuracy on test data: {accuracy}")

Model accuracy on test data: 0.6621160409556314
