In [7]:
import numpy as np
import math as math


## LOAD DATA
# Each line of the file is split at its first space: the text before it is
# parsed as an integer label, the text after it is kept as the review.
labels = []
reviews = []

with open("simple-food-reviews.txt", "r") as f:
    for raw_line in f.readlines():
        label_text, _sep, review_text = raw_line.replace("\n", "").partition(" ")
        labels.append(int(label_text))
        reviews.append(review_text)

## SANITY CHECK: THERE MUST BE EXACTLY ONE LABEL PER REVIEW BEFORE WE PRINT THEM
if len(labels) != len(reviews):
    print("DID NOT LOAD LABELS AND REVIEWS PROPERLY")
else:
    print("Labels : \n", labels)
    print("Reviews : \n", reviews)





Labels : 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Reviews : 
 ['the food is lovely', 'this is a great restaurant', 'i really enjoyed my food', 'i enjoyed the experience at the restaurant', 'we had a lovely meal', 'my food tasted great', 'the food was lovely and the service was not bad', 'the service was great', 'what a lovely restaurant', 'the food the service and the restaurant was great', 'this restaurant is lovely', 'the service is terrible', 'the food tasted awful', 'this is a bad restaurant  ', 'the food was really bad', 'the service and the food was terrible', 'we had a terrible experience', 'avoid this restaurant', 'avoid the food', 'the meal was terrible', 'the service was bad']


# Process Features

Below I create a bag of words from the dataset in reviews so that we have a list of unique words contained in our dataset.

In [8]:
import re 

# Vocabulary of the dataset: every distinct token, in order of first appearance.
bag_of_words = []
seen_words = set()

for review in reviews:
    # Lower-case the review and pull out its word tokens.
    for word in re.findall(r'\b\w+\b', review.lower()):
        if word in seen_words:
            continue  # already part of the vocabulary
        seen_words.add(word)
        bag_of_words.append(word)

## Print the vocabulary so that uniqueness can be checked by eye
print(bag_of_words)


##CREATE FEATURES

def create_features(review, vocabulary=None):
    """Encode a review as a binary bag-of-words feature vector.

    Parameters
    ----------
    review : str
        Review text. It is lower-cased and split into word tokens before
        being matched against the vocabulary.
    vocabulary : sequence of str, optional
        Words to test for, one feature per word, in this order. When omitted,
        the notebook-level ``bag_of_words`` list is used, which keeps existing
        ``create_features(review)`` calls working unchanged.

    Returns
    -------
    list of int
        One entry per vocabulary word: 1 if the word occurs in the review,
        0 otherwise.
    """
    if vocabulary is None:
        vocabulary = bag_of_words
    # A set makes each membership test O(1) instead of scanning the token list.
    review_words = set(re.findall(r'\b\w+\b', review.lower()))
    return [1 if word in review_words else 0 for word in vocabulary]


## CHECK create_features AGAINST THE KNOWN ENCODING OF THE 1ST REVIEW
# Expected vector: the first four vocabulary words are present, the rest absent.
correct_features = [
    1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0,
]
review_1_features = create_features(reviews[0])

print(f"Your review 1 features: {review_1_features}")
print(f"Correct review 1 features: {correct_features}\n")

if review_1_features != correct_features:
    print("create_features incorrect for first review: review_1_features != correct_features")
else:
    print("create_features correct for first review: review_1_features = correct_features")


## ENCODE EVERY REVIEW IN THE DATASET

review_features = list(map(create_features, reviews))


['the', 'food', 'is', 'lovely', 'this', 'a', 'great', 'restaurant', 'i', 'really', 'enjoyed', 'my', 'experience', 'at', 'we', 'had', 'meal', 'tasted', 'was', 'and', 'service', 'not', 'bad', 'what', 'terrible', 'awful', 'avoid']
Your review 1 features: [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Correct review 1 features: [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

create_features correct for first review: review_1_features = correct_features


# Train and Test Data Split

Below we split our data into training and testing sets. We do not create a validation split because we do not tune any hyperparameters in our model. The split is done manually here for convenience, although in professional practice it would be done randomly.

In [9]:
## TRAINING SPLIT (ROUGHLY 90% OF THE DATA)
print("###############")
print("# TRAIN SPLIT #")
print("###############\n")

# Everything except index 10 and the last element is used for training.
train_reviews, train_features, train_labels = (
    items[:10] + items[11:-1] for items in (reviews, review_features, labels)
)

for heading, values in (
    ("train_reviews:\n", train_reviews),
    ("train_features:\n", train_features),
    ("train_labels\n", train_labels),
):
    print(heading, values)

print("\n###############")
print("# TEST SPLIT #")
print("###############\n")

# TESTING SPLIT (ROUGHLY 10%): index 10 and the last element are held out.
test_reviews, test_features, test_labels = (
    [items[10], items[-1]] for items in (reviews, review_features, labels)
)

for heading, values in (
    ("test_reviews:\n", test_reviews),
    ("test_features:\n", test_features),
    ("test_labels\n", test_labels),
):
    print(heading, values)
                                                   

###############
# TRAIN SPLIT #
###############

train_reviews:
 ['the food is lovely', 'this is a great restaurant', 'i really enjoyed my food', 'i enjoyed the experience at the restaurant', 'we had a lovely meal', 'my food tasted great', 'the food was lovely and the service was not bad', 'the service was great', 'what a lovely restaurant', 'the food the service and the restaurant was great', 'the service is terrible', 'the food tasted awful', 'this is a bad restaurant  ', 'the food was really bad', 'the service and the food was terrible', 'we had a terrible experience', 'avoid this restaurant', 'avoid the food', 'the meal was terrible']
train_features:
 [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0,

# Naive Bayes Computations
## Below we compute all components of our Naive Bayes model before combining them

In [10]:
import numpy as np
import re

# PRIOR PROBABILITIES OF THE TWO CLASSES
# A label of -1 marks a negative review; any other label is counted as positive.
number_of_negative_reviews = sum(1 for label in train_labels if label == -1)
number_of_positive_reviews = len(train_labels) - number_of_negative_reviews

# Each prior is the fraction of training reviews that carry that label.
p_pos = number_of_positive_reviews / len(train_labels)
p_neg = number_of_negative_reviews / len(train_labels)
print(f"p_pos = {p_pos}")
print(f"p_neg = {p_neg}")





# CLASS-CONDITIONAL MODEL: P(word present | class) FOR EVERY VOCABULARY WORD
# Row 0 holds the positive class, row 1 holds the negative class.
class_conditionals = np.zeros((2, len(bag_of_words)))
print(f"number of positive reviews {number_of_positive_reviews}")
print(f"number of negative reviews in the training data {number_of_negative_reviews}")

# Tokenise every training review once, the same way the vocabulary was built.
tokenised_train_reviews = [
    re.findall(r'\b\w+\b', text.lower()) for text in train_reviews
]

# For each vocabulary word, count the positive and the negative training
# reviews that contain it at least once.
for i, vocabulary_word in enumerate(bag_of_words):
    word = vocabulary_word.lower()
    positive_count = 0
    negative_count = 0
    for k, tokens in enumerate(tokenised_train_reviews):
        if word not in tokens:
            continue
        if train_labels[k] == 1:
            positive_count += 1
        else:
            negative_count += 1
    # Fraction of reviews of each class in which the word appears.
    class_conditionals[0][i] = positive_count / number_of_positive_reviews
    class_conditionals[1][i] = negative_count / number_of_negative_reviews


print(f"\nclass_conditionals =\n {class_conditionals}")


p_pos = 0.5263157894736842
p_neg = 0.47368421052631576
number of positive reviews 10
number of negative reviews in the training data 9

class_conditionals =
 [[0.5        0.5        0.2        0.4        0.1        0.3
  0.4        0.4        0.2        0.1        0.2        0.2
  0.1        0.1        0.1        0.1        0.1        0.1
  0.3        0.2        0.3        0.1        0.1        0.1
  0.         0.         0.        ]
 [0.66666667 0.44444444 0.22222222 0.         0.22222222 0.22222222
  0.         0.22222222 0.         0.11111111 0.         0.
  0.11111111 0.         0.11111111 0.11111111 0.11111111 0.11111111
  0.33333333 0.11111111 0.22222222 0.         0.22222222 0.
  0.44444444 0.11111111 0.22222222]]


# Inference

Next we will infer the label for the first review in our test data, "this restaurant is lovely".



In [11]:
### INFERENCE FOR ONE HELD-OUT REVIEW

infer_review = test_reviews[0]
infer_features = test_features[0]

print("Infer Review: ", infer_review)
print("Infer Features: ", infer_features)


def _bernoulli_likelihood(features, word_probabilities):
    """Return P(features | class) assuming the features are independent.

    Parameters
    ----------
    features : sequence of int
        Binary feature vector (1 = word present, anything else = absent).
    word_probabilities : sequence of float
        P(word present | class) for each feature, in the same order.

    Returns
    -------
    float
        Product over all features of ``p`` where the word is present and
        ``1 - p`` where it is absent.
    """
    likelihood = 1.0
    # Pair every feature with ITS OWN word probability (position by position).
    for present, p_word in zip(features, word_probabilities):
        likelihood *= p_word if present == 1 else 1 - p_word
    return float(likelihood)


## CLASS-CONDITIONAL LIKELIHOODS OF THE OBSERVED FEATURES
## Row 0 of class_conditionals is the positive class, row 1 the negative class.
class_cond_pos = _bernoulli_likelihood(infer_features, class_conditionals[0])
class_cond_neg = _bernoulli_likelihood(infer_features, class_conditionals[1])

## BAYES' RULE
p_infer_features = float(p_pos * class_cond_pos + p_neg * class_cond_neg)  # P(X=infer_features)
if p_infer_features > 0:
    p_cond_pos = float(p_pos * class_cond_pos / p_infer_features)  # P(C=positive|infer_features)
    p_cond_neg = float(p_neg * class_cond_neg / p_infer_features)  # P(C=negative|infer_features)
else:
    # Both likelihoods are zero (the model has no smoothing), so the evidence
    # gives no information and the division would fail: fall back to the priors.
    p_cond_pos = float(p_pos)
    p_cond_neg = float(p_neg)




Infer Review:  this restaurant is lovely
Infer Features:  [0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [12]:
### PREDICT THE MOST LIKELY LABEL FOR THIS REVIEW
# Labels follow the dataset's convention: 1 = positive, -1 = negative.
# Pick the class with the larger posterior probability (ties go to positive);
# taking max() of the two posteriors would yield a probability, not a label.
pred_label = 1 if p_cond_pos >= p_cond_neg else -1

print(f"The predicted label for the review '{infer_review}' is {pred_label}")


The predicted label for the review 'this restaurant is lovely' is 1.0
