# Experiment 6
**Aim**: Implement Naïve Bayes theorem to classify the English text.

In [1]:
import numpy as np
import pandas as pd

# Defining the dataset

In [3]:
data = {
    "corpus": [
        "I love this movie, it's so entertaining!",
        "The weather is terrible today.",
        "This book is amazing, highly recommended.",
        "I don't like the taste of this dish.",
        "The performance was outstanding, I was truly impressed.",
        "I hate waiting in long lines.",
        "The food at the restaurant was delicious.",
        "The service was very slow and disappointing.",
        "The concert was fantastic, I had a great time.",
        "I'm not a fan of the new design.",
        "The team played poorly, it was a disappointing match.",
        "I adore the artwork in this museum.",
        "The customer support was helpful and friendly.",
        "I can't stand the noise in this neighborhood.",
        "The movie was boring, I fell asleep halfway through.",
        "The software is user-friendly and efficient.",
        "The traffic was unbearable, I was stuck for hours.",
        "This product is a waste of money.",
        "The atmosphere in the cafe was cozy and inviting.",
        "I'm impressed with the quality of this product.",
        "The hotel room was dirty and smelled bad.",
        "I enjoy spending time with my friends.",
        "The play was thought-provoking and well-executed.",
        "I couldn't stop laughing, the comedy show was hilarious.",
        "The experience was underwhelming, I expected more.",
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0]
}
df = pd.DataFrame(data)

# Separating data by class labels

In [4]:
positive = df[df['label'] == 1]
negative = df[df['label'] == 0]

print("No. of Positive: ", len(positive))
print("No. of Negative: ", len(negative))

No. of Positive:  12
No. of Negative:  13





## Calculating `Prior` probabilities

In [7]:
total = len(df)
prior_positive = len(positive) / total
prior_negative = len(negative) / total

print("Prior probability of Positive samples: ", prior_positive)
print("Prior probability of Negative samples: ", prior_negative)

Prior probability of Positive samples:  0.48
Prior probability of Negative samples:  0.52


## Creating Vocabulary

In [8]:
positive_text = ' '.join(positive['corpus']).split()
negative_text = ' '.join(negative['corpus']).split()

vocabulary = list(set(negative_text + positive_text))
print('Vocabulary: \n', vocabulary)

Vocabulary: 
 ['disappointing', 'like', "can't", 'my', 'with', 'play', 'fan', 'movie,', 'product', 'inviting.', 'match.', 'slow', 'comedy', 'food', 'lines.', 'dirty', 'room', 'noise', 'stop', 'was', "don't", 'friends.', 'artwork', 'smelled', 'friendly.', 'for', 'hilarious.', 'laughing,', 'through.', 'fantastic,', 'adore', 'and', 'amazing,', 'quality', 'poorly,', 'show', 'of', 'at', 'played', 'recommended.', 'new', 'service', 'atmosphere', 'expected', 'neighborhood.', 'fell', 'This', 'efficient.', 'great', 'today.', 'cozy', 'dish.', 'concert', 'is', 'stuck', 'taste', 'product.', 'The', 'not', 'well-executed.', 'underwhelming,', 'cafe', 'stand', 'weather', 'design.', 'this', 'time', 'museum.', 'unbearable,', 'halfway', 'had', 'a', 'outstanding,', 'entertaining!', 'so', "couldn't", 'impressed', 'thought-provoking', 'hours.', 'long', 'team', 'delicious.', 'waste', 'spending', 'waiting', 'highly', 'in', 'bad.', 'restaurant', 'time.', 'movie', 'support', 'user-friendly', 'impressed.', 'terri

### Calculating Word Frequencies

In [12]:
positive_word_freq = { word: positive_text.count(word) for word in vocabulary }
negative_word_freq = { word: negative_text.count(word) for word in vocabulary }

word_data = {
    'Positive Word Frequency': positive_word_freq,
    'Negative Word Frequency': negative_word_freq,
}
word_df = pd.DataFrame(word_data)
word_df.head(10)

Unnamed: 0,Positive Word Frequency,Negative Word Frequency
disappointing,0,1
like,0,1
can't,0,1
my,1,0
with,2,0
play,1,0
fan,0,1
"movie,",1,0
product,0,1
inviting.,1,0


## Classifying New Text

In [15]:
test_data = {
    "corpus": [
        "This restaurant serves delicious food and has excellent service.",
        "I'm really disappointed with the customer service I received.",
        "The weather is perfect for outdoor activities today.",
        "I can't believe how fast and efficient the delivery service was.",
        "The movie was a complete waste of time and money, I regret watching it."
    ],
    "label": [1, 0, 1, 1, 0]
}
test_df = pd.DataFrame(test_data)

In [25]:
def predict(string, positive_word_freq, negative_word_freq, prior_positive, prior_negative, smoothing_factor = 1): # Using Laplace Smoothing for Likelihood Calculation
   
    new_text_words = string.split()
    likelihood_positive = 1
    for word in new_text_words:
        likelihood_positive *= (positive_word_freq.get(word, 0) + smoothing_factor) / (len(positive_text) + smoothing_factor * len(vocabulary))

    likelihood_negative = 1
    for word in new_text_words:
        likelihood_negative *= (negative_word_freq.get(word, 0) + smoothing_factor) / (len(negative_text) + smoothing_factor * len(vocabulary))

    # Apply Naive Bayes formula
    posterior_positive = prior_positive * likelihood_positive
    posterior_negative = prior_negative * likelihood_negative

    # Classify based on the higher posterior probability
    predicted_class = 1 if posterior_positive > posterior_negative else 0
    return predicted_class


### Predicting Values


In [27]:
test_df['Predicted Label'] = [
    predict(
        string, positive_word_freq, negative_word_freq, prior_positive, prior_negative
    ) for string in test_df['corpus']
]

In [28]:
test_df.head()

Unnamed: 0,corpus,label,Predicted Label
0,This restaurant serves delicious food and has ...,1,1
1,I'm really disappointed with the customer serv...,0,1
2,The weather is perfect for outdoor activities ...,1,0
3,I can't believe how fast and efficient the del...,1,0
4,The movie was a complete waste of time and mon...,0,0
