# Naive Bayes Classifier Simulation on IMDB Dataset

Name: Stanley Nathanael Wijaya
<br>
NIM: 2702217125

## Import Library

In [13]:
import re
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split

## Preprocessing Function

In [14]:
def preprocess_text(text):
    """Normalize a review and return its unigram and bigram tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, the remainder is split on
    whitespace, and stopwords are dropped. Bigrams are built from adjacent
    words that survive stopword removal, so they may bridge a removed word.

    Args:
        text: Raw review string.

    Returns:
        A list holding the filtered words followed by the space-joined
        bigrams of those words.
    """
    stopwords = {
        "is", "it", "this", "the", "i", "so", "be", "to",
        "and", "a", "they", "may", "in", "on", "for", "ever",
    }

    cleaned = re.sub(r"[^\w\s]", "", text.lower())
    tokens = [token for token in cleaned.split() if token not in stopwords]

    # Pair every token with its successor; a 0- or 1-token list yields none.
    pairs = [f"{left} {right}" for left, right in zip(tokens, tokens[1:])]
    return tokens + pairs


## Datasets

In [15]:
# Labelled reviews used to fit the classifier.
# Each entry is (label, review) where "+" is positive and "-" is negative.
training_data = [
    ("-", "It is Boring and it is a bad super hero movie"),
    ("+", "The best movie i have ever seen so far in 2022"),
    ("+", "This may be the best batman movie ever"),
    ("+", "They are pretty good"),
    ("+", "The prop and tech design is so good and fun"),
    ("-", "May be the least entertaining batman movie"),
    ("+", "Really enjoyed this movie"),
    ("-", "The batman proves to be bad and violent movie"),
    ("-", "So boring and bleak and cynical"),
]

# Unlabelled reviews to classify; "?" is a placeholder for the unknown label.
test_data = [
    ("?", "Boring and violent"),
    ("?", "Enjoyed just watched this movie"),
    ("?", "The first half is good but i hate the ending"),
    ("?", "Pretty good and fun movie"),
]

## Tokenizing

In [16]:
# Split the labelled reviews into two parallel lists: the tokenized text
# (unigrams + bigrams from preprocess_text) and the matching class label.
# Comprehensions replace the manual append loop and keep the loop variables
# out of the notebook's global namespace.
train_texts = [preprocess_text(text) for _, text in training_data]
train_labels = [label for label, _ in training_data]

## Naive Bayes Model

In [17]:
class NaiveBayesClassifier:
    """Two-class ("+" / "-") Naive Bayes text classifier with manual boosts.

    Token counts are accumulated per class and turned into smoothed
    likelihoods at prediction time. On top of plain counting, tokens listed
    in ``negative_boost`` are counted with a larger weight, and a boosted
    token that directly follows a contradictory word (e.g. "but") adds extra
    weight to the negative class.

    Attributes:
        smoothing: Additive (Laplace) smoothing constant.
        boost_weight: Count added for each boosted token occurrence.
        vocab: Set of every token seen during training.
        word_counts: Per-class mapping of token -> weighted count.
        class_counts: Per-class number of training documents.
        total_words: Per-class sum of weighted counts (set by ``train``).
        vocab_size: ``len(vocab)`` as of the last ``train`` call.
    """

    def __init__(self, smoothing=1, boost_weight=5):
        """Create an untrained classifier.

        Args:
            smoothing: Additive smoothing constant used in ``predict``.
            boost_weight: Weight applied to strongly negative tokens and to
                the contradiction penalty. The default of 5 reproduces the
                previously hard-coded behaviour.
        """
        self.smoothing = smoothing
        self.boost_weight = boost_weight
        self.vocab = set()
        self.word_counts = {"+": defaultdict(int), "-": defaultdict(int)}
        self.class_counts = {"+": 0, "-": 0}

        # Defined up front so the attributes exist before train() is called.
        self.total_words = {"+": 0, "-": 0}
        self.vocab_size = 0

        # Strongly negative words that are counted with a higher weight.
        self.negative_boost = {"hate", "boring", "bad", "worst", "cynical", "bleak", "violent"}

        # Contradictory words whose following token deserves extra attention.
        self.contradictory_words = {"but", "however", "although", "though"}

    def train(self, texts, labels):
        """Accumulate weighted token counts from tokenized documents.

        Args:
            texts: Iterable of token sequences (one indexable sequence per
                document), e.g. the output of ``preprocess_text``.
            labels: Iterable of class labels aligned with ``texts``; each
                must be "+" or "-".

        Raises:
            KeyError: If a label other than "+" or "-" is supplied.
        """
        for words, label in zip(texts, labels):
            self.class_counts[label] += 1
            for i, word in enumerate(words):
                # Strongly negative words count boost_weight times.
                weight = self.boost_weight if word in self.negative_boost else 1
                self.word_counts[label][word] += weight
                self.vocab.add(word)

                # A contradictory word directly followed by a strongly
                # negative word adds an extra penalty to the negative class,
                # regardless of the document's own label.
                if word in self.contradictory_words and i < len(words) - 1:
                    next_word = words[i + 1]
                    if next_word in self.negative_boost:
                        self.word_counts["-"][next_word] += self.boost_weight

        self.total_words = {
            label: sum(counts.values())
            for label, counts in self.word_counts.items()
        }
        self.vocab_size = len(self.vocab)

    def predict(self, text):
        """Classify a raw review string.

        The text is tokenized with the module-level ``preprocess_text``.
        Must be called after ``train``; with no training documents the prior
        computation divides by zero.

        Args:
            text: Raw review string.

        Returns:
            "+" if the positive log-score is strictly greater than the
            negative one, otherwise "-" (ties go to "-").
        """
        words = preprocess_text(text)

        # Prior probabilities.
        total_samples = sum(self.class_counts.values())
        log_prob_pos = np.log(self.class_counts["+"] / total_samples)
        log_prob_neg = np.log(self.class_counts["-"] / total_samples)

        # The smoothed denominators do not depend on the word being scored.
        pos_denominator = self.total_words["+"] + self.smoothing * self.vocab_size
        neg_denominator = self.total_words["-"] + self.smoothing * self.vocab_size
        pos_counts = self.word_counts["+"]
        neg_counts = self.word_counts["-"]

        # Likelihoods with additive smoothing. .get() is used instead of
        # indexing so unseen words are not inserted into the defaultdict
        # counters, which would mutate the trained model on every prediction.
        for word in words:
            pos_word_prob = (pos_counts.get(word, 0) + self.smoothing) / pos_denominator
            neg_word_prob = (neg_counts.get(word, 0) + self.smoothing) / neg_denominator

            log_prob_pos += np.log(pos_word_prob)
            log_prob_neg += np.log(neg_word_prob)

        return "+" if log_prob_pos > log_prob_neg else "-"

## Train and Predict

In [18]:
# Build the model with add-one smoothing (the constructor default) and fit
# it on the tokenized training reviews and their labels.
classifier = NaiveBayesClassifier(smoothing=1)
classifier.train(texts=train_texts, labels=train_labels)

## Result

In [19]:
# Classify each unlabelled review and print it next to the predicted label.
print("Sentiment Analysis:")
for _, text in test_data:  # the "?" placeholder label is not needed here
    prediction = classifier.predict(text)
    print(f"'{text}' -> {prediction}")

Sentiment Analysis:
'Boring and violent' -> -
'Enjoyed just watched this movie' -> +
'The first half is good but i hate the ending' -> +
'Pretty good and fun movie' -> +


## Conclusion

Based on the Sentiment Analysis Result:
<ul>
    <li>'Boring and violent' -> - (True Negative)</li>
    <li>'Enjoyed just watched this movie' -> + (True Positive)</li>
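    <li>'The first half is good but i hate the ending' -> + (Ambiguous: the review is mixed, but since it ends with "i hate the ending" the overall sentiment is arguably negative, so this is most likely a False Positive)</li>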
    <li>'Pretty good and fun movie' -> + (True Positive)</li>
</ul>

<code>Striving for Excellence 🔥❤️‍🔥</code>