In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
import re
stopwords = set([
    "the", "and", "was", "is", "in", "to", "of", "for", "a", "an", "this", "that",
    "it", "on", "with", "as", "at", "by", "be", "are", "from", "but", "not", "you",
    "i", "they", "he", "she", "we", "do", "does", "did", "have", "has", "had", "my"
])
def preprocess(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    tokens = text.split()
    return [word for word in tokens if word not in stopwords]

df['tokens'] = df['review'].apply(preprocess)

In [7]:
from collections import Counter
all_tokens = [token for tokens in df['tokens'] for token in tokens]
common_words = Counter(all_tokens).most_common(10000)
vocab = {word: idx for idx, (word, _) in enumerate(common_words)}

In [8]:
doc_freq = {}
total_docs = len(df)

for tokens in df['tokens']:
    unique_tokens = set(tokens)
    for token in unique_tokens:
        doc_freq[token] = doc_freq.get(token, 0) + 1

def vectorize_tfidf(tokens, vocab, doc_freq, total_docs):
    vec = np.zeros(len(vocab))
    term_counts = Counter(tokens)
    for token, count in term_counts.items():
        if token in vocab:
            tf = count / len(tokens)
            dfreq = doc_freq.get(token, 1)
            idf = np.log((total_docs + 1) / (dfreq + 1)) + 1
            vec[vocab[token]] = tf * idf
    return vec

df['tfidf'] = df['tokens'].apply(lambda tokens: vectorize_tfidf(tokens, vocab, doc_freq, total_docs))

In [11]:
X = np.stack(df['tfidf'].values)
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [12]:
def train_naive_bayes(X, y, alpha=0.5):
    class_count = np.bincount(y)
    n_classes = len(class_count)
    n_features = X.shape[1]
    
    log_class_priors = np.log(class_count / len(y))
    word_counts = np.zeros((n_classes, n_features))
    
    for c in range(n_classes):
        word_counts[c] = X[y == c].sum(axis=0)
    
    word_probs = (word_counts + alpha) / (word_counts.sum(axis=1)[:, None] + alpha * n_features)
    log_word_probs = np.log(word_probs)
    
    return log_class_priors, log_word_probs

In [14]:
def predict_naive_bayes(X, log_class_priors, log_word_probs):
    log_probs = X @ log_word_probs.T + log_class_priors
    return np.argmax(log_probs, axis=1)

log_class_priors, log_word_probs = train_naive_bayes(X_train, y_train, alpha=0.5)
y_pred = predict_naive_bayes(X_test, log_class_priors, log_word_probs)

accuracy = np.mean(y_pred == y_test)
print(f" Naive Bayes Accuracy (TF-IDF + stopwords removed): {accuracy:.4f}")

 Naive Bayes Accuracy (TF-IDF + stopwords removed): 0.8632


In [15]:
# Pickle the model

import pick;e
with open('naive_bayes_model.pkl', 'wb') as model_file:
    pickle.dump((log_class_priors, log_word_probs), model_file)
    
# Pickle the vocab and doc_freq
with open('vocab.pkl', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)

with open('doc_freq.pkl', 'wb') as doc_freq_file:
    pickle.dump(doc_freq, doc_freq_file)

NameError: name 'pickle' is not defined