In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
# --- 1. Gathering the Data: a tiny, hand-written sentiment dataset ---
# Eight short movie reviews. The trailing comment on each line is the
# sentiment it is paired with in the label list (same position, same order).

reviews = [
    "I absolutely loved this movie, it was amazing!",       # positive
    "The acting was terrible and the plot was boring.",     # negative
    "A fantastic film with a wonderful story.",             # positive
    "I was so disappointed with the special effects.",      # negative
    "This is a masterpiece of modern cinema.",              # positive
    "I fell asleep halfway through, it was so slow.",       # negative
    "An instant classic, I'd watch it again and again.",    # positive
    "The worst movie I've seen all year. Waste of time.",   # negative
]


In [None]:
# Sentiment labels, one per review and in the same order as `reviews`.
# A positive review is encoded as 1 and a negative review as 0.
POSITIVE, NEGATIVE = 1, 0
sentiments = [
    POSITIVE, NEGATIVE,
    POSITIVE, NEGATIVE,
    POSITIVE, NEGATIVE,
    POSITIVE, NEGATIVE,
]

In [None]:
# Pair every review with its label in a two-column DataFrame:
# 'review_text' holds the raw text, 'sentiment' holds the 0/1 label.
reviews_df = pd.DataFrame(
    data={
        'review_text': reviews,
        'sentiment': sentiments,
    }
)

# Plain-text preview of the first rows, followed by a visual separator.
print("--- The First 5 Reviews in Our Dataset ---", reviews_df.head(), sep="\n")
print("\n" + "="*40 + "\n")

--- The First 5 Reviews in Our Dataset ---
                                        review_text  sentiment
0    I absolutely loved this movie, it was amazing!          1
1  The acting was terrible and the plot was boring.          0
2          A fantastic film with a wonderful story.          1
3   I was so disappointed with the special effects.          0
4           This is a masterpiece of modern cinema.          1




In [None]:
# --- 2. Preparing the Ritual: Splitting our data ---
# Feature (X): the raw review text.
# Target  (y): the 0/1 sentiment label belonging to each review.
X, y = reviews_df['review_text'], reviews_df['sentiment']

In [None]:
# Randomly hold out 20% of the reviews for testing and train on the rest.
# Fixing random_state keeps the shuffle, and therefore the split, the same
# on every run.
# NOTE: the split is not stratified, so with a dataset this small the test
# set can end up holding reviews of a single sentiment only.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]} reviews")
print(f"Testing set size: {X_test.shape[0]} reviews")
print("\n" + "="*40 + "\n")


Training set size: 6 reviews
Testing set size: 2 reviews




In [None]:
# --- 3. The Feature Engineering Ritual: Turning Text into Numbers ---
# Naive Bayes needs numeric features, so every review is turned into a
# vector of word counts with CountVectorizer.

vectorizer = CountVectorizer()
# The vocabulary is learned from the training reviews only ...
X_train_vectorized = vectorizer.fit_transform(X_train)
# ... and the test reviews are encoded with that same, already-fitted vocabulary.
X_test_vectorized = vectorizer.transform(X_test)

# Show the first training review next to its row of word counts.
print("--- Example of Vectorized Text (first review) ---", X_train.iloc[0], sep="\n")
print("Vectorized form (counts):", X_train_vectorized[0].toarray(), sep="\n")
print("\n" + "="*40 + "\n")

--- Example of Vectorized Text (first review) ---
I absolutely loved this movie, it was amazing!
Vectorized form (counts):
[[1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0]]




In [13]:
# --- 4. The Grand Ritual: Training the Naive Bayes Model ---
# Instantiate the (still untrained) Multinomial Naive Bayes classifier.
# alpha=1.0 is scikit-learn's documented default smoothing value; it is
# written out here only to make the setting visible to the reader.
model = MultinomialNB(alpha=1.0)

In [None]:
# Fit the classifier on the vectorized training reviews and their labels.
# During fitting the model estimates how likely each word is to appear in
# positive versus negative reviews.

# The status message has no placeholders, so it is a plain string literal
# rather than an f-string.
print("Training the Naive Bayes model...")
model.fit(X_train_vectorized, y_train)
print("Training complete! The model is ready to make prophecies.")
print("\n" + "="*40 + "\n")


Training the Naive Bayes model...
Training complete! The model is ready to make prophecies.




In [None]:
# --- 5. The First Prophecy: Making a prediction ---
# Ask the trained model for a sentiment label for every held-out test review.

y_pred = model.predict(X_test_vectorized)

print("--- The Model's Prophecies vs. The Real Outcomes ---")
sentiment_map = {0: 'Negative', 1: 'Positive'}
# Walk the test reviews, the predicted labels and the true labels in
# lockstep; all three are in the same positional order.
for review_text, predicted_label, true_label in zip(X_test, y_pred, y_test):
    print(f"Review: '{review_text}'")
    print(f"  Model's Prophecy: {sentiment_map[predicted_label]}")
    print(f"  True Outcome:     {sentiment_map[true_label]}\n")


--- The Model's Prophecies vs. The Real Outcomes ---
Review: 'The acting was terrible and the plot was boring.'
  Model's Prophecy: Negative
  True Outcome:     Negative

Review: 'I fell asleep halfway through, it was so slow.'
  Model's Prophecy: Positive
  True Outcome:     Negative



In [None]:
# --- 6. Evaluating the Prophecy's Accuracy ---
# Accuracy is the fraction of test reviews whose predicted label matches the
# true label; it is printed as a value between 0 and 1, not as a percentage.

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"The model's overall prediction accuracy: {accuracy:.2f}")


The model's overall prediction accuracy: 0.50
