## Sentiment Analysis using Bag of Words + Logistic Regression

We will classify movie reviews as Positive or Negative.

Python

scikit-learn (ML)

pandas (data handling)

CountVectorizer (BoW)

In [2]:
#sample dataset

In [13]:
import pandas as pd

data = {
    'review': [
        "I loved the movie, it was fantastic!",
        "Absolutely terrible movie, I hated it",
        "It was a wonderful performance",
        "The movie was boring and slow",
        "Such a brilliant and touching story",
        "Worst film I have ever seen",
        "Amazing direction and great acting",
        "Not worth watching, very disappointing",
        "I regret watching this film",        # 0
    "It was a total waste of time",       # 0
    "Not recommended at all",             # 0
    "I fell asleep halfway through",      # 0
    "One of the worst movies I've seen",  # 0
    "This movie made no sense",           # 0
    "I didn't enjoy a single scene",      # 0
    "The plot was predictable and dull",  # 0
    ],
    'sentiment': [1, 0, 1, 0, 1, 0, 1, 0,0,0,0,0,0,0,0,0]  # 1 = Positive, 0 = Negative
}

df = pd.DataFrame(data)
print(df)


                                    review  sentiment
0     I loved the movie, it was fantastic!          1
1    Absolutely terrible movie, I hated it          0
2           It was a wonderful performance          1
3            The movie was boring and slow          0
4      Such a brilliant and touching story          1
5              Worst film I have ever seen          0
6       Amazing direction and great acting          1
7   Not worth watching, very disappointing          0
8              I regret watching this film          0
9             It was a total waste of time          0
10                  Not recommended at all          0
11           I fell asleep halfway through          0
12       One of the worst movies I've seen          0
13                This movie made no sense          0
14           I didn't enjoy a single scene          0
15       The plot was predictable and dull          0


In [14]:
#Convert Text → Numbers using Bag of Words

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])

# View feature names (words used)
print(vectorizer.get_feature_names_out())

# View BoW representation
print(X.toarray())


['absolutely' 'acting' 'all' 'amazing' 'and' 'asleep' 'at' 'boring'
 'brilliant' 'didn' 'direction' 'disappointing' 'dull' 'enjoy' 'ever'
 'fantastic' 'fell' 'film' 'great' 'halfway' 'hated' 'have' 'it' 'loved'
 'made' 'movie' 'movies' 'no' 'not' 'of' 'one' 'performance' 'plot'
 'predictable' 'recommended' 'regret' 'scene' 'seen' 'sense' 'single'
 'slow' 'story' 'such' 'terrible' 'the' 'this' 'through' 'time' 'total'
 'touching' 've' 'very' 'was' 'waste' 'watching' 'wonderful' 'worst'
 'worth']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 

Now every review becomes a vector (matrix of 1s and 0s / counts) → this is your ML input feature.

#Build a Model (Logistic Regression)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train
model = LogisticRegression()
model.fit(X_train, y_train)


In [17]:
#Predict and Evaluate

In [18]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.00      0.00      0.00         1

    accuracy                           0.75         4
   macro avg       0.38      0.50      0.43         4
weighted avg       0.56      0.75      0.64         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
#Try It on Your Own Text!

In [20]:
sample_text = ["This movie was a masterpiece", "I regret watching this film"]

# Vectorize the new input
sample_vec = vectorizer.transform(sample_text)

# Get predicted probabilities
pred_probs = model.predict_proba(sample_vec)

# Apply threshold to convert to class labels (0 or 1)
predictions = (pred_probs[:, 1] > 0.5).astype(int)

# Show result with sentiment label
for text, pred in zip(sample_text, predictions):
    label = "Positive" if pred == 1 else "Negative"
    print(f"Review: {text} → Sentiment: {label}")


Review: This movie was a masterpiece → Sentiment: Negative
Review: I regret watching this film → Sentiment: Negative


In [21]:
# 📦 Install if not already installed
# pip install scikit-learn pandas

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# 🔹 Step 1: Sample Dataset
data = {
    'review': [
        "I loved the movie, it was fantastic!",
        "Absolutely terrible movie, I hated it",
        "It was a wonderful performance",
        "The movie was boring and slow",
        "Such a brilliant and touching story",
        "Worst film I have ever seen",
        "Amazing direction and great acting",
        "Not worth watching, very disappointing"
    ],
    'sentiment': [1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Positive, 0 = Negative
}

df = pd.DataFrame(data)

# 🔹 Step 2: Feature Extraction using Bag of Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# 🔹 Step 3: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 🔹 Step 4: Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 🔹 Step 5: Evaluate on Test Data
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 🔹 Step 6: Predict on New Reviews
sample_text = ["This movie was a masterpiece", "I regret watching this film"]

# Vectorize new text
sample_vec = vectorizer.transform(sample_text)

# Predict probabilities
pred_probs = model.predict_proba(sample_vec)

# Convert probabilities to class labels (0 = Negative, 1 = Positive)
predictions = (pred_probs[:, 1] > 0.5).astype(int)

# 🔹 Step 7: Output the Predictions
for text, pred, prob in zip(sample_text, predictions, pred_probs[:, 1]):
    label = "Positive" if pred == 1 else "Negative"
    confidence = round(prob * 100, 2)
    print(f"Review: \"{text}\" → Sentiment: {label} ({confidence}%)")


Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Review: "This movie was a masterpiece" → Sentiment: Positive (66.5%)
Review: "I regret watching this film" → Sentiment: Positive (64.05%)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# 🔹 Step 1: Improved Dataset with More Negative Reviews
data = {
    'review': [
        "I loved the movie, it was fantastic!",
        "Absolutely terrible movie, I hated it",
        "It was a wonderful performance",
        "The movie was boring and slow",
        "Such a brilliant and touching story",
        "Worst film I have ever seen",
        "Amazing direction and great acting",
        "Not worth watching, very disappointing",
        "I regret watching this film",               # 👈 Added
        "This movie was a complete disaster",        # 👈 Added
        "One of the worst movies I've seen",         # 👈 Added
        "Great story and excellent acting",          # 👈 Added
        "I didn't enjoy a single scene",             # 👈 Added
        "The plot was predictable and dull"          # 👈 Added
    ],
    'sentiment': [1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]
}

df = pd.DataFrame(data)

# 🔹 Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# 🔹 Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 🔹 Step 4: Train Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 🔹 Step 5: Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 🔹 Step 6: Predict New Reviews
sample_text = ["This movie was a masterpiece", "I regret watching this film"]
sample_vec = vectorizer.transform(sample_text)
pred_probs = model.predict_proba(sample_vec)
predictions = (pred_probs[:, 1] > 0.5).astype(int)

# 🔹 Step 7: Show Results
for text, pred, prob in zip(sample_text, predictions, pred_probs[:, 1]):
    label = "Positive" if pred == 1 else "Negative"
    confidence = round(prob * 100, 2)
    print(f"Review: \"{text}\" → Sentiment: {label} ({confidence}%)")


Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       0.00      0.00      0.00         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4

Review: "This movie was a masterpiece" → Sentiment: Negative (26.97%)
Review: "I regret watching this film" → Sentiment: Negative (23.6%)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# 🔹 Step 1: Improved Balanced Dataset
data = {
    'review': [
        # Positive
        "This movie was a masterpiece",
        "Absolutely loved the film",
        "Incredible acting and story",
        "The direction was brilliant",
        "A cinematic gem, totally loved it",
        "Great plot and character development",
        "One of the best movies ever made",
        "A touching and emotional story",
        "Wonderful experience from start to end",
        "This film blew my mind",

        # Negative
        "I regret watching this film",
        "It was a total waste of time",
        "The plot was dull and boring",
        "Worst movie I've seen this year",
        "The acting was terrible",
        "Not worth the time or money",
        "Disappointing from start to finish",
        "I fell asleep halfway through",
        "A disaster of a movie",
        "Painfully slow and meaningless"
    ],
    'sentiment': [1]*10 + [0]*10
}

df = pd.DataFrame(data)

# 🔹 Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# 🔹 Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 🔹 Step 4: Train Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 🔹 Step 5: Evaluate
print("Accuracy:", accuracy_score(y_test, model.predict(X_test)))
print("Classification Report:\n", classification_report(y_test, model.predict(X_test)))

# 🔹 Step 6: Predict New Reviews
sample_text = ["This movie was a masterpiece", "I regret watching this film"]
sample_vec = vectorizer.transform(sample_text)
pred_probs = model.predict_proba(sample_vec)
predictions = (pred_probs[:, 1] > 0.5).astype(int)

# 🔹 Step 7: Show Results
for text, pred, prob in zip(sample_text, predictions, pred_probs[:, 1]):
    label = "Positive" if pred == 1 else "Negative"
    confidence = round(prob * 100, 2)
    print(f"Review: \"{text}\" → Sentiment: {label} ({confidence}%)")


Accuracy: 0.4
Classification Report:
               precision    recall  f1-score   support

           0       0.40      1.00      0.57         2
           1       0.00      0.00      0.00         3

    accuracy                           0.40         5
   macro avg       0.20      0.50      0.29         5
weighted avg       0.16      0.40      0.23         5

Review: "This movie was a masterpiece" → Sentiment: Negative (39.24%)
Review: "I regret watching this film" → Sentiment: Negative (39.5%)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# 🔹 Step 1: Improved Balanced Dataset
data = {
    'review': [
        # Positive
        "This movie was a masterpiece",
        "Absolutely loved the film",
        "Incredible acting and story",
        "The direction was brilliant",
        "A cinematic gem, totally loved it",
        "Great plot and character development",
        "One of the best movies ever made",
        "A touching and emotional story",
        "Wonderful experience from start to end",
        "This film blew my mind",

        # Negative
        "I regret watching this film",
        "It was a total waste of time",
        "The plot was dull and boring",
        "Worst movie I've seen this year",
        "The acting was terrible",
        "Not worth the time or money",
        "Disappointing from start to finish",
        "I fell asleep halfway through",
        "A disaster of a movie",
        "Painfully slow and meaningless"
    ],
    'sentiment': [1]*10 + [0]*10
}

df = pd.DataFrame(data)

# 🔹 Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# 🔹 Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 🔹 Step 4: Train Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 🔹 Step 5: Evaluate
print("Accuracy:", accuracy_score(y_test, model.predict(X_test)))
print("Classification Report:\n", classification_report(y_test, model.predict(X_test)))

# 🔹 Step 6: Predict New Reviews
sample_text = ["This movie was a masterpiece", "I regret watching this film"]
sample_vec = vectorizer.transform(sample_text)
pred_probs = model.predict_proba(sample_vec)
predictions = (pred_probs[:, 1] > 0.5).astype(int)

# 🔹 Step 7: Show Results
for text, pred, prob in zip(sample_text, predictions, pred_probs[:, 1]):
    label = "Positive" if pred == 1 else "Negative"
    confidence = round(prob * 100, 2)
    print(f"Review: \"{text}\" → Sentiment: {label} ({confidence}%)")


Accuracy: 0.4
Classification Report:
               precision    recall  f1-score   support

           0       0.40      1.00      0.57         2
           1       0.00      0.00      0.00         3

    accuracy                           0.40         5
   macro avg       0.20      0.50      0.29         5
weighted avg       0.16      0.40      0.23         5

Review: "This movie was a masterpiece" → Sentiment: Negative (39.24%)
Review: "I regret watching this film" → Sentiment: Negative (39.5%)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np

# 🔹 Load pre-trained BERT tokenizer & model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# 🔹 Your sample reviews
reviews = [
    "This movie was a masterpiece",         # Expected: Positive
    "I regret watching this film"          # Expected: Negative
]

# 🔹 Tokenize
inputs = tokenizer(reviews, padding=True, truncation=True, return_tensors="tf")

# 🔹 Predict
outputs = model(inputs)
logits = outputs.logits
probs = tf.nn.softmax(logits, axis=-1)

# 🔹 Convert to labels (0 to 4: very negative to very positive)
labels = tf.argmax(probs, axis=1).numpy()

# 🔹 Display Result
sentiment_map = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive"
}

for review, label, prob in zip(reviews, labels, probs):
    print(f"Review: \"{review}\" → Sentiment: {sentiment_map[label]} ({round(100 * np.max(prob), 2)}%)")


ModuleNotFoundError: No module named 'transformers'

In [27]:
pip install transformers

Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.26.0->transformers)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.50.3-py3-none-any.whl (10.2 MB)
   ---------------------------------------- 0.0/10.2 MB ? eta -:--:--
   --------- -----------------