In [4]:
! pip install datasets



In [5]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from datasets import load_dataset

In [6]:
datasets = load_dataset('imdb')

In [7]:
# Take subset for faster training
train_data = datasets['train']
test_data = datasets['test']

In [8]:
# Combine everything into one dataset
all_data = [] 

for item in train_data: 
    all_data.append({'text' : item['text'], 'label' : item['label']})

for item in test_data: 
    all_data.append({'text' : item['text'], 'label' : item['label']})

# Convert to DataFrame (SIMPLE!)
df = pd.DataFrame(all_data)

In [9]:
 # preview first rows
print('First 5 rows')
print(df.head())
print("\n" + "="*80)
# checking null Values
print("\nchecking null values")
print(df.isnull().sum())

First 5 rows
                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0


checking null values
text     0
label    0
dtype: int64


In [10]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from bs4 import BeautifulSoup

# Load stopwords once
stop_words = set(stopwords.words('english'))

# 1. Remove HTML
df['text'] = df['text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# 2. Remove URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+|www\.\S+', '', x))

# 3. Lowercase
df['text'] = df['text'].str.lower()

# 4. Remove special characters (keep letters, numbers, spaces)
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-z0-9\s]', '', x))

# 5. Remove stopwords
df['text'] = df['text'].apply(lambda x: " ".join([y for y in x.split() if y not in stop_words]))

# 6. Remove extra whitespace
df['text'] = df['text'].apply(lambda x: " ".join(x.split()))


[nltk_data] Downloading package stopwords to /Users/udit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# lemmatization
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
lem = WordNetLemmatizer() 
df['text'] = df['text'].apply(lambda x : ' '.join([lem.lemmatize(word) for word in x.split()]))

[nltk_data] Downloading package wordnet to /Users/udit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Tokenize all the Sentences
df['text'] = df['text'].apply(lambda x : x.split())
sentences = df['text'].tolist()

# Train Word2Vec on all tokenized reviews
from gensim.models import Word2Vec
w2v_model = Word2Vec(
    sentences= sentences,
    vector_size=100,
    window = 5,
    min_count=2,
    sg=1,
    epochs=50,
    workers=4
)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

In [13]:
# avg Word2Vec 
def avg_word2vec(words, model): 
    valid = [w for w in words if w in w2v_model.wv.index_to_key]

    if len(valid) == 0:
        return np.zeros(w2v_model.vector_size)
    
    return np.mean([model.wv[w] for w in valid], axis = 0)

In [14]:
# Independent Features 
X = np.array([avg_word2vec(words ,w2v_model) for words in sentences])
# Output Feature
y = df['label']

In [15]:
# Train Test Split the dataset
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.25, random_state=42)

In [16]:
# Train The Model
from sklearn.linear_model import LogisticRegression 
model = LogisticRegression()
model.fit(X_train,y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [17]:
# Evaluating the Model
from sklearn.metrics import accuracy_score,classification_report 
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n{'='*80}")
print(f"RESULTS")
print(f"{'='*80}")
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"\nDetailed Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))


RESULTS
Accuracy: 86.14%

Detailed Report:
              precision    recall  f1-score   support

    Negative       0.86      0.86      0.86      6290
    Positive       0.86      0.86      0.86      6210

    accuracy                           0.86     12500
   macro avg       0.86      0.86      0.86     12500
weighted avg       0.86      0.86      0.86     12500



In [18]:
# TEST ON NEW REVIEWS
print(f"\n{'='*80}")
print("TESTING ON NEW REVIEWS")
print(f"{'='*80}")

def predict_sentiment(review_text):
    """Predict sentiment of a new review"""
    # Tokenize
    tokens = review_text.split()
    
    # Convert to vector
    vector = avg_word2vec(tokens, w2v_model)
    
    # Predict
    prediction = model.predict([vector])[0]
    probability = model.predict_proba([vector])[0]
    
    sentiment = "POSITIVE ðŸ˜Š" if prediction == 1 else "NEGATIVE ðŸ˜ž"
    confidence = probability[prediction] * 100
    
    return sentiment, confidence

# Test examples
test_reviews = [
    "This movie was absolutely amazing! Best film I've ever seen!",
    "Terrible movie. Complete waste of time and money.",
    "It was okay, nothing special but not bad either."
]

for review in test_reviews:
    sentiment, confidence = predict_sentiment(review)
    print(f"\nReview: {review}")
    print(f"Prediction: {sentiment} (Confidence: {confidence:.1f}%)")

print(f"\n{'='*80}")
print("PROJECT COMPLETE! ðŸŽ‰")
print(f"{'='*80}")



TESTING ON NEW REVIEWS

Review: This movie was absolutely amazing! Best film I've ever seen!
Prediction: POSITIVE ðŸ˜Š (Confidence: 58.9%)

Review: Terrible movie. Complete waste of time and money.
Prediction: NEGATIVE ðŸ˜ž (Confidence: 100.0%)

Review: It was okay, nothing special but not bad either.
Prediction: NEGATIVE ðŸ˜ž (Confidence: 100.0%)

PROJECT COMPLETE! ðŸŽ‰


In [19]:
# Save Word2Vec Model 
w2v_model.save('word2vec_sentiment.model')

In [20]:
# Save the Model
import pickle
with open('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)
