In [3]:
import nltk
from nltk.corpus import movie_reviews
import random

# Load documents
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle dataset
random.shuffle(documents)

print("Total documents:", len(documents))
print("Categories:", movie_reviews.categories())





Total documents: 2000
Categories: ['neg', 'pos']


In [2]:
import nltk
nltk.download('movie_reviews')



[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Admn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [4]:
# Convert documents into text format
texts = [" ".join(words) for words, category in documents]
labels = [category for words, category in documents]

print("Sample review:\n")
print(texts[0][:500])  # Print first 500 characters
print("\nLabel:", labels[0])



Sample review:

recently one night a young director named baz luhrmann couldn ' t sleep . he tumbled out of bed and moved over to the television where he watched mtv for an hour . then he moved to his kitchen where he spent the same amount of time eating spoiled food . then he took down a volume of shakespeare ' s work and read it cover to cover - never really paying attention to the words or plot . and then , as a climax , he took out his video camera and pressed the " on " button . the result ? william shakes

Label: neg


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Create vectorizer
vectorizer = CountVectorizer(max_features=3000)

# Convert text into feature matrix
X = vectorizer.fit_transform(texts)

print("Feature matrix shape:", X.shape)


Feature matrix shape: (2000, 3000)


In [6]:
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])



Training samples: 1600
Testing samples: 400


In [7]:
from sklearn.linear_model import LogisticRegression

# Create model
model = LogisticRegression(max_iter=1000)

# Train model
model.fit(X_train, y_train)

print("Model training complete!")


Model training complete!


In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Detailed report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.8275

Classification Report:

              precision    recall  f1-score   support

         neg       0.85      0.79      0.82       199
         pos       0.81      0.86      0.83       201

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400



In [9]:
def predict_review(review_text):
    review_vector = vectorizer.transform([review_text])
    prediction = model.predict(review_vector)
    return prediction[0]

# Try your own reviews
print(predict_review("This movie was absolutely fantastic and inspiring!"))
print(predict_review("The film was boring and a complete waste of time."))



neg
neg


In [10]:
# Check if certain words are in vocabulary
print("fantastic" in vectorizer.vocabulary_)
print("inspiring" in vectorizer.vocabulary_)
print("boring" in vectorizer.vocabulary_)
print("waste" in vectorizer.vocabulary_)


True
False
True
True


In [11]:
print(predict_review("I absolutely loved this movie. It was amazing and wonderful."))
print(predict_review("This was the worst movie I have ever seen. Terrible acting and awful story."))



neg
neg


In [12]:
import collections
print(collections.Counter(labels))



Counter({'neg': 1000, 'pos': 1000})


In [13]:
from collections import Counter
print(Counter(y_pred))


Counter({np.str_('pos'): 214, np.str_('neg'): 186})


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=3000)

# Transform text
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Split data
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42
)

# Train model
model_tfidf = LogisticRegression(max_iter=1000)
model_tfidf.fit(X_train_t, y_train_t)

# Evaluate
y_pred_t = model_tfidf.predict(X_test_t)
print("TF-IDF Accuracy:", accuracy_score(y_test_t, y_pred_t))


TF-IDF Accuracy: 0.8075


In [15]:
Ctrl + S


NameError: name 'Ctrl' is not defined