In [1]:
# Step 1: Import Libraries and Load the Dataset
import nltk
from nltk.corpus import movie_reviews
import random
# Fetch the movie_reviews corpus (reported as up-to-date when already present).
nltk.download("movie_reviews")
# Build one (tokens, label) pair per review file; the label is the corpus
# category the file is listed under.
documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]
# Shuffle with a dedicated fixed-seed generator so the document order is the
# same on every Restart & Run All, without altering the global `random` state.
random.Random(42).shuffle(documents)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [2]:
# Step 2: Preprocessing the Data
import nltk
# word_tokenize relies on the Punkt sentence-tokenizer models. Recent NLTK
# releases look them up under "punkt_tab", older ones under "punkt", so fetch
# both to keep the notebook runnable on either. A failed lookup only returns
# False; it does not raise.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the corpora this cell reads (reported as up-to-date when already present).
nltk.download('stopwords')
nltk.download("movie_reviews")
# Build one (tokens, label) pair per review file; the label is the corpus
# category the file is listed under.
documents = [(list(movie_reviews.words(fileid)), category)
  for category in movie_reviews.categories()
  for fileid in movie_reviews.fileids(category)]
# Shuffle with a dedicated fixed-seed generator: the train/test split below is
# a positional slice of this list, so an unseeded shuffle would change the
# split (and the reported metrics) on every run. A private Random instance
# leaves the global `random` state untouched.
random.Random(42).shuffle(documents)
# Stemmer and English stop-word set consumed by preprocess_text.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
  """Normalize raw text into a space-separated string of stemmed tokens.

  The text is split with ``word_tokenize``. Tokens that are not purely
  alphabetic are discarded, the rest are lower-cased, members of the
  module-level ``stop_words`` set are removed, and each remaining token is
  stemmed with the module-level ``stemmer``.

  Args:
    text: The string to clean.

  Returns:
    The resulting stems joined by single spaces (an empty string when no
    token survives the filters).
  """
  stems = []
  for token in word_tokenize(text):
    # Drop punctuation, numbers and mixed tokens.
    if not token.isalpha():
      continue
    lowered = token.lower()
    # Stop words are matched against the lower-cased form.
    if lowered in stop_words:
      continue
    stems.append(stemmer.stem(lowered))
  return " ".join(stems)
# Clean every review: re-join its tokens into a single string, pass it through
# preprocess_text, and keep the label next to the cleaned text.
preprocessed_documents = [
  (preprocess_text(" ".join(tokens)), label)
  for tokens, label in documents
]
# Positional hold-out split: the leading 80% of the list is used for training,
# the remaining tail for testing.
split_ratio = 0.8
split_index = int(split_ratio * len(preprocessed_documents))
train_data, test_data = (
  preprocessed_documents[:split_index],
  preprocessed_documents[split_index:],
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [12]:
# Step 3: Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer capped at 2000 features (max_features).
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
# Fit the vectorizer on the training texts only and encode them in one step;
# the labels are the second element of each (text, label) pair.
X_train = tfidf_vectorizer.fit_transform([pair[0] for pair in train_data])
y_train = [pair[1] for pair in train_data]
# Encode the held-out texts with the already-fitted vectorizer (no refit).
X_test = tfidf_vectorizer.transform([pair[0] for pair in test_data])
y_test = [pair[1] for pair in test_data]

In [13]:
# Step 4: Building and Training a Classifier
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes model, constructed with its default settings.
classifier = MultinomialNB()
# Fit it on the training feature matrix and the matching labels.
classifier.fit(X=X_train, y=y_train)

In [15]:
# Step 5: Evaluating the Model
from sklearn.metrics import classification_report,accuracy_score
# Predict a label for every held-out document.
y_pred = classifier.predict(X_test)
# Accuracy on the test set, printed to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Per-class precision / recall / F1 / support table.
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.81
              precision    recall  f1-score   support

         neg       0.80      0.83      0.81       195
         pos       0.83      0.80      0.82       205

    accuracy                           0.81       400
   macro avg       0.82      0.82      0.81       400
weighted avg       0.82      0.81      0.82       400

