In [14]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Download the NLTK resources this script needs.
# 'punkt_tab' is fetched in addition to 'punkt' because newer NLTK releases
# make word_tokenize load the 'punkt_tab' tables instead of the pickled
# 'punkt' model; without it tokenization raises LookupError on a fresh
# install. Keeping 'punkt' as well preserves support for older releases.
for resource in ('movie_reviews', 'punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Step 1: Load the dataset
# Walk the NLTK movie_reviews corpus category by category and collect one
# (token list, category) pair per review file.
documents = []
for label in movie_reviews.categories():
    for review_id in movie_reviews.fileids(label):
        review_tokens = list(movie_reviews.words(review_id))
        documents.append((review_tokens, label))

# Step 2: Preprocess the data (tokenize, remove stopwords, lowercase, lemmatization)
# Shared helpers used by preprocess(): created once here instead of once per
# document. The stopword list is turned into a set for membership tests.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize one document into a space-separated string of lemmas.

    Args:
        text: Either a raw string or an iterable of token strings (such as
            the word list built from ``movie_reviews.words``).

    Returns:
        str: The lowercased document, re-tokenized with ``word_tokenize``,
        with non-alphabetic tokens and English stopwords dropped and every
        remaining token lemmatized, joined by single spaces.
    """
    # A str is itself an iterable of characters: joining it would insert a
    # space between every letter and destroy the words. Only join genuine
    # token sequences; pass raw strings through untouched.
    raw_text = text if isinstance(text, str) else " ".join(text)

    # Tokenize and lowercase
    tokens = word_tokenize(raw_text.lower())

    # Drop punctuation/numbers and stopwords, then lemmatize what is left.
    # Stopwords are filtered before lemmatization, as in the original flow.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

# Clean every review, keeping its label next to the cleaned text
preprocessed_docs = []
for raw_tokens, label in documents:
    preprocessed_docs.append((preprocess(raw_tokens), label))

# Step 3: Prepare the data for training
# Separate the cleaned texts (X) from their labels (y), preserving order
X, y = [], []
for cleaned_text, label in preprocessed_docs:
    X.append(cleaned_text)
    y.append(label)

# Step 4: Hold out 30% of the reviews for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 5: Bag-of-words counts fed into a multinomial Naive Bayes classifier
model = make_pipeline(
    CountVectorizer(),
    MultinomialNB(),
)

# Step 6: Fit the whole pipeline on the training portion
model.fit(X_train, y_train)

# Step 7: Predict the held-out reviews and report the accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")



[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Shaik\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shaik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shaik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shaik\AppData\Roaming\nltk_data...


Accuracy: 81.83%
