In [34]:
import nltk 
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, movie_reviews
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [29]:
def bcustom_stopwords():
    words=[]
    for fileid in movie_reviews.fileids():
        words.extend(movie_reviews.words(fileid))
    freq_dist = nltk.FreqDist(w.lower() for w in words if w.isalpha())
    most_common = [word for word, freq in freq_dist.most_common(100)]
    base_stopwords = set(stopwords.words('english'))
    custom_stopwords = base_stopwords.union(set(most_common))
    return custom_stopwords

custom_stopwords = bcustom_stopwords()
print(custom_stopwords)

def get_movie_reviews_data():
    docs = []
    labels = []
    for fileid in movie_reviews.fileids():
        label = movie_reviews.categories(fileid)[0]
        text = ' '.join(movie_reviews.words(fileid))
        docs.append(text)
        labels.append(1 if label == 'pos' else 0)
    return pd.DataFrame({'text': docs, 'label': labels})

movie_df = get_movie_reviews_data()
print(movie_df)

{"we've", 'until', 'ain', 'been', 'would', 'again', 'over', 'are', "haven't", 'well', 'couldn', 'an', 'll', 'its', 'having', "he's", 'were', 'under', 'what', "he'd", 'or', 'own', "that'll", 'hers', "it's", 'against', 'further', "isn't", 'because', 'but', "i've", 'weren', "weren't", "mightn't", "wasn't", 'd', 'yourself', 'after', "couldn't", 'does', 'on', 'in', 'story', 'where', 'itself', 'any', 'ours', 'than', 'to', 'down', 'get', 'has', 'make', 'even', 'aren', "hadn't", 'if', 'it', 'through', "didn't", 'don', 'y', 'as', 'wouldn', 'one', "she'd", 'some', 'haven', 'characters', "aren't", 'shan', 'and', 'did', "i'm", 'up', 'few', 'he', 'off', "we'll", 'with', 'hadn', 'my', "i'll", "we'd", 'little', 'you', 'more', 'whom', 'below', 'the', "they'd", 'me', 'really', 'once', "needn't", 'm', 'this', 'their', 'yourselves', 'before', 'character', 'film', 'themselves', 'these', 'now', 'such', 'during', 'two', "mustn't", 'which', 'about', 'him', 'am', 'shouldn', "hasn't", 'just', 'had', 'her', 're

In [30]:
legal_docs = ['The court has decided that the person guilty will not be spared at all and his sentence will atleast be 50 years']

def preprocess(text, stopword_list):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in stopword_list]
    return ' '.join(tokens)

movie_df['clean_text'] = movie_df['text'].apply(lambda x: preprocess(x, custom_stopwords))

# 4. Print cleaned documents (first 5 for brevity)
print("\n--- Cleaned Movie Review Texts (first 5) ---")
print(movie_df[['clean_text', 'label']].head())





--- Cleaned Movie Review Texts (first 5) ---
                                          clean_text  label
0  teen couples go church party drink drive accid...      0
1  happy bastard quick review damn bug got head s...      0
2  movies jaded viewer thankful invention timex i...      0
3  quest camelot warner bros feature length fully...      0
4  synopsis mentally unstable man undergoing psyc...      0


In [31]:
def analyze_sentiment_accuracy(dataframe, use_stopwords=True):
    if use_stopwords:
        processed_texts = dataframe['text'].apply(lambda x: preprocess(x, custom_stopwords))
    else:
        processed_texts = dataframe['text'].apply(lambda x: preprocess(x, set()))
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(processed_texts)
    y = dataframe['label']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    return acc

In [33]:
acc_with_stopwords = analyze_sentiment_accuracy(movie_df, use_stopwords=True)
acc_without_stopwords = analyze_sentiment_accuracy(movie_df, use_stopwords=False)
print("Accuracy with stopword removal: {:.2f}%".format(acc_with_stopwords * 100))
print("Accuracy without stopword removal: {:.2f}%".format(acc_without_stopwords * 100))

Accuracy with stopword removal: 81.33%
Accuracy without stopword removal: 80.17%
