In this Sentiment Analysis project we will take dataset of movie reviews and use it to train the Model. We use the trained model to predict the sentiment of the manual input as positive or negative.

In [1]:
# !pip install -U scikit-learn

In [18]:
from datasets import load_dataset
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")
df_train = pd.DataFrame(dataset["train"])
df_test = pd.DataFrame(dataset["test"])

def preprocessing(text):
    #Convert All into Lower Case
    text = text.lower()
    
    #Removing Punctuation
    text = text.replace("[^a-zA-Z#]", " ")
    
    #Removing Numbers
    text = text.replace('\d+', '')

    # Tokenize the new text using NLTK
    new_words = word_tokenize(text)
     
    # Remove stopwords using NLTK
    new_filtered_words = [
        word for word in new_words if word.lower() not in stopwords.words('english')]
     
    # Join the filtered words to form a clean text
    new_clean_text = ' '.join(new_filtered_words)
    return new_clean_text

df_train["processed_text"] = df_train["text"].apply(preprocessing)
df_test["processed_text"] = df_test["text"].apply(preprocessing)
df_train["processed_text"].values

array(["rock destined 21st century 's new `` conan `` 's going make splash even greater arnold schwarzenegger , jean-claud van damme steven segal .",
       "gorgeously elaborate continuation `` lord rings `` trilogy huge column words adequately describe co-writer/director peter jackson 's expanded vision j . r . r . tolkien 's middle-earth .",
       'effective too-tepid biopic', ...,
       "hardly nuanced portrait young woman 's breakdown , film nevertheless works scares .",
       'interminably bleak , say nothing boring .',
       'things really get weird , though particularly scary : movie portent content .'],
      dtype=object)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
# tfidfV = TfidfVectorizer(max_features = 1000, stop_words = 'english')
# tfidfV = TfidfVectorizer(max_df = 0.9, min_df = 2, max_features = 800, stop_words = 'english')
tfidfV = TfidfVectorizer(stop_words = 'english')
X_train = tfidfV.fit_transform(df_train["processed_text"].values)
y_train = df_train["label"]
X_test = tfidfV.transform(df_test["processed_text"].values)
y_test = df_test["label"]

print(X_train.shape)
print(y_train.shape)

(8530, 16173)
(8530,)


In [20]:
X_train

<8530x16173 sparse matrix of type '<class 'numpy.float64'>'
	with 81347 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [22]:
prid_logreg = logreg.predict(X_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, prid_logreg) * 100))

Accuracy: 77.02%


In [23]:
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.metrics import accuracy_score
model = DT()
model.fit(X_train,y_train)

In [24]:
prid_dt = model.predict(X_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, prid_dt) * 100))

Accuracy: 62.85%


In [25]:
df_test["text"].values[0]

'lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .'

In [26]:
prid_dt

array([1, 0, 1, ..., 0, 0, 0])

In [27]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)

In [28]:
prid = model.predict(X_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, prid) * 100))

Accuracy: 62.85%


In [34]:
# Naive Bayes with TfidfTransformer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer

counter_train = Counter(df_train['label'].tolist())
counter_test = Counter(df_test['label'].tolist())

train_list = df_train['text'].tolist()
test_list = df_test['text'].tolist()

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_list)
x_test_counts = count_vect.transform(test_list)
print("Count Vector Shape: {}".format(x_train_counts.shape))
tfidf_transformer = TfidfTransformer()
train_x = tfidf_transformer.fit_transform(x_train_counts)
test_x = tfidf_transformer.transform(x_test_counts)

train_y = df_train["label"]
test_y = df_test["label"]

clf = MultinomialNB().fit(train_x, train_y)
y_score = clf.predict(test_x)

print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_score) * 100))


Count Vector Shape: (8530, 16474)
Accuracy: 79.74%


In [35]:
data = [
"Best Movie Ever 10/10", 
"Nice Movie",
"Deadpool & Wolverine is a fast, snappy, cheeky superhero self-parody that takes the mickey out of a franchise that has made billions.",
"Some of the jokes land through sheer volume and force of will, but it feels like you have to endure 40-or-so rectal stabbings for a single juicy zinger.",
"The worst Wolverine or Marvel movie ever.", 
"Terrible story and just completely ridiculous.", 
"I usually love Ryan Reynolds’s but this was his worst movie ever"]
# Expected = [1, 1 , 1, 0, 0, 0, 0]

# Create the pandas DataFrame
test = pd.DataFrame(data, columns=['text'])
test["text"] = test["text"].apply(preprocessing)
# df_test["text"].values
test_vector = tfidfV.transform(test["text"].values)
y_score = model.predict(test_vector)
print("DecisionTreeClassifier")
y_score

DecisionTreeClassifier


array([1, 1, 1, 0, 0, 1, 1])

#### Expected = [1, 1, 1, 0, 0, 0, 0]

In [36]:
y_score = logreg.predict(test_vector)
print("LogisticRegression")
y_score

LogisticRegression


array([1, 0, 0, 0, 0, 0, 0])

#### Expected = [1, 1, 1, 0, 0, 0, 0]

In [37]:
test_count_ = count_vect.transform(test['text'].tolist())
test_TfidfTransformer = tfidf_transformer.transform(test_count_)
y_score = clf.predict(test_TfidfTransformer)
print("Naive Bayes with TfidfTransformer")
y_score

Naive Bayes with TfidfTransformer


array([1, 1, 0, 0, 0, 0, 0])

#### Expected = [1, 1, 1, 0, 0, 0, 0]