In [11]:
import numpy as np
import pandas as pd
import string
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Load the review dataset; the relative path is resolved against the working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV was saved with its index -- consider index_col=0 if unwanted.
data = pd.read_csv(filepath_or_buffer="Review.csv")

In [13]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,sentiment,review
0,Negative,I had no background knowledge of this movie be...
1,Negative,I am a huge Jane Austen fan and I ordered the ...
2,Negative,Nothing to say but Wow! Has anyone actually ha...
3,Negative,i like Jane Austin novels. I love Pride and Pr...
4,Negative,In this day and age of incredible special movi...


In [14]:
# Encode the label column: 1 for 'Positive', 0 for anything else.
# An already-encoded 1 is also mapped to 1 so that re-running this cell is
# idempotent; comparing only against 'Positive' would turn every label into 0
# on a second execution, because the column then holds ints, not strings.
data["sentiment"] = data["sentiment"].apply(
    lambda label: 1 if label in ('Positive', 1) else 0
)

In [15]:
# Download the NLTK resources this notebook asks for: 'punkt' and 'stopwords'.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
def clean_text(df):
    """Turn every review in ``df`` into a space-joined string of stemmed tokens.

    Each review is lower-cased, stripped of the punctuation characters listed
    in the regex below, tokenized with ``word_tokenize``, stripped of any
    remaining ``string.punctuation`` characters, filtered to purely alphabetic
    tokens, filtered against the English stop-word list (with "not" removed
    from that list), and stemmed with ``PorterStemmer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column whose values are strings.

    Returns
    -------
    list of str
        One cleaned string per row of ``df``, in row order.
    """
    # These helpers do not depend on the review being processed, so they are
    # built once here rather than once per loop iteration.
    punct_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))
    # Drop "not" from the stop-word set so it is retained in the output.
    stop_words.discard("not")
    stemmer = PorterStemmer()

    all_reviews = []
    # Read from the frame passed in as `df`, not from a notebook-level global.
    for text in df["review"].tolist():
        text = text.lower()
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        tokens = word_tokenize(text)
        stripped = (token.translate(punct_table) for token in tokens)
        words = [word for word in stripped if word.isalpha()]
        stems = [stemmer.stem(word) for word in words if word not in stop_words]
        all_reviews.append(' '.join(stems))
    return all_reviews

# Clean every review in `data`, then preview the first ten cleaned strings.
all_reviews = clean_text(df=data)
all_reviews[:10]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['background knowledg movi bought sound cool want see realli kickbutt vike movi awhil ala film not look hope best instead deliv bore nordic soapopera seem drag long despit minut run time film premis intrigu enough vike warlord defi god odin enrag curs warlord son name barek death rebirth berserk barek guy forc live enrag insan violent lifetim lifetim movi film compet enough rich cinematographi quasigood perform actor found bore question dribbl would end filmmak chanc make someth rather entertain semiuniqu drop ball perhap could improv cheap exploit tactic thrown gratuit nuditi lot gore mean talk berserk nt vike suppos bad enough rape pillag nt berserk suppos even extrem unless fan young restless etc fact insan berserk like self tortur probabl steer clear drab piec celluloid',
 'huge jane austen fan order movi amazonuk could see without wait forev come us realli save money ann run wentworth whole point ann elliot charact quiet refin not impuls vulgar mari suffer stroke someth speech nt 

In [16]:
# Labels come straight from the encoded `sentiment` column.
y = data.sentiment

# Bag-of-words counts over the cleaned reviews, converted to a dense array.
# min_df=3 is CountVectorizer's document-frequency cutoff.
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split, so the vocabulary is learned from test documents too -- consider
# fitting on the training split only.
CV = CountVectorizer(min_df=3)
X = CV.fit_transform(all_reviews).toarray()

In [17]:
# Show the shapes of the feature matrix and the label vector.
print(X.shape)
print(np.shape(y))

(10000, 16439)
(10000,)


In [18]:
# Split features and labels with test_size=0.2; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [19]:
# Fit a Gaussian naive Bayes classifier on the training split, then predict
# labels for the held-out split.
# NOTE(review): the features are word counts from CountVectorizer; a
# multinomial naive Bayes model may be a better fit -- worth comparing.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [20]:
# Report accuracy, F1 and precision of the predictions against the held-out labels.
for label, score_fn in (
    ('Accuracy score: ', metrics.accuracy_score),
    ('f1_score: ', metrics.f1_score),
    ('Precision score: ', metrics.precision_score),
):
    print(label, score_fn(y_test, y_pred))

Accuracy score:  0.659
f1_score:  0.6125
Precision score:  0.7082785808147175
