#### Here we'll build a model to analyse and classify comments using Naive Bayes. We'll train the model using the "reviews.txt" dataset.

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score, accuracy_score
# import pickle

In [9]:
# Load the tab-separated reviews file. Column names are supplied here, so the
# first field is stored as "rating" and the second as "comment".
# NOTE(review): read_csv's default quoting treats '"' as a quote character, so
# lines with unbalanced quotes could be merged into one row -- confirm that the
# row count matches the number of lines in the file.
reviews_dataset = pd.read_csv(
    "reviews.txt",
    sep="\t",
    names=["rating", "comment"],
)

In [10]:
# Display the loaded frame to sanity-check the parse (pandas abbreviates long frames with "...").
reviews_dataset

Unnamed: 0,rating,comment
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [11]:
# strip_accents="ascii" normalizes accented characters (e.g. á, é, í, ó, ú, ü, ñ)
# during preprocessing. It is the fast method, but it only works on characters
# that have a direct ASCII mapping, which is adequate for the English text
# considered here.
# stop_words="english" removes common words such as "a", "an", "for".
vectorizer = TfidfVectorizer(
    stop_words="english",
    strip_accents="ascii",
)

In [12]:
X = vectorizer.fit_transform(reviews_dataset.comment)
y = reviews_dataset.rating
# dumping Tfidvectorizer so that it can be later used to transform movie comments
# pickle.dump(vectorizer, open("text_transform.pkl", "wb"))

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [14]:
# Train a multinomial Naive Bayes classifier on the training features.
# fit() returns the estimator itself, so construction and training can be chained.
mnb = naive_bayes.MultinomialNB().fit(X_train, y_train)
mnb

MultinomialNB()

In [15]:
# Fraction of held-out comments whose predicted label matches the true rating.
test_predictions = mnb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_predictions)

0.9747109826589595

In [16]:
# pickle.dump(mnb,open("comment_classifier.pkl", "wb"))