In [10]:
import pandas as pd 
import nltk
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pickle

In [2]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Python program to change the
# current working directory


# Function to print the current
# working directory
def current_path():
    """Print a fixed header line, the current working directory, and a blank line."""
    print("Current working directory")
    # end="\n\n" terminates the path line and emits the trailing empty line.
    print(os.getcwd(), end="\n\n")


# Driver code
# Show the working directory before the change
current_path()

# Move the working directory up one level.
# NOTE(review): the target is relative to wherever the kernel currently is, so
# re-running this cell moves up another level each time. Later cells open
# files with paths relative to this directory.
os.chdir('../')
current_path()

Current working directory before
/content

Current working directory before
/



In [5]:
# Load the reviews CSV; the path is resolved against the current working directory.
df = pd.read_csv('data/IMDB_Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [14]:
# Tokenizer whose tokens are the runs of ASCII letters/digits matched by the pattern.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

In [13]:
stemmer = LancasterStemmer()


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every analyzed token with the module-level ``stemmer``."""

    def build_analyzer(self):
        """Return a callable that runs the inherited analyzer on a document and stems each token it yields."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stemming is applied to the output of the inherited analyzer.
            return [stemmer.stem(word) for word in base_analyzer(doc)]

        return stemmed_analyzer

In [15]:
# Stemmed bag-of-words vectorizer: unigrams only, English stop words removed,
# tokens produced by `token`, and terms kept only if they occur in at least
# 3 documents.
vectorizer = StemmedCountVectorizer(
    analyzer="word",
    tokenizer=token.tokenize,
    stop_words='english',
    ngram_range=(1, 1),
    min_df=3,
)

In [16]:
# Learn the vocabulary from every review and build the document-term matrix.
# NOTE(review): the vectorizer is fitted on the full dataset, and the
# train/test split is only applied to the resulting matrix afterwards, so the
# vocabulary (and the min_df filter) also sees rows that end up in the test
# set — confirm this is acceptable for the reported accuracy.
vector = vectorizer.fit_transform(df['review'])

In [33]:
# Display the raw review text column.
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [9]:
# Integer label: 0 where sentiment is exactly 'negative', 1 for every other value.
df['sentiment_bin'] = (df['sentiment'] != 'negative').astype('int64')

In [17]:
# Display the frame, including the sentiment_bin label column.
df

Unnamed: 0,review,sentiment,sentiment_bin
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [20]:
# Hold out 25% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    vector,
    df['sentiment_bin'],
    test_size=0.25,
    random_state=5,
)

# Testing Multinomial Naive Bayes

In [21]:
# Training the model: fit a MultinomialNB classifier (constructor defaults)
# on the training split.

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

MultinomialNB()

In [22]:
# Calculating the accuracy score of the model on the held-out test split

predicted = MNB.predict(X_test)
# metrics.accuracy_score is documented as (y_true, y_pred): ground truth first.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracuy Score: ",accuracy_score)

Accuracuy Score:  0.84896


# Testing pickling to check if it works

## Pickling vectorizer

In [25]:
# Serialize the fitted vectorizer to a file in the current working directory.
with open('vectorizer_pickled.pickle', 'wb') as pickle_file:
    pickle.dump(vectorizer, pickle_file)

In [27]:
# Read the vectorizer back to check that it survives the round trip.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('vectorizer_pickled.pickle', 'rb') as pickle_file:
  loaded_vectorizer = pickle.load(pickle_file)

In [41]:
# Wrap one sample phrase in a single-element Series to feed to the vectorizer.
d = {0:'I love this'}
ser = pd.Series(data=d)

In [42]:
# Vectorize the sample phrase with the reloaded vectorizer.
# NOTE(review): this rebinds `vector`, which earlier held the document-term
# matrix for the whole dataset; cells that expect the full matrix must not be
# re-run after this one without re-creating it.
vector = loaded_vectorizer.transform(ser)

In [43]:
# Predict the sample's label with the in-memory model.
MNB.predict(vector)

array([1])

## Pickling model

In [44]:
# Serialize the trained classifier to a file in the current working directory.
with open('model_pickled.pickle', 'wb') as pickle_file:
    pickle.dump(MNB, pickle_file)

In [45]:
# Read the classifier back to check that it survives the round trip.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('model_pickled.pickle', 'rb') as pickle_file:
  loaded_MNB = pickle.load(pickle_file)

In [46]:
# Predict the same sample with the reloaded model.
loaded_MNB.predict(vector)

array([1])

# Writing the code for the application

In [47]:
def predict(phrase, model_filepath='model_pickled.pickle', vectorizer_filepath = 'vectorizer_pickled.pickle' ):
    """Classify a single phrase as ``'positive'`` or ``'negative'``.

    The vectorizer and the model are unpickled from disk on every call, the
    phrase is transformed into a one-row feature matrix, and the model's
    prediction for that row is mapped to a label.

    Args:
        phrase: The text to classify. Must be a ``str``.
        model_filepath: Path to the pickled classifier (needs ``predict``).
        vectorizer_filepath: Path to the pickled vectorizer (needs ``transform``).

    Returns:
        ``'negative'`` when the predicted class equals 0, otherwise ``'positive'``.

    Raises:
        TypeError: If ``phrase`` is not a string.
        OSError: If either pickle file cannot be opened.

    Note:
        Unpickling needs the pickled objects' classes to be importable in the
        calling process. For a vectorizer built from a class defined in a
        notebook, that class (and any globals it uses) must be defined before
        calling this function.
    """
    # Fail early with a clear message instead of an obscure error raised from
    # inside the vectorizer's analyzer.
    if not isinstance(phrase, str):
        raise TypeError(
            "phrase must be a str, got {}".format(type(phrase).__name__)
        )

    # SECURITY: pickle.load can execute arbitrary code. Only point these paths
    # at files produced by a trusted training run.
    with open(vectorizer_filepath, 'rb') as pickle_file:
        loaded_vectorizer = pickle.load(pickle_file)
    with open(model_filepath, 'rb') as pickle_file:
        loaded_model = pickle.load(pickle_file)

    # transform() takes an iterable of documents; a one-element list is enough.
    features = loaded_vectorizer.transform([phrase])
    prediction = loaded_model.predict(features)

    return 'negative' if prediction[0] == 0 else 'positive'