### This is the second part of the web scraping project. In this part we take the scraped reviews and build a pipeline that classifies the sentiment of future reviews as positive, neutral, or negative.

In [1]:
#load the Packages we Need 

import pandas as pd
import numpy as np

In [2]:
# Read the training reviews produced by the first part of the project.

training_df = pd.read_csv(filepath_or_buffer='training.csv')

In [3]:
# Read the validation reviews; they are scored later in the notebook.

valid_df = pd.read_csv(filepath_or_buffer='valid.csv')

In [4]:
# Display the training frame to confirm that the CSV was imported.

training_df 

Unnamed: 0,Name,RatingValue,DatePublished,Review
0,Mariachi’s Restaurant,negative,2018-06-29,If you want to go for excellent customer servi...
1,98 Aroma,negative,2019-04-07,"If I was able to give 0 stars, I would. This i..."
2,Dynasty Chinese Cuisine,negative,2019-02-21,"Came here to celebrate my Friends Birthday, Fo..."
3,Carisma,negative,2019-01-10,Disappointed.\n\nI came here for dinner and th...
4,Carisma,negative,2019-10-20,- Not worth for its price. Both service and qu...
...,...,...,...,...
310,KINKA IZAKAYA ORIGINAL,positive,2020-05-04,My friend got the lunch set which includes a d...
311,KINKA IZAKAYA ORIGINAL,positive,2020-05-04,Absolutely loved this spot when I went with fr...
312,Giulietta,positive,2019-06-12,Good date place. The atmosphere was phenomenal...
313,KINKA IZAKAYA ORIGINAL,positive,2020-05-14,"I would share this, but I'm too shellfish \n.\..."


In [5]:
# Make the 'Name' column the row index of the training set.

training_df = training_df.set_index('Name')

In [6]:
# Make the 'Name' column the row index of the validation set.

valid_df = valid_df.set_index('Name')

In [7]:
# Store the training rating labels as a pandas categorical column.

training_df['RatingValue'] = training_df['RatingValue'].astype('category')

In [8]:
# Store the validation rating labels as a pandas categorical column.

valid_df['RatingValue'] = valid_df['RatingValue'].astype('category')

In [9]:
# Inspect the column dtypes; RatingValue should now be reported as category.

training_df.dtypes

RatingValue      category
DatePublished      object
Review             object
dtype: object

In [10]:
# Sentiment class names, used as display names when reporting metrics.

categories = [
    'negative',
    'neutral',
    'positive',
]

In [11]:
# List the distinct rating categories; exactly three are expected.

training_df.RatingValue.cat.categories

Index(['negative', 'neutral', 'positive'], dtype='object')

In [12]:
# Both columns should hold 315 entries; print each length to verify.

for column_name in ('Review', 'RatingValue'):
    print(len(training_df[column_name]))

315
315


In [13]:
# Print the first review to confirm the text loaded intact.
# The frame is indexed by 'Name' (strings), so use .iloc for positional
# access: Review[0] relies on the integer-key positional fallback that
# pandas has deprecated for non-integer indexes.

print(training_df.Review.iloc[0])

If you want to go for excellent customer service, go for it. Great people, great location. 

However, if you are going for the food. Skip this place and try something else. Food was mediocre, tasteless and expensive. I'm sorry but I wish I could go back in time and save myself $66.


In [14]:
# Bag-of-words step: learn the vocabulary from the training reviews and
# encode every review as a row of token counts.

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=training_df['Review'])
X_train_counts.shape

(315, 4359)

In [15]:
# Look up a common word in the fitted vocabulary. vocabulary_ maps each term
# to its column index in the count matrix, so the value shown is the column
# assigned to 'the', not the number of times the word occurs.

count_vect.vocabulary_['the']

3824

In [16]:
# Re-weight the raw token counts with TF-IDF.

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(315, 4359)

In [17]:
# Fit a TfidfTransformer with default settings. The transformer and the
# matrix built here are the ones later cells use to train the classifier
# and to encode new reviews.

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(315, 4359)

In [18]:
# Train a multinomial naive Bayes classifier on the TF-IDF features.

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(
    X=X_train_tfidf,
    y=training_df['RatingValue'],
)

In [19]:
# Smoke test: classify two hand-written reviews with the fitted
# vectorizer, TF-IDF transformer, and classifier.

docs_new = [
    'Food was so good i loved it so much',
    'food was okay.',
]

X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [20]:
# Show each hand-written review next to the sentiment label the model assigned.

for review_text, label in zip(docs_new, predicted):
    print('%r => %s' % (review_text, label))

'Food was so good i loved it so much' => positive
'food was okay.' => neutral


In [21]:
# Bundle the vectorizer, TF-IDF transformer, and classifier into a single
# scikit-learn Pipeline so raw review text can be fitted and scored in one call.

from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [22]:
# Train every pipeline stage on the training reviews and their labels.

text_clf.fit(training_df['Review'], training_df['RatingValue'])

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [29]:
# Score the fitted pipeline on the validation set and report its accuracy,
# i.e. the fraction of predicted labels that match the true ratings.

docs_test = valid_df['Review']
predicted = text_clf.predict(docs_test)
Acc = np.mean(predicted == valid_df['RatingValue'])

print("The Accuracy of the model is then:", Acc, sep="\n")

The Accuracy of the model is then:
0.562962962962963


In [33]:
# Per-class precision, recall, and F1 score on the validation set.
# labels= pins the row order explicitly: target_names alone is matched
# positionally against the sorted labels found in the data, so a reordered
# or missing class would otherwise be mislabelled or raise.
from sklearn import metrics

print(metrics.classification_report(
    valid_df.RatingValue,
    predicted,
    labels=categories,
    target_names=categories,
))

              precision    recall  f1-score   support

    negative       0.69      0.56      0.62        45
     neutral       0.44      0.71      0.54        45
    positive       0.73      0.42      0.54        45

    accuracy                           0.56       135
   macro avg       0.62      0.56      0.56       135
weighted avg       0.62      0.56      0.56       135



### And we are done. Not bad for a simple model we created.