# Text Mining

### Multinomial naïve Bayes for Fake Review Detection

In [158]:
import numpy as np
import pandas as p
file=p.read_csv("deception_data_converted_final.tsv", delimiter='\t')
file

Unnamed: 0,lie,sentiment,review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very..."
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,"'After I went shopping with some of my friend,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...
...,...,...,...
87,t,p,'Pastablities is a locally owned restaurant in...
88,t,p,'I like the Pizza at Dominoes for their specia...
89,t,p,'It was a really amazing Japanese restaurant. ...
90,t,p,'How do I even pick a best experience at Joe\'...


In [159]:
X = file['review'].values
y_sentiment = file['sentiment'].values
y_authenticity = file['lie'].values

## Train and Test split for Sentiment anlaysis

In [160]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train_st, y_test_st = train_test_split(X, y_sentiment, test_size=0.3, random_state=0)

print(X_train.shape, y_train_st.shape, X_test.shape, y_test_st.shape)
print(X_train[0])
print(y_train_st[0])
print(X_test[0])
print(y_test_st[0])

(64,) (64,) (28,) (28,)
'The restaurant looked pretty good, the people around me all ate and talked happily. The environment was comfortable and it calmed me down from the busy life. The food tasted fantastic and the price was suitable'
p
'After I went shopping with some of my friend, we went to DODO restaurant for dinner. I found worm in one of the dishes .'
n


In [161]:
# Check how many training examples in each category
# this is important to see whether the data set is balanced or skewed

unique, counts = np.unique(y_train_st, return_counts=True)
print(np.asarray((unique, counts)))

[['n' 'p']
 [31 33]]


In [162]:
#Checking on the test data
unique, counts = np.unique(y_test_st, return_counts=True)
print(np.asarray((unique, counts)))

[['n' 'p']
 [15 13]]


## Vectorization

In [163]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [164]:
#  unigram term frequency vectorizer
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', min_df=2, binary=False, stop_words = 'english')
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=2, stop_words = 'english')


In [165]:
X_trainst_vec1 = unigram_count_vectorizer.fit_transform(X_train)
X_trainst_vec2 = unigram_tfidf_vectorizer.fit_transform(X_train)

# check the content of a document vector for unigram vectorizer
print(X_trainst_vec1.shape)
print(X_trainst_vec1[0].toarray())
 
# check the content of a document vector for tfidf vectorizer
print(X_trainst_vec2.shape)
print(X_trainst_vec2[0].toarray())

(64, 310)
[[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
(64, 310)
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.30631198 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.   

In [166]:
# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer.vocabulary_.items())[:10])

# check the size of the constructed vocabulary
print(len(unigram_tfidf_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_tfidf_vectorizer.vocabulary_.items())[:10])

310
[('restaurant', 219), ('looked', 156), ('pretty', 202), ('good', 112), ('people', 193), ('ate', 12), ('talked', 259), ('environment', 80), ('comfortable', 50), ('life', 147)]
310
[('restaurant', 219), ('looked', 156), ('pretty', 202), ('good', 112), ('people', 193), ('ate', 12), ('talked', 259), ('environment', 80), ('comfortable', 50), ('life', 147)]


In [167]:
X_testst_vec1 = unigram_count_vectorizer.transform(X_test)
# print out #examples and #features in the test set
print(X_testst_vec1.shape)

X_testst_vec2 = unigram_tfidf_vectorizer.transform(X_test)
# print out #examples and #features in the test set
print(X_testst_vec2.shape)

(28, 310)
(28, 310)


### Training a MNB Classsifier using Count Vectorizer on analysing Sentiment 

In [168]:
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import BernoulliNB

# initialize the MNB model
nb_clf1= MultinomialNB()
nb_clf2= MultinomialNB()

# use the training data to train the MNB model
# feature_log_prob_ stores the conditional probs for all categories
# if the labels are strings, the index is in alphabetic order
# e.g. 'f' comes before 't' in alphabet, so 'f' is in [0] dimension and 't' in [1]

nb_clf1.fit(X_trainst_vec1,y_train_st)
print(nb_clf1.classes_)
print(nb_clf1.feature_log_prob_.shape)

nb_clf2.fit(X_trainst_vec2,y_train_st)
print(nb_clf2.classes_)
print(nb_clf2.feature_log_prob_.shape)
#print(nb_clf.classes_[1])

['n' 'p']
(2, 310)
['n' 'p']
(2, 310)


In [169]:
#sort the conditional probability for category 0 "negative"
print("negative Feature ranks using Count Vectorizer")
feature_ranks = sorted(zip(nb_clf1.feature_log_prob_[0], unigram_count_vectorizer.get_feature_names()))
negative_features = feature_ranks[-10:]
print(negative_features)

print("Negative Feature ranks using tfidf Vectorizer")
feature_ranks = sorted(zip(nb_clf2.feature_log_prob_[0], unigram_tfidf_vectorizer.get_feature_names()))
negative_features = feature_ranks[-10:]
print(negative_features)

negative Feature ranks using Count Vectorizer
[(-4.685828089005546, 'asked'), (-4.590517909201221, 'good'), (-4.590517909201221, 'salad'), (-4.503506532211592, 'experience'), (-4.4234638245380555, 'minutes'), (-4.4234638245380555, 'service'), (-4.280362980897381, 'went'), (-4.155199837943376, 'place'), (-3.5872158003374364, 'food'), (-3.5226772791998653, 'restaurant')]
Negative Feature ranks using tfidf Vectorizer
[(-5.227846168107487, 'asked'), (-5.222577154051034, 'terrible'), (-5.2103084299962354, 'salad'), (-5.187166247253829, 'dishes'), (-5.143151384891839, 'bad'), (-5.133996987257023, 'minutes'), (-5.1030489617666035, 'went'), (-4.938763368455484, 'place'), (-4.8430324276305585, 'food'), (-4.734849461080151, 'restaurant')]


In [170]:
#sort the conditional probability for category 1 "Postivie"
print("positive Feature ranks using Count Vectorizer")
feature_ranks = sorted(zip(nb_clf1.feature_log_prob_[1], unigram_count_vectorizer.get_feature_names()))
positive_features = feature_ranks[-10:]
print(positive_features)

print("Positive Feature ranks using tfidf Vectorizer")
feature_ranks = sorted(zip(nb_clf2.feature_log_prob_[1], unigram_tfidf_vectorizer.get_feature_names()))
positive_features = feature_ranks[-10:]
print(positive_features)

positive Feature ranks using Count Vectorizer
[(-4.558078578454241, 'fresh'), (-4.558078578454241, 'place'), (-4.558078578454241, 'really'), (-4.558078578454241, 'service'), (-4.462768398649916, 'great'), (-4.375757021660286, 'amazing'), (-4.152613470346077, 'good'), (-3.7251694555191373, 'best'), (-3.682609841100341, 'restaurant'), (-3.564826805443958, 'food')]
Positive Feature ranks using tfidf Vectorizer
[(-5.219359097661734, 'nice'), (-5.181737794729136, 'really'), (-5.177689513221689, 'friendly'), (-5.141203834989176, 'fresh'), (-5.0652075388691316, 'amazing'), (-4.991196381116305, 'good'), (-4.9816342151921384, 'great'), (-4.842458587375431, 'restaurant'), (-4.7986462614697345, 'food'), (-4.741718621185967, 'best')]


In [171]:
# feature analysis for sentiment using Unigram count vectorizer
log_ratios = []
features = unigram_count_vectorizer.get_feature_names()
print(nb_clf1.classes_)
print(nb_clf1.feature_log_prob_.shape)
log_ratios = nb_clf1.feature_log_prob_[1] - nb_clf1.feature_log_prob_[0]

exercise_C_ranks = sorted(zip(log_ratios, features))
print(exercise_C_ranks[:10])
print(exercise_C_ranks[-10:])

['n' 'p']
(2, 310)
[(-2.174835582442741, 'asked'), (-2.069475066784915, 'terrible'), (-2.069475066784915, 'took'), (-1.9516920311285313, 'said'), (-1.8181606385040086, 'came'), (-1.8181606385040086, 'come'), (-1.8181606385040086, 'indian'), (-1.6640099586767496, 'worst'), (-1.4816884018827956, 'bread'), (-1.4816884018827956, 'calling')]
[(1.6318269073275786, 'friendly'), (1.7371874229854054, 'atmosphere'), (1.7371874229854054, 'flavors'), (1.7371874229854054, 'fresh'), (1.7371874229854054, 'ice'), (1.7371874229854054, 'ingredients'), (1.7371874229854054, 'noodle'), (1.7371874229854054, 'special'), (1.8769493653605638, 'best'), (2.6126561603393057, 'amazing')]


In [172]:
# feature analysis for sentiment using tfidf count vectorizer
log_ratios = []
features = unigram_tfidf_vectorizer.get_feature_names()
print(nb_clf2.classes_)
print(nb_clf2.feature_log_prob_.shape)
vneg_cond_prob = nb_clf2.feature_log_prob_[1]
vpos_cond_prob = nb_clf2.feature_log_prob_[0]

log_ratios = nb_clf2.feature_log_prob_[1] - nb_clf2.feature_log_prob_[0]

exercise_C_ranks = sorted(zip(log_ratios, features))
print(exercise_C_ranks[:10])
print(exercise_C_ranks[-10:])


['n' 'p']
(2, 310)
[(-0.8392835260607994, 'terrible'), (-0.8340145120043472, 'asked'), (-0.8185685631223674, 'bad'), (-0.7775223169269054, 'took'), (-0.7449907213541067, 'said'), (-0.7212540980234792, 'indian'), (-0.71116743345695, 'come'), (-0.6833202704970631, 'calling'), (-0.65438056224255, 'worst'), (-0.6507018512260512, 'minutes')]
[(0.6415435014206254, 'ate'), (0.6632204158005521, 'need'), (0.672675609201967, 'ice'), (0.7088874544298278, 'noodle'), (0.7393270662265934, 'atmosphere'), (0.7617007305608157, 'friendly'), (0.8474594141067371, 'fresh'), (0.8693826879421644, 'great'), (1.0148539818198117, 'amazing'), (1.0810857768119515, 'best')]


In [173]:
nb_clf1.score(X_testst_vec1,y_test_st) #For count vectorizer

0.8928571428571429

In [174]:
nb_clf2.score(X_testst_vec2,y_test_st) # for tfidf vectorizer

0.8571428571428571

In [175]:
# For count vectorizer
from sklearn.metrics import confusion_matrix
y_pred_st1 = nb_clf1.fit(X_trainst_vec1, y_train_st).predict(X_testst_vec1)
cm=confusion_matrix(y_test_st, y_pred_st1, labels=['n','p'])
print(cm)

[[12  3]
 [ 0 13]]


In [176]:
# For tfidf vectorizer
from sklearn.metrics import confusion_matrix
y_pred_st2 = nb_clf2.fit(X_trainst_vec2, y_train_st).predict(X_testst_vec2)
cm=confusion_matrix(y_test_st, y_pred_st2, labels=['n','p'])
print(cm)

[[11  4]
 [ 0 13]]


In [177]:
# print classification report for count vectorizer

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test_st, y_pred_st1, average=None))
print(recall_score(y_test_st, y_pred_st1, average=None))

from sklearn.metrics import classification_report
target_names = ['n','p']
print(classification_report(y_test_st, y_pred_st1, target_names=target_names))

[1.     0.8125]
[0.8 1. ]
              precision    recall  f1-score   support

           n       1.00      0.80      0.89        15
           p       0.81      1.00      0.90        13

    accuracy                           0.89        28
   macro avg       0.91      0.90      0.89        28
weighted avg       0.91      0.89      0.89        28



In [178]:
# print classification report for tfidf vectorizer

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test_st, y_pred_st2, average=None))
print(recall_score(y_test_st, y_pred_st2, average=None))

from sklearn.metrics import classification_report
target_names = ['n','p']
print(classification_report(y_test_st, y_pred_st2, target_names=target_names))

[1.         0.76470588]
[0.73333333 1.        ]
              precision    recall  f1-score   support

           n       1.00      0.73      0.85        15
           p       0.76      1.00      0.87        13

    accuracy                           0.86        28
   macro avg       0.88      0.87      0.86        28
weighted avg       0.89      0.86      0.86        28



In [179]:
## find the calculated posterior probability for count vectorizer
posterior_probs1 = nb_clf1.predict_proba(X_testst_vec1)

## find the posterior probabilities for the first test example
print(posterior_probs1[0])

# find the category prediction for the first test example
y_pred1 = nb_clf1.predict(X_testst_vec1)
print(y_pred1[0])

# check the actual label for the first test example
print(y_test_st[0])

[0.78085003 0.21914997]
n
n


In [180]:
## find the calculated posterior probability for count vectorizer
posterior_probs2 = nb_clf2.predict_proba(X_testst_vec2)

## find the posterior probabilities for the first test example
print(posterior_probs2[0])

# find the category prediction for the first test example
y_pred2 = nb_clf2.predict(X_testst_vec2)
print(y_pred2[0])

# check the actual label for the first test example
print(y_test_st[0])

[0.55908436 0.44091564]
n
n


## Error Analysis

In [198]:
#Negative review predicted as positive for Count vectorizer
err_cnt = 0
for i in range(0, len(y_test_st)):
    if(y_test_st[i]=='n' and y_pred1[i]=='p'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'This place used to be great. I can\'t believe it\'s current state. Instead of the cool, dimly-lit lounge that I was used to, I was in a cheap, smelly bar. The music has no soul, the bartender is mean. This place no longer exudes a welcoming spirit. The crowd is awkward and old. I want my old hangout back!!'
'Carlo\'s Plate Shack was the worst dining experience of my life. Although my Southern Comfort Plate sounded to die for, the staff was extremely unhelpful at every turn. We started off with drinks, I had a sick Loganberry milkshake, and my friends had fresh brewed, but bland, iced tea (the ice likely melted and diluted). Eventually our server returned a half hour later to take our orders. I had the aforementioned Southern Comfort Plate, while my friends ordered the Buffalo Chicken Plate and the Hawaiian Plate Lunch. The Southern Comfort Plate came out first, a good 15 minutes before the others, and was extremely greasy. The other 2 ended up being nearly room temperature when they c

In [52]:
#Negative review predicted as positive for tfidf vectorizer
err_cnt = 0
for i in range(0, len(y_test_st)):
    if(y_test_st[i]=='n' and y_pred2[i]=='p'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'This place used to be great. I can\'t believe it\'s current state. Instead of the cool, dimly-lit lounge that I was used to, I was in a cheap, smelly bar. The music has no soul, the bartender is mean. This place no longer exudes a welcoming spirit. The crowd is awkward and old. I want my old hangout back!!'
'Carlo\'s Plate Shack was the worst dining experience of my life. Although my Southern Comfort Plate sounded to die for, the staff was extremely unhelpful at every turn. We started off with drinks, I had a sick Loganberry milkshake, and my friends had fresh brewed, but bland, iced tea (the ice likely melted and diluted). Eventually our server returned a half hour later to take our orders. I had the aforementioned Southern Comfort Plate, while my friends ordered the Buffalo Chicken Plate and the Hawaiian Plate Lunch. The Southern Comfort Plate came out first, a good 15 minutes before the others, and was extremely greasy. The other 2 ended up being nearly room temperature when they c

In [182]:
#Positive review predicted as negative for count vectorizer
err_cnt = 0
for i in range(0, len(y_test_st)):
    if(y_test_st[i]=='p' and y_pred1[i]=='n'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

errors: 0


In [183]:
#Positive review predicted as negative for TFIDF vectorizer
err_cnt = 0
for i in range(0, len(y_test_st)):
    if(y_test_st[i]=='p' and y_pred2[i]=='n'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

errors: 0


## Cross validation on Sentiment Analysis

In [184]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [185]:
##MNB with Bool
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=True)),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe, X, y_sentiment, cv=5)
avg=sum(scores)/len(scores)
print(avg)


0.8251461988304094


In [186]:
##MNB TFIDF
mNB_tfidf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False)),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tfidf_pipe,X,y_sentiment,cv=5)
print(sum(scores)/len(scores))


0.8801169590643274


## Train and Test split for authenticity anlaysis

In [187]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train_at, y_test_at = train_test_split(X, y_authenticity, test_size=0.3, random_state=0)

print(X_train.shape, y_train_at.shape, X_test.shape, y_test_at.shape)
print(X_train[0])
print(y_train_at[0])
print(X_test[0])
print(y_test_at[0])

(64,) (64,) (28,) (28,)
'The restaurant looked pretty good, the people around me all ate and talked happily. The environment was comfortable and it calmed me down from the busy life. The food tasted fantastic and the price was suitable'
t
'After I went shopping with some of my friend, we went to DODO restaurant for dinner. I found worm in one of the dishes .'
f


In [188]:
# Check how many training examples in each category
# this is important to see whether the data set is balanced or skewed

unique, counts = np.unique(y_train_at, return_counts=True)
print(np.asarray((unique, counts)))

[['f' 't']
 [33 31]]


## Vectorization

In [189]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [190]:
#  unigram term frequency vectorizer
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', min_df=2, binary=False, stop_words = 'english')
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=2, stop_words = 'english')

In [191]:
X_trainat_vec1 = unigram_count_vectorizer.fit_transform(X_train)
X_trainat_vec2 = unigram_tfidf_vectorizer.fit_transform(X_train)

# check the content of a document vector for unigram vectorizer
print(X_trainat_vec1.shape)
print(X_trainat_vec1[0].toarray())
 
# check the content of a document vector for tfidf vectorizer
print(X_trainat_vec2.shape)
print(X_trainat_vec2[0].toarray())

(64, 310)
[[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
(64, 310)
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.30631198 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.   

In [192]:
# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer.vocabulary_.items())[:10])

# check the size of the constructed vocabulary
print(len(unigram_tfidf_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_tfidf_vectorizer.vocabulary_.items())[:10])

310
[('restaurant', 219), ('looked', 156), ('pretty', 202), ('good', 112), ('people', 193), ('ate', 12), ('talked', 259), ('environment', 80), ('comfortable', 50), ('life', 147)]
310
[('restaurant', 219), ('looked', 156), ('pretty', 202), ('good', 112), ('people', 193), ('ate', 12), ('talked', 259), ('environment', 80), ('comfortable', 50), ('life', 147)]


In [193]:
X_testat_vec1 = unigram_count_vectorizer.transform(X_test)
# print out #examples and #features in the test set
print(X_testat_vec1.shape)

X_testat_vec2 = unigram_tfidf_vectorizer.transform(X_test)
# print out #examples and #features in the test set
print(X_testat_vec2.shape)

(28, 310)
(28, 310)


In [194]:
### Training a MNB Classsifier using Count Vectorizer on analysing authenticity

In [195]:
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import BernoulliNB

# initialize the MNB model
nb_clf3= MultinomialNB()
nb_clf4= MultinomialNB()

# use the training data to train the MNB model
# feature_log_prob_ stores the conditional probs for all categories
# if the labels are strings, the index is in alphabetic order
# e.g. 'f' comes before 't' in alphabet, so 'f' is in [0] dimension and 't' in [1]

nb_clf3.fit(X_trainat_vec1,y_train_at)
print(nb_clf3.classes_)
print(nb_clf3.feature_log_prob_.shape)

nb_clf4.fit(X_trainat_vec2,y_train_at)
print(nb_clf4.classes_)
print(nb_clf4.feature_log_prob_.shape)
#print(nb_clf.classes_[1])

['f' 't']
(2, 310)
['f' 't']
(2, 310)


In [196]:
print("fake Feature ranks using Count Vectorizer")
feature_ranks = sorted(zip(nb_clf3.feature_log_prob_[0], unigram_count_vectorizer.get_feature_names()))
negative_features = feature_ranks[-10:]
print(negative_features)

print("fake Feature ranks using tfidf Vectorizer")
feature_ranks = sorted(zip(nb_clf4.feature_log_prob_[0], unigram_tfidf_vectorizer.get_feature_names()))
negative_features = feature_ranks[-10:]
print(negative_features)

fake Feature ranks using Count Vectorizer
[(-4.599700710183556, 'good'), (-4.599700710183556, 'like'), (-4.512689333193926, 'minutes'), (-4.512689333193926, 'went'), (-4.358538653366669, 'experience'), (-4.289545781879717, 'service'), (-4.225007260742146, 'best'), (-4.225007260742146, 'place'), (-3.819542152633981, 'restaurant'), (-3.596398601319771, 'food')]
fake Feature ranks using tfidf Vectorizer
[(-5.222621534700683, 'menu'), (-5.204141206095463, 'minutes'), (-5.194766449563898, 'want'), (-5.191439930725854, 'like'), (-5.169589079803627, 'experience'), (-5.091523346372281, 'service'), (-5.036297924807053, 'place'), (-4.9902384138165345, 'best'), (-4.931710867417781, 'restaurant'), (-4.8395399172187705, 'food')]


In [197]:
print("true Feature ranks using Count Vectorizer")
feature_ranks = sorted(zip(nb_clf3.feature_log_prob_[1], unigram_count_vectorizer.get_feature_names()))
positive_features = feature_ranks[-10:]
print(positive_features)

print("true Feature ranks using tfidf Vectorizer")
feature_ranks = sorted(zip(nb_clf4.feature_log_prob_[1], unigram_tfidf_vectorizer.get_feature_names()))
positive_features = feature_ranks[-10:]
print(positive_features)

true Feature ranks using Count Vectorizer
[(-4.65290158880928, 'really'), (-4.65290158880928, 'salad'), (-4.65290158880928, 'time'), (-4.5475410731514545, 'ordered'), (-4.5475410731514545, 'went'), (-4.45223089334713, 'best'), (-4.45223089334713, 'place'), (-4.14207596504329, 'good'), (-3.5542893001411713, 'food'), (-3.384390263345774, 'restaurant')]
true Feature ranks using tfidf Vectorizer
[(-5.3097769800584675, 'went'), (-5.279450072677803, 'bad'), (-5.2643763095714355, 'dishes'), (-5.242117858118911, 'really'), (-5.22604674402891, 'salad'), (-5.209498165662844, 'ordered'), (-5.123975695678814, 'place'), (-5.008451736461623, 'good'), (-4.801344505308265, 'food'), (-4.653277927142044, 'restaurant')]


In [118]:
# feature analysis for sentiment using Unigram count vectorizer
log_ratios = []
features = unigram_count_vectorizer.get_feature_names()
vneg_cond_prob = nb_clf3.feature_log_prob_[1]
vpos_cond_prob = nb_clf3.feature_log_prob_[0]

log_ratios = nb_clf3.feature_log_prob_[1] - nb_clf3.feature_log_prob_[0]

exercise_C_ranks = sorted(zip(log_ratios, features))
print(exercise_C_ranks[:10])
print(exercise_C_ranks[-10:])

[(-2.049754760499793, 'plate'), (-1.9319717248434092, 'want'), (-1.6442896523916293, 'cold'), (-1.6442896523916293, 'delicious'), (-1.4619680955976744, '15'), (-1.4619680955976744, 'bread'), (-1.4619680955976744, 'cooked'), (-1.4619680955976744, 'free'), (-1.4619680955976744, 'ice'), (-1.4619680955976744, 'outstanding')]
[(1.533764177956317, 'did'), (1.533764177956317, 'ignored'), (1.533764177956317, 'makes'), (1.533764177956317, 'pretty'), (1.533764177956317, 'shrimp'), (1.533764177956317, 'thing'), (1.7569077292705266, 'calling'), (1.7569077292705266, 'tables'), (1.9392292860644806, 'worst'), (2.2269113585162623, 'glass')]


In [119]:
# feature analysis for sentiment using tfidf count vectorizer
log_ratios = []
features = unigram_count_vectorizer.get_feature_names()
vneg_cond_prob = nb_clf4.feature_log_prob_[1]
vpos_cond_prob = nb_clf4.feature_log_prob_[0]

log_ratios = nb_clf4.feature_log_prob_[1] - nb_clf4.feature_log_prob_[0]

exercise_C_ranks = sorted(zip(log_ratios, features))
print(exercise_C_ranks[:10])
print(exercise_C_ranks[-10:])

[(-0.8478165835813405, 'want'), (-0.6619293787971818, 'delicious'), (-0.6190970810570091, 'cold'), (-0.6007795313464701, 'steak'), (-0.5984212150128023, 'ice'), (-0.5789412885558667, 'plate'), (-0.5445522419926903, 'free'), (-0.5357420718877588, 'cooked'), (-0.5281734588758047, 'bring'), (-0.5079079137785811, 'beer')]
[(0.5676775438136312, 'suitable'), (0.5710307503215892, 'smile'), (0.5878328278079206, 'big'), (0.6079274574779987, 'shrimp'), (0.6169490822281132, 'environment'), (0.671896846583147, 'did'), (0.7286349564317156, 'worst'), (0.7312451103525666, 'glass'), (0.7369803193402422, 'tables'), (0.7575746646862287, 'calling')]


In [76]:
nb_clf3.score(X_testat_vec1,y_test_at) #For count vectorizer

0.5357142857142857

In [79]:
nb_clf4.score(X_testat_vec2,y_test_at) # for tfidf vectorizer

0.5357142857142857

In [84]:
# For count vectorizer
from sklearn.metrics import confusion_matrix
y_pred_at1 = nb_clf3.fit(X_trainat_vec1, y_train_at).predict(X_testat_vec1)
cm=confusion_matrix(y_test_at, y_pred_at1, labels=['f','t'])
print(cm)

[[9 4]
 [9 6]]


In [86]:
# For tf_idf vectorizer
from sklearn.metrics import confusion_matrix
y_pred_at2 = nb_clf4.fit(X_trainat_vec2, y_train_at).predict(X_testat_vec2)
cm=confusion_matrix(y_test_at, y_pred_at2, labels=['f','t'])
print(cm)

[[10  3]
 [10  5]]


In [87]:
# print classification report for count vectorizer

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test_at, y_pred_at1, average=None))
print(recall_score(y_test_at, y_pred_at1, average=None))

from sklearn.metrics import classification_report
target_names = ['f','t']
print(classification_report(y_test_at, y_pred_at1, target_names=target_names))

[0.5 0.6]
[0.69230769 0.4       ]
              precision    recall  f1-score   support

           f       0.50      0.69      0.58        13
           t       0.60      0.40      0.48        15

    accuracy                           0.54        28
   macro avg       0.55      0.55      0.53        28
weighted avg       0.55      0.54      0.53        28



In [88]:
# print classification report for tfidf vectorizer

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test_at, y_pred_at2, average=None))
print(recall_score(y_test_at, y_pred_at2, average=None))

from sklearn.metrics import classification_report
target_names = ['f','t']
print(classification_report(y_test_at, y_pred_at2, target_names=target_names))

[0.5   0.625]
[0.76923077 0.33333333]
              precision    recall  f1-score   support

           f       0.50      0.77      0.61        13
           t       0.62      0.33      0.43        15

    accuracy                           0.54        28
   macro avg       0.56      0.55      0.52        28
weighted avg       0.57      0.54      0.51        28



In [70]:
## find the calculated posterior probability for count vectorizer
posterior_probs3 = nb_clf3.predict_proba(X_testat_vec1)

## find the posterior probabilities for the first test example
print(posterior_probs3[0])

# find the category prediction for the first test example
y_pred3 = nb_clf3.predict(X_testat_vec1)
print(y_pred1[0])

# check the actual label for the first test example
print(y_test_at[0])

[0.65496973 0.34503027]
n
f


In [89]:
## find the calculated posterior probability for tfidf vectorizer
posterior_probs4 = nb_clf4.predict_proba(X_testat_vec2)

## find the posterior probabilities for the first test example
print(posterior_probs4[0])

# find the category prediction for the first test example
y_pred4 = nb_clf4.predict(X_testat_vec2)
print(y_pred4[0])

# check the actual label for the first test example
print(y_test_at[0])

[0.51726141 0.48273859]
f
f


## Error Analysis

In [93]:
#fake review predicted as true review for Count vectorizer
err_cnt = 0
for i in range(0, len(y_test_at)):
    if(y_test_at[i]=='f' and y_pred3[i]=='t'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'I ate at this restaurant called Banana Leaf. As I entered the restaurant I really liked the ambiance. I ordered noodle soup and fried rice with spicy black bean curry. The service was pretty fast and the food tasted amazing. There was a lot flavor in the food which I truly enjoyed. Two thumbs up for Banana Leaf and I would totally recommend this restaurant.'
'OMG. This restaurant is horrible. The receptionist did not greet us, we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh, the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
'This restaurant ROCKS! I mean the food is great and people are great. Everything is great great just great!!! I love it. I like it. '
'I went to ABC restaurant two days ago and I hated the food and the service. We were kept waiting for over an hour just to get seated and once we or

In [95]:
# True review predicted as fake review for count vectorizer
err_cnt = 0
for i in range(0, len(y_test_at)):
    if(y_test_at[i]=='f' and y_pred4[i]=='t'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'I ate at this restaurant called Banana Leaf. As I entered the restaurant I really liked the ambiance. I ordered noodle soup and fried rice with spicy black bean curry. The service was pretty fast and the food tasted amazing. There was a lot flavor in the food which I truly enjoyed. Two thumbs up for Banana Leaf and I would totally recommend this restaurant.'
'OMG. This restaurant is horrible. The receptionist did not greet us, we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh, the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
'I went to ABC restaurant two days ago and I hated the food and the service. We were kept waiting for over an hour just to get seated and once we ordered, our food came out cold. I ordered the pasta and it was terrible - completely bland and very unappatizing. I definitely would not

In [96]:
#fake review predicted as true review for tfidf vectorizer
err_cnt = 0
for i in range(0, len(y_test_at)):
    if(y_test_at[i]=='f' and y_pred4[i]=='t'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'I ate at this restaurant called Banana Leaf. As I entered the restaurant I really liked the ambiance. I ordered noodle soup and fried rice with spicy black bean curry. The service was pretty fast and the food tasted amazing. There was a lot flavor in the food which I truly enjoyed. Two thumbs up for Banana Leaf and I would totally recommend this restaurant.'
'OMG. This restaurant is horrible. The receptionist did not greet us, we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh, the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
'I went to ABC restaurant two days ago and I hated the food and the service. We were kept waiting for over an hour just to get seated and once we ordered, our food came out cold. I ordered the pasta and it was terrible - completely bland and very unappatizing. I definitely would not

In [97]:
#fake review predicted as true review for tfidf vectorizer
err_cnt = 0
for i in range(0, len(y_test_at)):
    if(y_test_at[i]=='f' and y_pred4[i]=='t'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

'I ate at this restaurant called Banana Leaf. As I entered the restaurant I really liked the ambiance. I ordered noodle soup and fried rice with spicy black bean curry. The service was pretty fast and the food tasted amazing. There was a lot flavor in the food which I truly enjoyed. Two thumbs up for Banana Leaf and I would totally recommend this restaurant.'
'OMG. This restaurant is horrible. The receptionist did not greet us, we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh, the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
'I went to ABC restaurant two days ago and I hated the food and the service. We were kept waiting for over an hour just to get seated and once we ordered, our food came out cold. I ordered the pasta and it was terrible - completely bland and very unappatizing. I definitely would not

## Cross Validation

In [103]:
##MNB with Bool
nb_clf_pipe3 = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=True)),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe3, X, y_authenticity, cv=5)
avg=sum(scores)/len(scores)
print(avg)


0.587719298245614


In [105]:
##MNB TFIDF
mNB_tfidf_pipe4 = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False)),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tfidf_pipe4,X,y_authenticity,cv=5)
print(sum(scores)/len(scores))

0.587719298245614
