# Data Preprocessing

In [1]:
!pip install pandas
!pip install scikit-learn



In [2]:
import pandas as pd

file = "C:/Users/Patrick/Syracuse_courses/IST_736/HW4/deception_data_two_labels.csv"

original_df = pd.read_csv(file)
original_df.head()

Unnamed: 0,lie,sentiment,review,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,f,n,'Mike\'s Pizza High Point,NY Service was very slow and the quality was ...,not. Stick to pre-made dishes like stuffed pa...,,,,,,...,,,,,,,,,,
1,f,n,'i really like this buffet restaurant in Marsh...,japanese,and chinese dishes. we also got a free drink ...,,,,,,...,,,,,,,,,,
2,f,n,'After I went shopping with some of my friend,we went to DODO restaurant for dinner. I foun...,,,,,,,...,,,,,,,,,,
3,f,n,'Olive Oil Garden was very disappointing. I ex...,and the waitor had no manners whatsoever. Don...,,,,,,,...,,,,,,,,,,
4,f,n,'The Seven Heaven restaurant was never known f...,never more. ',,,,,,,...,,,,,,,,,,


In [3]:
# combine review columns into one column and drop remaining, empty columns
original_df['review'] = original_df.iloc[:, 2:].apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
cleaned_df = original_df.drop(original_df.columns[3:], axis=1)
cleaned_df.head()

Unnamed: 0,lie,sentiment,review
0,f,n,'Mike\'s Pizza High Point NY Service was very...
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,'After I went shopping with some of my friend ...
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...


In [4]:
# remove miscellaneous characters from review column
cleaned_df['review'] = (cleaned_df['review'].str.replace("f'", '', regex=True)
                .str.replace("t'", '', regex=True)
                .str.replace('\\', '', regex=True))  
cleaned_df['review'] = cleaned_df['review'].str.rstrip("'")  # remove final single quote
cleaned_df['review'] = cleaned_df['review'].str.replace("'", "", 1)  # remove first single quote from each row
cleaned_df.head()

Unnamed: 0,lie,sentiment,review
0,f,n,Mike's Pizza High Point NY Service was very s...
1,f,n,i really like this buffet restaurant in Marsha...
2,f,n,After I went shopping with some of my friend ...
3,f,n,Olive Oil Garden was very disappointing. I exp...
4,f,n,The Seven Heaven restaurant was never known fo...


In [5]:
cleaned_df['review'][0]

"Mike's Pizza High Point  NY Service was very slow and the quality was low. You would think they would know at least how to make good pizza  not. Stick to pre-made dishes like stuffed pasta or a salad. You should consider dining else where."

In [6]:
# extract lie label and create separate df from cleaned df
lie_df = cleaned_df.drop(cleaned_df.columns[1],axis=1)
lie_df = lie_df.rename(columns={'lie':'LABEL'})
lie_df.head()

Unnamed: 0,LABEL,review
0,f,Mike's Pizza High Point NY Service was very s...
1,f,i really like this buffet restaurant in Marsha...
2,f,After I went shopping with some of my friend ...
3,f,Olive Oil Garden was very disappointing. I exp...
4,f,The Seven Heaven restaurant was never known fo...


In [7]:
# extract sentiment label and create separate df from cleaned df
sentiment_df = cleaned_df.drop(cleaned_df.columns[0],axis=1)
sentiment_df = sentiment_df.rename(columns={'sentiment':'LABEL'})
sentiment_df.head()

Unnamed: 0,LABEL,review
0,n,Mike's Pizza High Point NY Service was very s...
1,n,i really like this buffet restaurant in Marsha...
2,n,After I went shopping with some of my friend ...
3,n,Olive Oil Garden was very disappointing. I exp...
4,n,The Seven Heaven restaurant was never known fo...


# Tokenization/Vectorization
This code will produce 4 dataframes: 
1. a vectorized df of the lie dataset using CountVectorizer, 
2. a vectorized df
of the lie dataset using TfidfVectorizer, 
3. a vectorized df of the sentiment dataset using CountVectorizer, and 
4. a 
vectorized df of the sentiment dataset using TfidfVectorizer.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# create CountVectorizer() object
MyLieCV = CountVectorizer(input='content', stop_words='english')
# create TfidfVectorizer() object
MyLieTFIDF = TfidfVectorizer(input='content', stop_words='english')

MySentCV = CountVectorizer(input='content', stop_words='english')
MySentTFIDF = TfidfVectorizer(input='content', stop_words='english')

In [9]:
# create document term matrix using CV object
dtm_lie_cv = MyLieCV.fit_transform(lie_df['review'])
# vectorize df
vectorized_lie_cv = pd.DataFrame(dtm_lie_cv.toarray(), columns=MyLieCV.get_feature_names_out())

vectorized_lie_cv.insert(0, 'LABEL', lie_df['LABEL'])
print(vectorized_lie_cv.head())

  LABEL  10  100  15  16  20  25  2nd  30  50  ...  write  written  wrong  \
0     f   0    0   0   0   0   0    0   0   0  ...      0        0      0   
1     f   0    0   0   0   0   0    0   0   0  ...      0        0      0   
2     f   0    0   0   0   0   0    0   0   0  ...      0        0      0   
3     f   0    0   0   0   0   0    0   0   0  ...      0        0      0   
4     f   0    0   0   0   0   0    0   0   0  ...      0        0      0   

   wrote  xyz  yeah  yelp  yesterday  york  yuenan  
0      0    0     0     0          0     0       0  
1      0    0     0     0          0     0       0  
2      0    0     0     0          0     0       0  
3      0    0     0     0          0     0       0  
4      0    0     0     0          0     0       0  

[5 rows x 1255 columns]


In [10]:
# create document term matrix using CV object
dtm_lie_tfidf = MyLieTFIDF.fit_transform(lie_df['review'])
# vectorize df
vectorized_lie_tfidf = pd.DataFrame(dtm_lie_tfidf.toarray(), columns=MyLieTFIDF.get_feature_names_out())

vectorized_lie_tfidf.insert(0, 'LABEL', lie_df['LABEL'])
print(vectorized_lie_tfidf.head())

  LABEL   10  100   15   16   20   25  2nd   30   50  ...  write  written  \
0     f  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
1     f  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
2     f  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
3     f  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
4     f  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   

   wrong  wrote  xyz  yeah  yelp  yesterday  york  yuenan  
0    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
1    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
2    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
3    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
4    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  

[5 rows x 1255 columns]


In [11]:
dtm_sentiment_cv = MySentCV.fit_transform(sentiment_df['review'])
vectorized_sentiment_cv = pd.DataFrame(dtm_sentiment_cv.toarray(), columns=MySentCV.get_feature_names_out())

vectorized_sentiment_cv.insert(0, 'LABEL', sentiment_df['LABEL'])
print(vectorized_sentiment_cv.head())

  LABEL  10  100  15  16  20  25  2nd  30  50  ...  write  written  wrong  \
0     n   0    0   0   0   0   0    0   0   0  ...      0        0      0   
1     n   0    0   0   0   0   0    0   0   0  ...      0        0      0   
2     n   0    0   0   0   0   0    0   0   0  ...      0        0      0   
3     n   0    0   0   0   0   0    0   0   0  ...      0        0      0   
4     n   0    0   0   0   0   0    0   0   0  ...      0        0      0   

   wrote  xyz  yeah  yelp  yesterday  york  yuenan  
0      0    0     0     0          0     0       0  
1      0    0     0     0          0     0       0  
2      0    0     0     0          0     0       0  
3      0    0     0     0          0     0       0  
4      0    0     0     0          0     0       0  

[5 rows x 1255 columns]


In [12]:
dtm_sentiment_tfidf = MySentTFIDF.fit_transform(sentiment_df['review'])
vectorized_sentiment_tfidf = pd.DataFrame(dtm_sentiment_tfidf.toarray(), columns=MySentTFIDF.get_feature_names_out())

vectorized_sentiment_tfidf.insert(0, 'LABEL', sentiment_df['LABEL'])
print(vectorized_sentiment_tfidf.head())

  LABEL   10  100   15   16   20   25  2nd   30   50  ...  write  written  \
0     n  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
1     n  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
2     n  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
3     n  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   
4     n  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0      0.0   

   wrong  wrote  xyz  yeah  yelp  yesterday  york  yuenan  
0    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
1    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
2    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
3    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  
4    0.0    0.0  0.0   0.0   0.0        0.0   0.0     0.0  

[5 rows x 1255 columns]


In [13]:
# remove columns with digits
def alpha_only(df):
    columns_to_drop = [col for col in df.columns if any(char.isdigit() for char in col)]
    df = df.drop(columns=columns_to_drop)
    
    return df

print('Lie DF, CV:')
vectorized_lie_cv = alpha_only(vectorized_lie_cv)
vectorized_lie_cv.head()

Lie DF, CV:


Unnamed: 0,LABEL,abc,abruptly,absolutely,acceptable,accord,acknowledge,actual,actually,ad,...,write,written,wrong,wrote,xyz,yeah,yelp,yesterday,york,yuenan
0,f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
print('Lie DF, TFIDF:')
vectorized_lie_tfidf = alpha_only(vectorized_lie_tfidf)
vectorized_lie_tfidf.head()

Lie DF, TFIDF:


Unnamed: 0,LABEL,abc,abruptly,absolutely,acceptable,accord,acknowledge,actual,actually,ad,...,write,written,wrong,wrote,xyz,yeah,yelp,yesterday,york,yuenan
0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
print('Sentiment DF, CV:')
vectorized_sentiment_cv = alpha_only(vectorized_sentiment_cv)
vectorized_sentiment_cv.head()

Sentiment DF, CV:


Unnamed: 0,LABEL,abc,abruptly,absolutely,acceptable,accord,acknowledge,actual,actually,ad,...,write,written,wrong,wrote,xyz,yeah,yelp,yesterday,york,yuenan
0,n,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,n,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,n,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,n,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
print('Sentiment DF, TFIDF:')
vectorized_sentiment_tfidf = alpha_only(vectorized_sentiment_tfidf)
vectorized_sentiment_tfidf.head()

Sentiment DF, TFIDF:


Unnamed: 0,LABEL,abc,abruptly,absolutely,acceptable,accord,acknowledge,actual,actually,ad,...,write,written,wrong,wrote,xyz,yeah,yelp,yesterday,york,yuenan
0,n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The preceeding blocks of code created the 4 dataframes using CountVectorizer and TfidfVectorizer. Now it's time to train a model.

# Train Naive Bayes model
These code blocks split the 4 dataframes created in the prior blocks into training and testing segments with a 
70% - 30% split for training and testing. Each of the 4 training segments will be used to create 4 separate models:

Model 1: Lie detection model trained on a CountVectorizer() df,

Model 2: Lie detection model trained on a TfidfVectorizer() df,

Model 3: Sentiment classification model trained on a CountVectorizer() df, and

Model 4: Sentiment classification model trained on a TfidfVectorizer() df.

In [17]:
from sklearn.model_selection import train_test_split

TrainLieCV, TestLieCV = train_test_split(vectorized_lie_cv, test_size=0.3)  # model 1
TrainLieTFIDF, TestLieTFIDF = train_test_split(vectorized_lie_tfidf, test_size=0.3)  # model 2
TrainSentCV, TestSentCV = train_test_split(vectorized_sentiment_cv, test_size=0.3)  # model 3
TrainSentTFIDF, TestSentTFIDF = train_test_split(vectorized_sentiment_tfidf, test_size=0.3)  # model 4

In [18]:
# split testing labels from testing data
TestLieCVLabels = TestLieCV['LABEL']
TestLieTFIDFLabels = TestLieTFIDF['LABEL']
TestSentCVLabels = TestSentCV['LABEL']
TestSentTFIDFLabels = TestSentTFIDF['LABEL']

TestLieCV = TestLieCV.drop(["LABEL"], axis=1)  # remove the entire column
TestLieTFIDF = TestLieTFIDF.drop(["LABEL"], axis=1)
TestSentCV = TestSentCV.drop(["LABEL"], axis=1)
TestSentTFIDF = TestSentTFIDF.drop(["LABEL"], axis=1)

In [19]:
# split training labels from training data
TrainLieCVLabels = TrainLieCV['LABEL']
TrainLieTFIDFLabels = TrainLieTFIDF['LABEL']
TrainSentCVLabels = TrainSentCV['LABEL']
TrainSentTFIDFLabels = TrainSentTFIDF['LABEL']

TrainLieCV = TrainLieCV.drop(["LABEL"], axis=1)  # remove the entire column
TrainLieTFIDF = TrainLieTFIDF.drop(["LABEL"], axis=1)
TrainSentCV = TrainSentCV.drop(["LABEL"], axis=1)
TrainSentTFIDF = TrainSentTFIDF.drop(["LABEL"], axis=1)

Create models and fit training sets to models

In [20]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

MyModel1 = MultinomialNB()
MyModel2 = MultinomialNB()
MyModel3 = MultinomialNB()
MyModel4 = MultinomialNB()

Model 1

In [21]:
LieCV = MyModel1.fit(TrainLieCV, TrainLieCVLabels)
PredLieCV = MyModel1.predict(TestLieCV)
print(PredLieCV)
print(np.round(MyModel1.predict_proba(TestLieCV),2))

['f' 'f' 'f' 't' 'f' 'f' 'f' 'f' 'f' 't' 't' 't' 'f' 'f' 'f' 'f' 'f' 'f'
 't' 'f' 'f' 't' 't' 't' 't' 'f' 't' 'f']
[[0.91 0.09]
 [1.   0.  ]
 [0.75 0.25]
 [0.1  0.9 ]
 [0.93 0.07]
 [0.74 0.26]
 [1.   0.  ]
 [0.82 0.18]
 [1.   0.  ]
 [0.03 0.97]
 [0.07 0.93]
 [0.3  0.7 ]
 [0.93 0.07]
 [0.94 0.06]
 [0.99 0.01]
 [0.98 0.02]
 [0.98 0.02]
 [0.95 0.05]
 [0.44 0.56]
 [0.87 0.13]
 [1.   0.  ]
 [0.24 0.76]
 [0.08 0.92]
 [0.01 0.99]
 [0.03 0.97]
 [0.56 0.44]
 [0.09 0.91]
 [0.59 0.41]]


Model 2

In [22]:
LieTFIDF = MyModel2.fit(TrainLieTFIDF, TrainLieTFIDFLabels)
PredLieTFIDF = MyModel2.predict(TestLieTFIDF)

Model 3

In [23]:
SentimentCV = MyModel3.fit(TrainSentCV, TrainSentCVLabels)
PredSentimentCV = MyModel3.predict(TestSentCV)

Model 4

In [24]:
SentimentTFIDF = MyModel4.fit(TrainSentTFIDF, TrainSentTFIDFLabels)
PredSentimentTFIDF = MyModel4.predict(TestSentTFIDF)

# Test Models

### Fake Review Detection

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

cnf_matrix1 = confusion_matrix(TestLieCVLabels, PredLieCV)
print("\nThe Model 1 confusion matrix is:")
print(cnf_matrix1)

print('\nAccuracy:')
accuracy_score(TestLieCVLabels, PredLieCV)


The Model 1 confusion matrix is:
[[ 5  5]
 [13  5]]

Accuracy:


0.35714285714285715

In [26]:
cnf_matrix2 = confusion_matrix(TestLieTFIDFLabels, PredLieTFIDF)
print("\nThe Model 2 confusion matrix is:")
print(cnf_matrix2)

print('\nAccuracy:')
accuracy_score(TestLieTFIDFLabels, PredLieTFIDF)


The Model 2 confusion matrix is:
[[5 8]
 [7 8]]

Accuracy:


0.4642857142857143

### Sentiment Classification

In [27]:
cnf_matrix3 = confusion_matrix(TestSentCVLabels, PredSentimentCV)
print("\nThe Model 3 confusion matrix is:")
print(cnf_matrix3)

print('\nAccuracy:')
accuracy_score(TestSentCVLabels, PredSentimentCV)


The Model 3 confusion matrix is:
[[14  2]
 [ 0 12]]

Accuracy:


0.9285714285714286

In [28]:
cnf_matrix4 = confusion_matrix(TestSentTFIDFLabels, PredSentimentTFIDF)
print("\nThe Model 4 confusion matrix is:")
print(cnf_matrix4)

print('\nAccuracy:')
accuracy_score(TestSentTFIDFLabels, PredSentimentTFIDF)


The Model 4 confusion matrix is:
[[13  0]
 [ 5 10]]

Accuracy:


0.8214285714285714

# Feature Importance

### Lie Detection

In [29]:
print('MODEL 1')
false_class_prob_sorted = MyModel1.feature_log_prob_[0, :].argsort()[::-1]
true_class_prob_sorted = MyModel1.feature_log_prob_[1, :].argsort()[::-1]

print('Features for FALSE class:')
false_class = np.take(MyLieCV.get_feature_names_out(), false_class_prob_sorted[:10])
print(false_class)
print('\nFeatures for TRUE class:')
true_class = np.take(MyLieCV.get_feature_names_out(), true_class_prob_sorted[:10])
print(true_class)

unique_false_words = []
for word in false_class:
    if word not in true_class:
        unique_false_words.append(word)
print('\nWords unique to FALSE class:')
print(unique_false_words)

unique_true_words = []
for word in true_class:
    if word not in false_class:
        unique_true_words.append(word)
print('\nWords unique to TRUE class:')
print(unique_true_words)

MODEL 1
Features for FALSE class:
['regrettably' 'flavor' 'warmly' 'perplexing' 'seated' 'bean' 'meats'
 'piano' 'ethic' 'lasagna']

Features for TRUE class:
['regrettably' 'flavor' 'perplexing' 'gently' 'offered' 'warmly' 'gannon'
 'views' 'chairs' 'seated']

Words unique to FALSE class:
['bean', 'meats', 'piano', 'ethic', 'lasagna']

Words unique to TRUE class:
['gently', 'offered', 'gannon', 'views', 'chairs']


In [30]:
print('MODEL 2')
false_class_prob_sorted = MyModel2.feature_log_prob_[0, :].argsort()[::-1]
true_class_prob_sorted = MyModel2.feature_log_prob_[1, :].argsort()[::-1]

print('Features for FALSE class:')
false_class = np.take(MyLieTFIDF.get_feature_names_out(), false_class_prob_sorted[:10])
print(false_class)
print('\nFeatures for TRUE class:')
true_class = np.take(MyLieTFIDF.get_feature_names_out(), true_class_prob_sorted[:10])
print(true_class)

unique_false_words = []
for word in false_class:
    if word not in true_class:
        unique_false_words.append(word)
print('\nWords unique to FALSE class:')
print(unique_false_words)

unique_true_words = []
for word in true_class:
    if word not in false_class:
        unique_true_words.append(word)
print('\nWords unique to TRUE class:')
print(unique_true_words)

MODEL 2
Features for FALSE class:
['flavor' 'regrettably' 'gives' 'bean' 'seated' 'meats' 'classic' 'ethic'
 'warmly' 'gently']

Features for TRUE class:
['regrettably' 'flavor' 'perplexing' 'gently' 'named' 'einstein'
 'perfection' 'terrific' 'pasta' 'veggie']

Words unique to FALSE class:
['gives', 'bean', 'seated', 'meats', 'classic', 'ethic', 'warmly']

Words unique to TRUE class:
['perplexing', 'named', 'einstein', 'perfection', 'terrific', 'pasta', 'veggie']


### Sentiment Classification

In [31]:
print('MODEL 3')
neg_class_prob_sorted = MyModel3.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = MyModel3.feature_log_prob_[1, :].argsort()[::-1]

print('Features for NEGATIVE class:')
neg_class = np.take(MySentCV.get_feature_names_out(), neg_class_prob_sorted[:10])
print(neg_class)
print('\nFeatures for POSITIVE class:')
pos_class = np.take(MySentCV.get_feature_names_out(), pos_class_prob_sorted[:10])
print(pos_class)

unique_neg_words = []
for word in neg_class:
    if word not in pos_class:
        unique_neg_words.append(word)
print('\nWords unique to NEGATIVE class:')
print(unique_neg_words)

unique_pos_words = []
for word in pos_class:
    if word not in neg_class:
        unique_pos_words.append(word)
print('\nWords unique to POSITIVE class:')
print(unique_pos_words)

MODEL 3
Features for NEGATIVE class:
['flavor' 'regrettably' 'perplexing' 'warmly' 'meats' 'offered' 'gently'
 'offer' 'talking' 'lasagna']

Features for POSITIVE class:
['flavor' 'regrettably' 'bean' 'gently' 'add' 'gives' 'seated' 'safe'
 'perplexing' 'quiet']

Words unique to NEGATIVE class:
['warmly', 'meats', 'offered', 'offer', 'talking', 'lasagna']

Words unique to POSITIVE class:
['bean', 'add', 'gives', 'seated', 'safe', 'quiet']


In [32]:
print('MODEL 4')
neg_class_prob_sorted = MyModel4.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = MyModel4.feature_log_prob_[1, :].argsort()[::-1]

print('Features for NEGATIVE class:')
neg_class = np.take(MySentTFIDF.get_feature_names_out(), neg_class_prob_sorted[:10])
print(neg_class)
print('\nFeatures for POSITIVE class:')
pos_class = np.take(MySentTFIDF.get_feature_names_out(), pos_class_prob_sorted[:10])
print(pos_class)

unique_neg_words = []
for word in neg_class:
    if word not in pos_class:
        unique_neg_words.append(word)
print('\nWords unique to NEGATIVE class:')
print(unique_neg_words)

unique_pos_words = []
for word in pos_class:
    if word not in neg_class:
        unique_pos_words.append(word)
print('\nWords unique to POSITIVE class:')
print(unique_pos_words)

MODEL 4
Features for NEGATIVE class:
['flavor' 'regrettably' 'perplexing' 'meats' 'dirty' 'warmly' 'perfection'
 'romantic' 'seated' 'lasagna']

Features for POSITIVE class:
['flavor' 'bean' 'gives' 'regrettably' 'flute' 'followed' 'add' 'quiet'
 'perplexing' 'gently']

Words unique to NEGATIVE class:
['meats', 'dirty', 'warmly', 'perfection', 'romantic', 'seated', 'lasagna']

Words unique to POSITIVE class:
['bean', 'gives', 'flute', 'followed', 'add', 'quiet', 'gently']
