## Text Classification Model using Latent Dirichlet Allocation

In [1]:
import pandas as pd
import numpy as np
import nltk
en_stop = set(nltk.corpus.stopwords.words('english'))
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
import gensim
from gensim import corpora
import pickle
import random
import pyLDAvis.gensim
from gensim.models.callbacks import PerplexityMetric



In [2]:
# Load the cleaned review checkpoint (a CSV file); the absolute path is machine-specific.
data = pd.read_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint")
# Last expression of the cell: displays (rows, columns).
data.shape

(608458, 7)

### The text data has already been cleaned. Stop words still need to be removed, though, because we are not using TF-IDF for normalization at this point.

In [47]:
def preprocess_text(text):
    """Split ``text`` on whitespace and drop English stop words.

    Parameters
    ----------
    text : str
        Review text. Tokens are whatever ``str.split()`` yields, i.e. runs of
        non-whitespace characters.

    Returns
    -------
    list of str
        The tokens in their original order, minus every token that is a
        member of ``en_stop``.
    """
    # Membership in en_stop is an exact string match, so it is case-sensitive.
    return [word for word in text.split() if word not in en_stop]

## Create data split of both labels and develop a separate topic model for each set.

In [48]:
# Partition the reviews on the 'real' label column: 1 marks a real review, -1 a fake one.
real = data.loc[data['real'] == 1]
fake = data.loc[data['real'] == -1]
# Report the (rows, columns) of each partition.
print("Real:", real.shape, "Fake:", fake.shape)

Real: (528019, 7) Fake: (80439, 7)


In [5]:
#Converting all our text to a list of preprocessed tokens
review_data = []
# The bar is used as a context manager so it is closed when the loop finishes.
with tqdm_notebook(total=data.shape[0]) as pbar:
    for text in data['review']:
        # str() guards against non-string cells before tokenising.
        review_data.append(preprocess_text(str(text)))
        pbar.update(1)

HBox(children=(IntProgress(value=0, max=608458), HTML(value='')))




In [6]:
#Separating real and fake reviews in different lists
real = []
fake = []

for i in range(len(review_data)):
    if data['real'][i] == 1:
        real.append(review_data[i])
    else:
        fake.append(review_data[i])

print(len(real), len(fake))

528019 80439


In [7]:
#Don't use now
#Randomly creating a test and train set on both lists
# NOTE: random.shuffle reorders `real` and `fake` in place and no seed is set,
# so the resulting split differs from run to run.
random.shuffle(real)
random.shuffle(fake)

# Number of reviews held out from each class.
test_size = 200

# Slice with test_size (previously defined but ignored in favour of a
# hard-coded 200) so that changing the constant actually changes the split.
real_train = real[test_size:]
real_test = real[:test_size]
fake_train = fake[test_size:]
fake_test = fake[:test_size]

In [29]:
# Load the pre-made train/test split. The feature files carry a header row
# and only their 'review' column is kept.
X_train = pd.read_csv(r"E:\Yelp\Unfiltered Data\WorkingSet\X_train")['review']
X_test = pd.read_csv(r"E:\Yelp\Unfiltered Data\WorkingSet\X_test")['review']
# The label files are read without a header row; the label is taken from
# column 1 (column 0 is presumably a saved index -- TODO confirm).
Y_train = pd.read_csv(r"E:\Yelp\Unfiltered Data\WorkingSet\y_train", header=None)[1]
Y_test = pd.read_csv(r"E:\Yelp\Unfiltered Data\WorkingSet\y_test", header=None)[1]
# Preview the first training reviews.
X_train.head()


0    i fell madly in love with these tiny sandwich ...
1    my review will strictly be on what i order let...
2    im from ny and i can say this pizza be delicio...
3    i have never have coffee i enjoy in fact i thi...
4    the mediterranean version of chipotle decor be...
Name: review, dtype: object

In [40]:
# Number of training reviews; used as the loop bound when the training text
# is split by label.
rangex = X_train.shape[0]
# Only the last expression of a cell is displayed, so the next two bare
# expressions produce no output; they merely exercise the lookups.
rangex
Y_train[2]
# Displayed: the training review stored under index label 1.
X_train[1]

'my review will strictly be on what i order let me start by say the texture for each macaron be phenomenal and how a macaron should be crispy thin on the exterior and beyond delicate on the interior these literally melt in your mouth salt caramel just the right amount of salt to make you salivate but unfortunately the caramel be cook a tad too long and leave an unpleasant bitterness in my mouth chocolate nothing spectacular the chocolate taste like midgrade chocolate there be nothing too luscious about the ganache rise not a fan of rise so that should say more than enough ice box seasonal refreshing but the flavor be so delicate it be almost nonexistent and leave more of a cool effect on the palette instead of an everlasting flavor coconut coconut can go in so many direction and the profile on this be more of that sunblock coconut i prefer a delicate young coconut flavor pistachio the best of the dozen true pistachio flavor lemon the instant it hit your palette you think you be eat a s

In [41]:
# Placeholders kept from the original cell; nothing in this cell fills them.
real_indexes = []
fake_indexes = []

# Route each training review by its label: 1 goes to the real list and every
# other value goes to the fake list.
real_train = [X_train[i] for i in range(rangex) if Y_train[i] == 1]
fake_train = [X_train[i] for i in range(rangex) if Y_train[i] != 1]

In [44]:
def _tokenize_reviews(texts):
    """Tokenise every review in ``texts`` with ``preprocess_text``.

    Parameters
    ----------
    texts : list
        Raw reviews; each item is passed through ``str()`` before tokenising.

    Returns
    -------
    list of list of str
        One token list per input review, in the same order.
    """
    tokenized = []
    # Size the bar by the list actually being processed. The previous code
    # used data.shape[0] (the full checkpoint), so the bars never completed.
    pbar = tqdm_notebook(total=len(texts))
    for text in texts:
        tokenized.append(preprocess_text(str(text)))
        pbar.update(1)
    pbar.close()
    return tokenized


# NOTE: real_train / fake_train are rebound from raw text to token lists, so
# this cell must not be re-run without first re-running the split cell.
real_t = _tokenize_reviews(real_train)
real_train = real_t

fake_t = _tokenize_reviews(fake_train)
fake_train = fake_t

HBox(children=(IntProgress(value=0, max=608458), HTML(value='')))




HBox(children=(IntProgress(value=0, max=608458), HTML(value='')))




In [45]:
# Spot-check the tokenisation: one tokenised real review and one tokenised fake review.
print("Sample of token:",real_train[2], "||",fake_train[21])

Sample of token: ['never', 'coffee', 'enjoy', 'fact', 'think', 'hat', 'coffee', 'bean', 'format', 'drink', 'need', 'someone', 'would', 'need', 'medicine', 'latte', 'w', 'whole', 'milk', 'thimble', 'simple', 'syrup', 'comfort', 'honestly', 'nothing', 'ever', 'review', 'perfect', 'im', 'extremely', 'sensitive', 'service', 'spot', 'move', 'quickly', 'update', 'dilly', 'dally', 'man', 'state', 'around', 'drop', 'around', 'place'] || ['great', 'first', 'experience', 'excellent', 'service']


In [49]:
# Pool the tokenised training reviews (real first, then fake) so that one
# vocabulary covers both classes.
review_data = [*real_train, *fake_train]
#Creating dictionary of our data using Gensim and saving
dictionary = corpora.Dictionary(review_data)
# Saved relative to the notebook's working directory.
dictionary.save('dictionary.gensim')

#For saving corpus, use:
#pickle.dump(corpus, open('corpus.pkl', 'wb'))

In [51]:
# Configure the root logger to write records of level INFO and above to a
# file, one "<timestamp>:<level>:<message>" entry per record.
# The absolute path is machine-specific.
import logging
logging.basicConfig(filename='C:/Users/elonm/Desktop/gensim.log',format="%(asctime)s:%(levelname)s:%(message)s",level=logging.INFO)

In [52]:
NUM_TOPICS = 5

# NOTE(review): LdaModel is not given a random_state here, so the learned
# topics will differ between runs.

#Getting 5 topics from real_train
real_corpus = [dictionary.doc2bow(text) for text in real_train]
# The perplexity metric only takes effect when it is handed to the model via
# `callbacks`; it was previously constructed and then never used. It is
# evaluated on the whole corpus after every pass, which adds training time.
perplexity1 = PerplexityMetric(corpus=real_corpus, logger='shell')
real_model = gensim.models.ldamodel.LdaModel(real_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20, eval_every=1000, callbacks=[perplexity1])
real_topics = real_model.print_topics(num_words=7)
for topic in real_topics:
    print(topic)

#Getting 5 topics from fake_train
fake_corpus = [dictionary.doc2bow(text) for text in fake_train]
perplexity2 = PerplexityMetric(corpus=fake_corpus, logger='shell')
fake_model = gensim.models.ldamodel.LdaModel(fake_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20, eval_every=1000, callbacks=[perplexity2])
fake_topics = fake_model.print_topics(num_words=7)
for topic in fake_topics:
    print(topic)

(0, '0.016*"good" + 0.012*"chicken" + 0.011*"place" + 0.011*"get" + 0.011*"pork" + 0.010*"food" + 0.010*"like"')
(1, '0.025*"pizza" + 0.011*"place" + 0.011*"sandwich" + 0.011*"get" + 0.010*"good" + 0.009*"best" + 0.009*"like"')
(2, '0.010*"dish" + 0.009*"good" + 0.008*"cheese" + 0.008*"order" + 0.008*"salad" + 0.007*"delicious" + 0.007*"dessert"')
(3, '0.028*"food" + 0.027*"place" + 0.027*"great" + 0.022*"good" + 0.014*"go" + 0.012*"service" + 0.010*"love"')
(4, '0.014*"get" + 0.013*"order" + 0.012*"food" + 0.011*"go" + 0.011*"us" + 0.010*"wait" + 0.010*"time"')
(0, '0.095*"pizza" + 0.017*"pie" + 0.011*"crust" + 0.011*"slice" + 0.006*"thin" + 0.005*"pat" + 0.004*"cheesesteak"')
(1, '0.011*"order" + 0.011*"good" + 0.011*"chicken" + 0.010*"sauce" + 0.010*"cheese" + 0.009*"like" + 0.009*"taste"')
(2, '0.006*"die" + 0.005*"das" + 0.004*"da" + 0.004*"de" + 0.004*"der" + 0.004*"hat" + 0.003*"zu"')
(3, '0.014*"go" + 0.013*"food" + 0.012*"get" + 0.011*"order" + 0.011*"us" + 0.010*"time" + 0.00

In [53]:
#Saving all the acquired data
dictionary.save(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\dictionary.gensim')
pickle.dump(real_corpus, open(r'E:\Yelp\Unfiltered Data\WorkingSet\real_corpus.pkl', 'wb'))
pickle.dump(fake_corpus, open(r'E:\Yelp\Unfiltered Data\WorkingSet\fake_corpus.pkl', 'wb'))

MemoryError: 

## Visualizing the topics

In [54]:
# Build the pyLDAvis view of the real-review model from its training corpus
# and the shared dictionary.
# NOTE(review): sort_topics=False presumably keeps the topics in the model's
# own order -- confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(real_model, real_corpus, dictionary, sort_topics=False)
# Render the prepared visualisation as the cell output.
pyLDAvis.display(lda_display)

MemoryError: 

In [None]:
# Same visualisation for the fake-review model; note that this rebinds
# lda_display, replacing the real-model view held in that name.
lda_display = pyLDAvis.gensim.prepare(fake_model, fake_corpus, dictionary, sort_topics=False)
# Render the prepared visualisation as the cell output.
pyLDAvis.display(lda_display)

In [None]:
#Fitting a document
new_doc = 'absolutely hated the place it suck bad waiter coffee smell bad'
new_doc = preprocess_text(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(real_model.get_document_topics(new_doc_bow))
print(fake_model.get_document_topics(new_doc_bow))
print(real_model.get_document_topics(new_doc_bow)[3][1])
max(real_model.get_document_topics(new_doc_bow))

## Now we create a function to evaluate the 400 test reviews.
### We join the test sets in order: real_test, fake_test.
### We create y_actual with the first 200 elements as 1 and the next 200 as -1.
### We pass each individual text through all 10 topics (5 real, then 5 fake).
### We find the best fit and estimate from the first 5 topics (real model) and then the best fit and estimate from the next 5 topics (fake model).
### We compare the two fits and assign the prediction to our text accordingly.
### We append the prediction (1 or -1) to the y_pred list.

In [None]:
# Inspect the held-out data. Only the last expression (Y_test) is displayed;
# the bare X_test line produces no output.
X_test
Y_test

In [119]:
#Creating the test set
test_set = []

pbar = tqdm_notebook(total=X_test.shape[0])
for text in X_test:
    tokens = preprocess_text(str(text))
    test_set.append(tokens)
    pbar.update(1)
pbar.close()

y_actual = []
for i in Y_test:
    if i == 1:
        y_actual.append(1)
    else:
        y_actual.append(-1)
        
print("Size of test_set:", len(test_set[1]))
print("y_actual:", y_actual)

Size of test_set: 200
y_actual: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

In [120]:
#Making predictions based on topics
y_pred = []

for test_text in test_set:
    test_doc_bow = dictionary.doc2bow(test_text)
    real_scores = []
    fake_scores = []
    for i in real_model.get_document_topics(test_doc_bow):
        #real_model.get_document_topics(new_doc_bow)[0][1] -> Fit value of doc on topic 0
        real_scores.append(i[1])
    for i in fake_model.get_document_topics(test_doc_bow):
        fake_scores.append(i[1])
        
    #Best Fit on real model
    real_fit = max(real_scores)
    #Best Fit on fake model
    fake_fit = max(fake_scores)
    
    if real_fit >= fake_fit:
        y_pred.append(1)
    else:
        y_pred.append(-1)    

In [121]:
#Evaluating
real_correct = 0
fake_correct = 0

for i in Y_test.shape[0]:
    if y_pred[i] == y_actual[i]:
        if i<200:
            real_correct = real_correct + 1
        else:
            fake_correct = fake_correct + 1
        
acc = ((real_correct+fake_correct)/400)*100
print("Overall Accuracy:", acc,"%")
print("Real prediction accuracy:", ((real_correct/200)*100))
print("Fake prediction accuracy:", ((fake_correct/200)*100))

Overall Accuracy: 52.25 %
Real prediction accuracy: 43.0
Fake prediction accuracy: 61.5
