## Text Classification Model using Latent Dirichlet Allocation

In [34]:
import pandas as pd
import numpy as np
import nltk
en_stop = set(nltk.corpus.stopwords.words('english'))
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
import gensim
from gensim import corpora
import pickle
import random
import pyLDAvis.gensim
from gensim.models.callbacks import PerplexityMetric
from visdom import Visdom

In [35]:
# Load the cleaned review checkpoint and show its (rows, columns) shape
checkpoint_path = r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint"
data = pd.read_csv(checkpoint_path)
data.shape

(608458, 7)

### The text data has already been cleaned. Stop words still need to be removed, though, because we are not using TF-IDF for normalization at this point.

In [36]:
def preprocess_text(text, stop_words=None):
    """Split ``text`` on whitespace and drop stop words.

    Parameters
    ----------
    text : str
        Review text to tokenise.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, the notebook-level ``en_stop`` set
        is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        # Looked up at call time so the default follows the notebook's en_stop.
        stop_words = en_stop
    return [token for token in text.split() if token not in stop_words]

## Split the data by label and develop a separate topic model for each set.

In [37]:
# Partition the frame on the 'real' label column (1 -> real, -1 -> fake)
labels = data['real']
real = data.loc[labels == 1]
fake = data.loc[labels == -1]
print("Real:", real.shape, "Fake:", fake.shape)

Real: (528019, 7) Fake: (80439, 7)


In [38]:
#Converting all our text to a list of preprocessed tokens
# The progress bar wraps the column, so it advances once per review
pbar = tqdm_notebook(data['review'], total=data.shape[0])
review_data = [preprocess_text(str(text)) for text in pbar]
pbar.close()

HBox(children=(IntProgress(value=0, max=608458), HTML(value='')))




In [39]:
#Separating real and fake reviews in different lists
real = []
fake = []

# Pair each token list with its label by position. review_data was built by
# iterating the frame in order, so a positional zip always lines up, whereas
# data['real'][i] is a label lookup that only matches when the index is the
# default 0..n-1 range (and is much slower for per-row access).
for tokens, label in zip(review_data, data['real']):
    if label == 1:
        real.append(tokens)
    else:
        fake.append(tokens)

print(len(real), len(fake))

528019 80439


In [40]:
#Randomly creating a test and train set on both lists
# NOTE: the shuffles are unseeded, so the split differs between runs.
random.shuffle(real)
random.shuffle(fake)

# Number of reviews held out per class; the slices below are driven by it
test_size = 200

real_test = real[:test_size]
real_train = real[test_size:]
fake_test = fake[:test_size]
fake_train = fake[test_size:]

In [41]:
# Show one tokenised review from each class as a quick sanity check
real_sample, fake_sample = real_train[2], fake_test[21]
print("Sample of token:", real_sample, "||", fake_sample)

Sample of token: ['go', 'birthday', 'group', '13', 'us', 'take', 'reservation', 'pre', 'fix', 'menu', 'meat', 'cheese', 'plate', 'bread', 'salad', 'choice', 'different', '3', 'lasagna', 'dessert', 'drink', 'food', 'come', '60', 'person', 'affordable', 'portion', 'size', 'huge', 'food', 'deliciousand', 'ambiance', 'great', 'awesome', 'patio', 'garden', 'area', 'romantic', 'accommodate', 'big', 'group', 'definitely', 'would', 'suggest', 'place', 'especially', 'date'] || ['eat', 'alot', 'excellent', 'pizza', 'great', 'salad', 'great', 'atmospher', 'good', 'drink']


In [42]:
#Creating dictionary of our data using Gensim and saving
# The dictionary is built from review_data, i.e. every tokenised review
# (both labels, including the reviews later held out for testing), so the
# real and fake models share a single token-id mapping.
dictionary = corpora.Dictionary(review_data)
# Written to the current working directory
dictionary.save('dictionary.gensim')

#For saving corpus, use:
#pickle.dump(corpus, open('corpus.pkl', 'wb'))

In [43]:
# Log the perplexity score at the end of each epoch.
#perplexity_logger = PerplexityMetric(corpus=common_corpus, logger='shell')

In [None]:
NUM_TOPICS = 5


def train_topic_model(token_docs, num_topics=NUM_TOPICS, passes=15):
    """Fit an LDA model on a list of tokenised documents.

    Parameters
    ----------
    token_docs : list of list of str
        Tokenised documents to train on.
    num_topics : int
        Number of topics to extract.
    passes : int
        Number of training passes over the corpus.

    Returns
    -------
    tuple
        ``(corpus, model, perplexity)``: the bag-of-words corpus built with
        the notebook-level ``dictionary``, the trained ``LdaModel`` and the
        ``PerplexityMetric`` callback attached to it.
    """
    corpus = [dictionary.doc2bow(text) for text in token_docs]
    # Logs the perplexity on the training corpus to the shell during training
    perplexity = PerplexityMetric(corpus=corpus, logger='shell')
    model = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        callbacks=[perplexity],
    )
    return corpus, model, perplexity


#Getting 5 topics from real_train
real_corpus, real_model, perplexity1 = train_topic_model(real_train)
real_topics = real_model.print_topics(num_words=7)
for topic in real_topics:
    print(topic)

#Getting 5 topics from fake_train
fake_corpus, fake_model, perplexity2 = train_topic_model(fake_train)
fake_topics = fake_model.print_topics(num_words=7)
for topic in fake_topics:
    print(topic)

In [None]:
#Saving all the acquired data
dictionary.save(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\dictionary.gensim')

# Context managers make sure each pickle file is flushed and closed, instead
# of leaving the handle from a bare open(...) to be closed by garbage collection.
with open(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\real_corpus.pkl', 'wb') as real_corpus_file:
    pickle.dump(real_corpus, real_corpus_file)
with open(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\fake_corpus.pkl', 'wb') as fake_corpus_file:
    pickle.dump(fake_corpus, fake_corpus_file)

## Visualizing the topics

In [None]:
# Prepare the pyLDAvis data for the model trained on real reviews and render it.
# NOTE(review): sort_topics=False presumably keeps the model's own topic
# order in the visualisation — confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(real_model, real_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
# Same visualisation for the model trained on fake reviews.
# This rebinds lda_display, replacing the value from the real-model cell.
lda_display = pyLDAvis.gensim.prepare(fake_model, fake_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [17]:
#Fitting a document
# Run one hand-written review through the same preprocessing and dictionary,
# then print its topic distribution under each of the two models.
new_doc = 'absolutely hated the place it suck bad waiter coffee smell bad'
# new_doc is rebound here from the raw string to its list of tokens
new_doc = preprocess_text(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(real_model.get_document_topics(new_doc_bow))
print(fake_model.get_document_topics(new_doc_bow))
# [0][1] takes the probability from the first (topic_id, probability) pair.
# NOTE(review): gensim may leave low-probability topics out of this list, so
# position 0 is presumably not guaranteed to be topic 0 — confirm.
print(real_model.get_document_topics(new_doc_bow)[0][1])

[(1, 2), (95, 1), (156, 1), (497, 1), (710, 1), (879, 1), (1028, 1)]
[(0, 0.02237967), (1, 0.022913761), (2, 0.22595412), (3, 0.37218377), (4, 0.35656866)]
[(0, 0.20541026), (1, 0.22215351), (2, 0.5270318), (3, 0.02258687), (4, 0.022817561)]
0.022379676


## Now we create a function to evaluate the 400 test documents.
### We join the test sets in order: real_test, then fake_test.
### We create y_actual with the first 200 elements as 1 and the next 200 as -1.
### We pass each individual text through all 10 topics (5 real, then 5 fake).
### We find the best fit and estimate from the first 5 topics (real model) and then the best fit and estimate from the next 5 topics (fake model).
### We compare the two fits and assign the prediction to our text accordingly.
### We append the prediction (1 or -1) to the y_pred list.

In [None]:
#Creating the test set
# Concatenate so test_set holds one token list per review (real reviews first,
# then fake). Appending each list whole would nest them and leave only 2 elements.
test_set = real_test + fake_test

# Ground-truth labels in the same order: 1 for real, -1 for fake
y_actual = [1] * len(real_test) + [-1] * len(fake_test)

print("Size of test_set:", len(test_set))
print("y_actual:", y_actual)

In [None]:
#Making predictions based on topics
# test_set is expected to hold one token list per review.
y_pred = []

for test_text in test_set:
    test_doc_bow = dictionary.doc2bow(test_text)

    # Score the review being tested (test_doc_bow), once per model.
    # get_document_topics returns (topic_id, probability) pairs and may leave
    # out low-probability topics, so take the probabilities from whatever is
    # returned rather than indexing positions 0..4.
    real_scores = [prob for _, prob in real_model.get_document_topics(test_doc_bow)]
    fake_scores = [prob for _, prob in fake_model.get_document_topics(test_doc_bow)]

    #Best Fit on real model
    real_fit = max(real_scores)
    #Best Fit on fake model
    fake_fit = max(fake_scores)

    # Ties go to the "real" label
    if real_fit >= fake_fit:
        y_pred.append(1)
    else:
        y_pred.append(-1)

In [None]:
#Evaluating
# Positions 0-199 hold the real reviews and 200-399 the fake ones,
# so correct predictions are counted separately for each half.
real_correct = sum(1 for i in range(200) if y_pred[i] == y_actual[i])
fake_correct = sum(1 for i in range(200, 400) if y_pred[i] == y_actual[i])

total_correct = real_correct + fake_correct
acc = (total_correct / 400) * 100
print("Overall Accuracy:", acc, "%")
print("Real prediction accuracy:", (real_correct / 200) * 100)
print("Fake prediction accuracy:", (fake_correct / 200) * 100)