## Text Classification Model using Latent Dirichlet Allocation

In [1]:
import pandas as pd
import numpy as np
import nltk
en_stop = set(nltk.corpus.stopwords.words('english'))
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
import gensim
from gensim import corpora
import pickle
import random
import pyLDAvis.gensim
from gensim.models.callbacks import PerplexityMetric



In [2]:
data = pd.read_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint")
data.shape

(608458, 7)

### The text data has already been cleaned. The stop words need to be removed though as we are not using TFIDF at this point for normalization.

In [3]:
def preprocess_text(text):
    #Getting tokens from text
    tokens = []
    text_split = text.split()
    for token in text_split:
        tokens.append(token)
    #Removing stop words from the list of tokens
    tokens = [token for token in tokens if token not in en_stop]
    return tokens

## Create data split of both labels and develop a separate topic model for each set.

In [4]:
real = data[data['real'] == 1]
fake = data[data['real'] == -1]
print("Real:",real.shape,"Fake:",fake.shape)

Real: (528019, 7) Fake: (80439, 7)


In [5]:
#Converting all our text to a list of preprocessed tokens
review_data = []
pbar = tqdm_notebook(total=data.shape[0])
for text in data['review']:
    tokens = preprocess_text(str(text))
    review_data.append(tokens)
    pbar.update(1)
pbar.close()

HBox(children=(IntProgress(value=0, max=608458), HTML(value='')))




In [6]:
#Separating real and fake reviews in different lists
real = []
fake = []

for i in range(len(review_data)):
    if data['real'][i] == 1:
        real.append(review_data[i])
    else:
        fake.append(review_data[i])

print(len(real), len(fake))

528019 80439


In [7]:
#Randomly creating a test and train set on both lists
random.shuffle(real)
random.shuffle(fake)

test_size = 200

real_train = real[200:]
real_test = real[:200]
fake_train = fake[200:]
fake_test = fake[:200]

In [8]:
print("Sample of token:",real_train[2], "||",fake_test[21])

Sample of token: ['come', 'cold', 'day', 'run', 'order', 'way', 'much', 'food', 'love', 'every', 'bite', 'running', 'partner', 'get', 'fennel', 'sausage', 'pepper', 'meat', 'ball', 'sub', 'popcorn', 'tomato', 'roast', 'red', 'pepper', 'soup', '2', 'beer', 'tap', 'expect', 'fresh', 'bread', 'delicious', 'meat', 'sandwich', 'triumph', 'soup', 'soothing', 'creamy', 'beer', 'right', 'refresh', 'lovely', 'surprise', 'expect', 'jimmy', 'john', 'much'] || ['waiter', 'rude', 'food', 'average', 'go', 'much', 'good', 'italian', 'restaurant', 'nyc', 'price', 'point', 'eat', 'restaurant', 'last', 'saturday', 'night', '2', 'friend', 'waiter', 'rude', 'us', 'throughout', 'evening', 'nonhumorous', 'putdowns', 'actually', 'add', 'automatic', '18', 'tip', 'bill', 'food', 'average', 'preset', 'menu', 'steak', 'one', 'entree', 'option', 'steak', 'horrible', 'quality', 'probably', 'one', 'bad', 'cut', 'meat', 'possible', 'dish', 'ok', 'must', 'eat', 'grab', 'gnocchi', 'chicken', 'entree', 'preset', 'menu'

In [9]:
#Creating dictionary of our data using Gensim and saving 
dictionary = corpora.Dictionary(review_data)
dictionary.save('dictionary.gensim')

#For saving corpus, use:
#pickle.dump(corpus, open('corpus.pkl', 'wb'))

In [10]:
import logging
logging.basicConfig(filename='C:/Users/elonm/Desktop/gensim.log',format="%(asctime)s:%(levelname)s:%(message)s",level=logging.INFO)

In [116]:
NUM_TOPICS = 5

#Getting 5 topics from real_train
real_corpus = [dictionary.doc2bow(text) for text in real_train]
perplexity1 = PerplexityMetric(corpus=real_corpus, logger='shell')
real_model = gensim.models.ldamodel.LdaModel(real_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20, eval_every=1000)
real_topics = real_model.print_topics(num_words=7)
for topic in real_topics:
    print(topic)

#Getting 5 topics from real_test
fake_corpus = [dictionary.doc2bow(text) for text in fake_train]
perplexity2 = PerplexityMetric(corpus=fake_corpus, logger='shell')
fake_model = gensim.models.ldamodel.LdaModel(fake_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20, eval_every=1000)
fake_topics = fake_model.print_topics(num_words=7)
for topic in fake_topics:
    print(topic)

(0, '0.017*"good" + 0.013*"place" + 0.013*"get" + 0.011*"chicken" + 0.011*"food" + 0.010*"sandwich" + 0.009*"like"')
(1, '0.015*"get" + 0.013*"order" + 0.012*"go" + 0.011*"wait" + 0.010*"time" + 0.010*"come" + 0.010*"food"')
(2, '0.011*"dish" + 0.010*"good" + 0.008*"order" + 0.008*"cheese" + 0.007*"delicious" + 0.007*"salad" + 0.007*"dessert"')
(3, '0.044*"pizza" + 0.016*"ramen" + 0.012*"good" + 0.012*"sushi" + 0.011*"best" + 0.010*"place" + 0.010*"slice"')
(4, '0.025*"food" + 0.024*"great" + 0.024*"place" + 0.018*"good" + 0.012*"go" + 0.012*"service" + 0.009*"love"')
(0, '0.118*"pizza" + 0.015*"pie" + 0.015*"slice" + 0.012*"crust" + 0.006*"pat" + 0.005*"cheesesteak" + 0.005*"thin"')
(1, '0.012*"chicken" + 0.010*"good" + 0.009*"order" + 0.009*"sauce" + 0.009*"dish" + 0.008*"cheese" + 0.008*"salad"')
(2, '0.014*"indian" + 0.009*"falafel" + 0.003*"de" + 0.003*"da" + 0.003*"kebab" + 0.003*"masala" + 0.002*"e"')
(3, '0.013*"go" + 0.012*"order" + 0.012*"food" + 0.012*"us" + 0.012*"get" + 0.

In [117]:
#Saving all the acquired data
dictionary.save(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\dictionary.gensim')
pickle.dump(real_corpus, open(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\real_corpus.pkl', 'wb'))
pickle.dump(fake_corpus, open(r'E:\Yelp\Unfiltered Data\YelpZip\models and stuff\fake_corpus.pkl', 'wb'))

## Visualizing the topics

In [15]:
lda_display = pyLDAvis.gensim.prepare(real_model, real_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
lda_display = pyLDAvis.gensim.prepare(fake_model, fake_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [118]:
#Fitting a document
new_doc = 'absolutely hated the place it suck bad waiter coffee smell bad'
new_doc = preprocess_text(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(real_model.get_document_topics(new_doc_bow))
print(fake_model.get_document_topics(new_doc_bow))
print(real_model.get_document_topics(new_doc_bow)[3][1])
max(real_model.get_document_topics(new_doc_bow))

[(1, 2), (95, 1), (156, 1), (497, 1), (710, 1), (879, 1), (1028, 1), (19272, 1)]
[(0, 0.020264352), (1, 0.62635213), (2, 0.020229438), (3, 0.02017206), (4, 0.31298202)]
[(0, 0.020768423), (1, 0.020313932), (2, 0.020985784), (3, 0.9170431), (4, 0.020888783)]
0.020172063


(4, 0.31300524)

## Now we create a function to evaluate the 400 test sets.
### We joing the test sets in order: real_test, fake_test
### We create y_actual with first 200 elements as 1 and next 200 as -1
### We pass each individual text through all the 10 topics(5 real then 5 fake).
### We find the best fit and estimate from the first 5 models(real) and then the best fit and estimate from the next 5 models(fake).
### We compare the fits and accordingly assign the prediction to our text. 
### Append the prediction(1,-1) to y_pred list.

In [119]:
#Creating the test set
test_set = []
test_set.append(real_test)
test_set.append(fake_test)

y_actual = []
for i in range(400):
    if i<200:
        y_actual.append(1)
    else:
        y_actual.append(-1)
        
print("Size of test_set:", len(test_set[1]))
print("y_actual:", y_actual)

Size of test_set: 200
y_actual: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

In [120]:
#Making predictions based on topics
y_pred = []

for test_text in test_set[0]:
    test_doc_bow = dictionary.doc2bow(test_text)
    real_scores = []
    fake_scores = []
    for i in real_model.get_document_topics(test_doc_bow):
        #real_model.get_document_topics(new_doc_bow)[0][1] -> Fit value of doc on topic 0
        real_scores.append(i[1])
    for i in fake_model.get_document_topics(test_doc_bow):
        fake_scores.append(i[1])
        
    #Best Fit on real model
    real_fit = max(real_scores)
    #Best Fit on fake model
    fake_fit = max(fake_scores)
    
    if real_fit >= fake_fit:
        y_pred.append(1)
    else:
        y_pred.append(-1)    
        
for test_text in test_set[1]:
    test_doc_bow = dictionary.doc2bow(test_text)
    real_scores = []
    fake_scores = []
    for i in real_model.get_document_topics(test_doc_bow):
        #real_model.get_document_topics(new_doc_bow)[0][1] -> Fit value of doc on topic 0
        real_scores.append(i[1])
    for i in fake_model.get_document_topics(test_doc_bow):
        fake_scores.append(i[1])
    
    #Best Fit on real model
    real_fit = max(real_scores)
    #Best Fit on fake model
    fake_fit = max(fake_scores)
    
    if real_fit >= fake_fit:
        y_pred.append(1)
    else:
        y_pred.append(-1)   

In [121]:
#Evaluating
real_correct = 0
fake_correct = 0

for i in range(400):
    if y_pred[i] == y_actual[i]:
        if i<200:
            real_correct = real_correct + 1
        else:
            fake_correct = fake_correct + 1
        
acc = ((real_correct+fake_correct)/400)*100
print("Overall Accuracy:", acc,"%")
print("Real prediction accuracy:", ((real_correct/200)*100))
print("Fake prediction accuracy:", ((fake_correct/200)*100))

Overall Accuracy: 52.25 %
Real prediction accuracy: 43.0
Fake prediction accuracy: 61.5
