In [1]:
import nltk
from sklearn.naive_bayes import MultinomialNB
import xlrd
import string
from nltk.corpus import stopwords
from random import shuffle
import csv
from gensim import corpora

import numpy as np
from scipy.spatial import distance

# nltk.download('stopwords')  

In [2]:
from gensim import corpora
import pickle
import gensim
import copy

## Read-in list of goodreads data

The dataset contains 1,000 books taken from the Goodreads data, organized into a dictionary. The same model can be applied to the whole dataset (or to any other dataset). Here we use the tags and the description of each book.

In [3]:
# Load the pickled sample of books; every later cell works from this dictionary.
with open("thousand_dict.pickle", "rb") as books_file:
    Goodreads_dic = pickle.load(books_file)

Next we train a topic model over all the books. The log perplexity is printed so that models can be compared. Ideally one would cross-validate over different hyper-parameter combinations to select the model; here I just use some values that seem to work.

In [None]:
def book_topic_gen(book_dic, NUM_TOPICS = 50, NUM_PASSES = 15, seed = 123):
    """Train an LDA topic model on the books, pickle it to disk and return it.

    Parameters
    ----------
    book_dic : dict
        Mapping whose values are the documents to model. Each value is fed to
        ``corpora.Dictionary`` and ``doc2bow``, so it is presumably a list of
        string tokens -- TODO confirm against the code that builds the pickle.
    NUM_TOPICS : int
        Number of topics to fit; also used in the output directory and file name.
    NUM_PASSES : int
        Number of training passes handed to ``LdaModel``.
    seed : int
        Passed to ``LdaModel`` as ``random_state``.

    Returns
    -------
    The trained ``LdaModel``.

    Side effects
    ------------
    Writes the model to ``GoodReads/<NUM_TOPICS>_Topics/GoodReads_model_<NUM_TOPICS>.pkl``
    (creating the directory if needed) and prints the log perplexity measured on
    the training corpus itself.
    """
    import os  # local import so this cell does not depend on editing the import cell

    documents = list(book_dic.values())
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes = NUM_PASSES, random_state = seed)

    # Create the output directory first: without it the open() below fails and
    # the freshly trained model is lost.
    out_dir = 'GoodReads/' + str(NUM_TOPICS) + '_Topics'
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + '/GoodReads_model_' + str(NUM_TOPICS) + '.pkl', 'wb') as f:
        pickle.dump(ldamodel, f, pickle.HIGHEST_PROTOCOL)
    print(ldamodel.log_perplexity(corpus))
    return ldamodel
    

In [None]:
## Hyper-parameter for training and for display
NUM_TOPICS = 50   # number of LDA topics; also part of the output paths
NUM_PASSES = 15   # training passes over the corpus
NUM_WORDS = 10    # top words per topic requested when displaying topics
seed = 123        # random_state for the LDA model

# Train on the books loaded above. The loaded dictionary is `Goodreads_dic`;
# `book_dic` is only the parameter name inside book_topic_gen and does not
# exist at notebook scope.
ldamodel = book_topic_gen(Goodreads_dic, NUM_TOPICS, NUM_PASSES, seed)

## Checking models and getting embeddings

In [138]:
# Reload the pickled LDA model for the current NUM_TOPICS from disk.
model_path = 'GoodReads/' + str(NUM_TOPICS) + '_Topics/GoodReads_model_' + str(NUM_TOPICS) + '.pkl'
with open(model_path, 'rb') as model_file:
    ldamodel = pickle.load(model_file)

Similar to what we did with the Twitter data, here we grab and display the topics from the books.

In [139]:
def get_topics(model):
    """Return the top words of each topic as ``{topic_index: {word: weight}}``.

    Uses the module-level ``NUM_TOPICS`` and ``NUM_WORDS``. The weights are
    parsed back out of the formatted strings returned by ``model.print_topics``
    (pieces shaped like ``0.081*"books"`` joined by ``+``), so they carry only
    the precision of that text.
    """
    formatted = model.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS)

    result = {}
    for topic_idx in range(NUM_TOPICS):
        word_weights = {}
        # Element 1 of each entry is the formatted "weight*word + ..." string.
        for term in formatted[topic_idx][1].split("+"):
            pieces = term.split("*")
            # The word is the text between the first pair of double quotes.
            word_weights[pieces[1].split("\"")[1]] = float(pieces[0])
        result[topic_idx] = word_weights

    return result

def print_topics(topic_dic):
    """Print each topic (numbered from 1) followed by its words and weights.

    ``topic_dic`` has the shape produced by ``get_topics``:
    ``{topic_index: {word: weight}}``.
    """
    for topic_idx, word_weights in topic_dic.items():
        print("Topic " + str(topic_idx + 1) + ":")
        for word, weight in word_weights.items():
            print("".join(['  ', word, ' (', str(weight), ')']))

Next, generate an embedding for each topic.

In [None]:
# Load the pickled GloVe lookup table (indexed by word in create_embeddings).
with open("../glove_dict.pkl", "rb") as glove_file:
    glove_dic = pickle.load(glove_file)

# Keys view of the table, used for the membership test in create_embeddings.
vocab = glove_dic.keys()

def create_embeddings(topic_dic):
    """Embed each topic as the weight-averaged GloVe vector of its words.

    Reads the module-level ``glove_dic`` (word -> vector) and ``vocab``.

    Parameters
    ----------
    topic_dic : dict
        ``{topic_key: {word: weight}}``, as returned by ``get_topics``.

    Returns
    -------
    dict
        ``{topic_key: numpy array}``. A topic whose in-vocabulary weights sum to
        zero keeps an all-zero vector and is reported on stdout.
    """
    # Take the vector shape from the loaded GloVe table instead of assuming
    # 100-d vectors; fall back to 100 only when the table is empty.
    sample_vec = next(iter(glove_dic.values()), None)
    vec_shape = (100,) if sample_vec is None else np.shape(sample_vec)

    vec_dic = {}
    for key, topic in topic_dic.items():
        acc = 0
        acc_vec = np.zeros(vec_shape)
        for key_word, weight in topic.items():
            if key_word in vocab:
                acc_vec += glove_dic[key_word] * weight
                acc += weight
        if acc == 0:
            # Also reached when every matched word has weight 0.0, not only
            # when no word is in the vocabulary.
            print(key)
            print("No word found in the vocabulary :(")
        else:
            acc_vec /= acc
        vec_dic[key] = acc_vec
    return vec_dic

In [141]:
# Extract the top words and their weights for every topic of the loaded LDA model.
GoodReads_topics_dic = get_topics(ldamodel)

In [142]:
# Display each topic with its words and weights.
print_topics(GoodReads_topics_dic)

Topic 1:
  books (0.081)
  children (0.051)
  childhood (0.039)
  read (0.032)
  childrens (0.028)
  fiction (0.026)
  kids (0.018)
  lit (0.017)
  favorites (0.017)
  kid (0.016)
  literature (0.014)
  book (0.013)
  school (0.013)
  library (0.012)
  young (0.012)
Topic 2:
  read (0.044)
  manga (0.043)
  graphic (0.024)
  novels (0.021)
  comics (0.018)
  books (0.018)
  series (0.012)
  owned (0.008)
  adult (0.007)
  mystery (0.007)
  comic (0.007)
  fiction (0.007)
  black (0.007)
  favorites (0.006)
  stars (0.006)
Topic 3:
  read (0.0)
  books (0.0)
  favorites (0.0)
  fiction (0.0)
  adult (0.0)
  memoir (0.0)
  biography (0.0)
  owned (0.0)
  library (0.0)
  reading (0.0)
  book (0.0)
  school (0.0)
  audio (0.0)
  fantasy (0.0)
  favorite (0.0)
Topic 4:
  food (0.051)
  cooking (0.025)
  naruto (0.02)
  books (0.015)
  cookbooks (0.011)
  new (0.009)
  drink (0.009)
  recipes (0.008)
  indian (0.007)
  booker (0.007)
  foodie (0.007)
  read (0.006)
  chapters (0.006)
  joy (

In [None]:
# Embed every topic, then pickle the embeddings next to the model file.
GoodReads_topics_vec = create_embeddings(GoodReads_topics_dic)
topics_vec_path = 'GoodReads/' + str(NUM_TOPICS) + '_Topics/GoodReads_topics_' + str(NUM_TOPICS) + '.pkl'
with open(topics_vec_path, 'wb') as topics_vec_file:
    pickle.dump(GoodReads_topics_vec, topics_vec_file, protocol=pickle.HIGHEST_PROTOCOL)

## Save each book's weight on each topic

This creates a dictionary that records each book's weight on each topic.

In [None]:
# `corpus` only exists as a local inside book_topic_gen, so rebuild the
# bag-of-words corpus here. It uses the dictionary stored on the model
# (`id2word`, the one passed in at training time) so the word ids match.
ids = list(Goodreads_dic.keys())
corpus = [ldamodel.id2word.doc2bow(text) for text in Goodreads_dic.values()]

# Book id -> that book's topic weights as returned by the model, e.g. a list of
# (topic_id, weight) pairs. Keys and values of a dict iterate in the same
# order, so zip pairs each id with its own document.
weight_dic = {}
for book_id, topic_weights in zip(ids, ldamodel[corpus]):
    weight_dic[book_id] = topic_weights

In [145]:
# Persist the per-book topic weights.
with open('book_weights_sample.pkl', 'wb') as weights_file:
    pickle.dump(weight_dic, weights_file, protocol=pickle.HIGHEST_PROTOCOL)

In [148]:
# Spot-check: show the stored topic weights for the entry keyed 3.
weight_dic[3]

[(14, 0.63524973), (16, 0.042723272), (30, 0.3104762)]

### Calculate the distance between tweets and book topics

Store a dictionary that contains, for each Twitter topic, its cosine similarity (1 - cosine distance) to each of the book topics.

In [149]:
def topic_tweet_distance(tweet_dic):
    """Score every tweet topic against every book topic.

    Despite the name, the stored value is ``1 - cosine distance``, i.e. the
    cosine similarity, so larger means closer. Reads the module-level
    ``NUM_TOPICS`` and ``GoodReads_topics_vec``.

    Parameters
    ----------
    tweet_dic : dict
        Maps a tweet-topic key to its embedding vector (presumably of the same
        dimension as the book-topic embeddings -- confirm against the notebook
        that produced the pickle).

    Returns
    -------
    dict
        ``{key: numpy array of length NUM_TOPICS}``; entry ``i`` is the
        similarity to book topic ``i``. An entry stays 0 when either vector is
        all zeros.
    """
    dist_dic = {}
    for key, vec in tweet_dic.items():
        topic_distance = np.zeros(NUM_TOPICS)
        # A zero tweet vector has no direction: the cosine would divide by
        # zero and give NaN, so leave its scores at 0 (same rule as for
        # zero book-topic vectors below).
        if np.any(vec):
            for i in range(NUM_TOPICS):
                goodreads_vec = GoodReads_topics_vec[i]
                if np.any(goodreads_vec):
                    topic_distance[i] = 1 - distance.cosine(vec, goodreads_vec)
        dist_dic[key] = topic_distance
    return dist_dic

In [150]:
## Load tweet topics
# Load the pickled tweet-topic vectors for one Twitter handle.
handle = "twitter_handle1"
tweet_vec_path = handle + '_vec_' + str(5) + '.pkl'
with open(tweet_vec_path, 'rb') as tweet_vec_file:
    H1_tweet_dic = pickle.load(tweet_vec_file)

In [151]:
# Score this handle's tweet topics against the book topics and pickle the result.
H1_tweet_dist = topic_tweet_distance(H1_tweet_dic)
tweet_dist_path = 'GoodReads/' + str(NUM_TOPICS) + '_Topics/' + handle + '_topic_dist_' + str(NUM_TOPICS) + '.pkl'
with open(tweet_dist_path, 'wb') as tweet_dist_file:
    pickle.dump(H1_tweet_dist, tweet_dist_file, protocol=pickle.HIGHEST_PROTOCOL)