In [8]:
!pip install gensim==3.8.3

Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp36-cp36m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 274 kB/s eta 0:00:01
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.0
    Uninstalling gensim-3.8.0:
      Successfully uninstalled gensim-3.8.0
Successfully installed gensim-3.8.3


In [9]:
import nltk
import re
import gensim
from gensim import corpora
import operator
import pandas as pd

nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import random

# Seed both RNGs used for reproducibility. random.seed only affects Python's
# stdlib generator; NumPy keeps its own global state, which numeric libraries
# fall back on when no explicit random state is supplied.
random.seed(1)
np.random.seed(1)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/JackMac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Read the dream corpus from a CSV in the current working directory.
# NOTE(review): the ".1" suffix looks like a duplicate-download artifact —
# confirm this is the intended file.
full_dream = pd.read_csv("dreams.csv.1")
# Keep only the 'content' column (the dream text) for the topic-modelling steps.
dream_content = full_dream['content']
# Last expression: display the Series as the cell output.
dream_content

0        In my dream, the first thing I remember was th...
1        The next thing I know, I'm in this big, fairly...
2        Last night's dreams are a little fuzzy in my m...
3        I woke up this morning knowing I had a dream i...
4        I remember being in Africa- a quiet wooded sce...
                               ...                        
26338    Merrilee was joining the United Nations ______...
26339    Apparently I was coming home from Jackson and ...
26340    I've been having some really weird dreams late...
26341    I was in some kind of grassy, open space. I fe...
26342    I had a lot of catching up to do, and I cannot...
Name: content, Length: 26343, dtype: object

In [15]:
def read_stopwords(filename):
    """Load stopwords from a text file that holds one entry per line.

    Args:
        filename: Path of the file to read.

    Returns:
        dict mapping each line (with trailing whitespace stripped) to 1, so
        the result can be used for fast membership tests.
    """
    with open(filename) as handle:
        # A dict comprehension collapses duplicate lines onto a single key.
        return {entry.rstrip(): 1 for entry in handle}

In [32]:
# Start from NLTK's English stopword list, keyed in a dict to de-duplicate
# while preserving order.
stop_words = dict.fromkeys(stopwords.words('english'), 1)
# Add corpus-specific noise tokens: the possessive clitic produced by the
# tokenizer and the word "dream" itself.
stop_words.update({"'s": 1, "dream": 1})
# Downstream code receives the stopwords as a plain list.
stop_words = list(stop_words)

In [33]:
def filter(word, stopwords):
    """Decide whether a token should be kept.

    NOTE: this name shadows Python's builtin ``filter`` within the notebook.

    Args:
        word: The token to test.
        stopwords: Container of tokens to reject (anything supporting ``in``).

    Returns:
        True if ``word`` is not a stopword and contains at least one ASCII
        letter; False otherwise.
    """
    # Short-circuit: stopwords are rejected before the regex is evaluated.
    return word not in stopwords and re.search("[A-Za-z]", word) is not None

In [34]:
def dream_topic_preparation(dream_series, stopwords):
    """Tokenize and filter a collection of dream texts.

    Each text is lower-cased, split with ``nltk.word_tokenize``, and reduced
    to the tokens for which ``filter`` returns True.

    Args:
        dream_series: Iterable of strings, one per dream.
        stopwords: Iterable of tokens to discard (list, set, or dict keys).

    Returns:
        list of token lists, one per input text and in the same order.
    """
    # Build a hash-based lookup once: the caller passes a list, and testing
    # every token of every dream against a list is a linear scan each time.
    stopword_lookup = set(stopwords)

    docs = []
    for dream in dream_series:
        tokens = nltk.word_tokenize(dream.lower())
        docs.append([tok for tok in tokens if filter(tok, stopword_lookup)])

    return docs

In [35]:
# Tokenize every dream and strip stopwords / non-alphabetic tokens.
tokenized_dreams = dream_topic_preparation(dream_content, stop_words)

In [36]:
# Preview only the first two documents; displaying the full nested list would
# flood the notebook with every token of every dream.
tokenized_dreams[:2]

[['first',
  'thing',
  'remember',
  'nerdy',
  'guy',
  'goes',
  'school',
  'opinion',
  'biggest',
  'nerd',
  'school',
  'saw',
  'girl',
  'talking',
  'sudden',
  'kissed',
  'event',
  'dramatic',
  'started',
  'crying',
  'selflessly',
  'happened-',
  'completely',
  'beautiful',
  'girl',
  'kissed',
  'complete',
  'geek',
  'touched',
  'moment',
  'think',
  'fell',
  'love',
  'spot',
  'realized',
  'beautiful',
  'girl',
  "'d",
  'ever',
  'seen',
  'later',
  'caught',
  'cafe',
  'sort',
  'sat',
  'talked',
  "n't",
  'want',
  'tell',
  'much',
  'liked',
  'might',
  'thought',
  'trying',
  'take',
  'advantage',
  'nice',
  'girl',
  'would',
  "n't",
  'want'],
 ['next',
  'thing',
  'know',
  "'m",
  'big',
  'fairly',
  'dark',
  'room',
  'old',
  'female',
  'teacher',
  'mrs.',
  'jones',
  'right',
  'school',
  'trip',
  'vacation',
  'state',
  'looked',
  'clock',
  'soon',
  'looked',
  'clock',
  'hour',
  'hand',
  'went',
  'around',
  'clock',

In [37]:
# Build the token <-> integer-id mapping from the tokenized dreams.
dictionary = corpora.Dictionary(tokenized_dreams)
# Prune the vocabulary in place. Per gensim's documented semantics: drop tokens
# that occur in fewer than 5 documents or in more than 50% of documents, then
# keep at most the 10,000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=5, no_above=.5, keep_n=10000)

In [38]:
# Convert every tokenized dream into its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, tokenized_dreams))

In [39]:
# Number of topics to fit; reused when training the model and when printing topics.
num_topics=20

In [40]:
# Train the LDA topic model on the bag-of-words corpus.
# random_state pins the model's RNG so repeated runs produce the same topics;
# the random.seed(1) call near the imports only seeds Python's stdlib generator.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           passes=10,
                                           alpha='auto',
                                           random_state=1)

In [41]:
# Print the ten highest-weighted terms of each topic, one topic per line.
for i in range(num_topics):
    print("topic %s:\t%s" % (
        i,
        ' '.join(term for term, _weight in lda_model.show_topic(i, topn=10)),
    ))

topic 0:	friend friends group people girlfriend mine brother university mike sister
topic 1:	men women ice bob two one cream cabin war f
topic 2:	car driving road get street bus truck back drive side
topic 3:	store wearing blue clothes white dress hair shoes black red
topic 4:	one table box two tape food eat machine put eating
topic 5:	dog tree gun stephen snow shoot picnic ladder police neighbor
topic 6:	mary bathroom toilet cat shower knife teeth slide doll sink
topic 7:	water boat pool plane beach swimming lake fish wally fly
topic 8:	race keyboard pack win candy roller guard garbage boss 4th
topic 9:	'm go see get say back walk says 're want
topic 10:	room house door bed floor living open kitchen bedroom window
topic 11:	said went came got could saw looked would told back
topic 12:	mom dad guy like playing stuff movie game ezra watching
topic 13:	man father baby woman love feel happy frank young kiss
topic 14:	school class high book teacher college test paper questions read
topic 1

In [6]:
# Load one bz2-compressed JSON file from the EEBO sample directory with pandas.
# NOTE(review): this cell's execution count ([6]) predates the cells above it,
# so it was run out of order — confirm it still works under Restart & Run All.
test_book = pd.read_json('EEBO_sample_EF202003/coo.31924013131622.json.bz2', compression = 'bz2')

In [7]:
# Display the parsed file to inspect how pandas laid out the JSON structure.
test_book

Unnamed: 0,@context,schemaVersion,id,htid,type,publisher,datePublished,metadata,features
id,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,https://analytics.hathitrust.org,20200210,http://hdl.handle.net/2027/coo.31924013131622,http://hdl.handle.net/2027/coo.31924013131622
type,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,Organization,20200210,"[DataFeedItem, Book]",DataFeedItem
name,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,HathiTrust Research Center,20200210,,
schemaVersion,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,https://schemas.hathitrust.org/EF_Schema_Metad...,https://schemas.hathitrust.org/EF_Schema_Featu...
dateCreated,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,20200209,20200125
title,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,"The Spanish tragedy,",
contributor,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,"[{'id': 'http://www.viaf.org/viaf/29614087', '...",
pubDate,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,1898,
publisher,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,{'id': 'http://catalogdata.library.illinois.ed...,
pubPlace,https://worksets.htrc.illinois.edu/context/ef_...,https://schemas.hathitrust.org/EF_Schema_v_3.0,https://data.analytics.hathitrust.org/extracte...,coo.31924013131622,DataFeed,,20200210,{'id': 'http://id.loc.gov/vocabulary/countries...,
