In [16]:
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
# Import Dictionary
from gensim.corpora.dictionary import Dictionary 


In [29]:
# https://en.wikipedia.org/wiki/Space
text='Space is the boundless three-dimensional extent in which objects and events have relative position and direction.[1] Physical space is often conceived in three linear dimensions, although modern physicists usually consider it, with time, to be part of a boundless four-dimensional continuum known as spacetime. The concept of space is considered to be of fundamental importance to an understanding of the physical universe. However, disagreement continues between philosophers over whether it is itself an entity, a relationship between entities, or part of a conceptual framework.'

In [30]:
text=text.split('.')
text

['Space is the boundless three-dimensional extent in which objects and events have relative position and direction',
 '[1] Physical space is often conceived in three linear dimensions, although modern physicists usually consider it, with time, to be part of a boundless four-dimensional continuum known as spacetime',
 ' The concept of space is considered to be of fundamental importance to an understanding of the physical universe',
 ' However, disagreement continues between philosophers over whether it is itself an entity, a relationship between entities, or part of a conceptual framework',
 '']

In [58]:
# Remove all stop words: no_stops
english_stops=stopwords.words('english')
articles = [word_tokenize(t.lower()) for t in text if t not in english_stops]
articles[0]

['space',
 'is',
 'the',
 'boundless',
 'three-dimensional',
 'extent',
 'in',
 'which',
 'objects',
 'and',
 'events',
 'have',
 'relative',
 'position',
 'and',
 'direction']

In [37]:
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary (articles)


<gensim.corpora.dictionary.Dictionary at 0x7fd7d709d3d0>

In [38]:
# create a dictionary of tokens and tokens' id and 
dictionary.token2id

{'and': 0,
 'boundless': 1,
 'direction': 2,
 'events': 3,
 'extent': 4,
 'have': 5,
 'in': 6,
 'is': 7,
 'objects': 8,
 'position': 9,
 'relative': 10,
 'space': 11,
 'the': 12,
 'three-dimensional': 13,
 'which': 14,
 ',': 15,
 '1': 16,
 '[': 17,
 ']': 18,
 'a': 19,
 'although': 20,
 'as': 21,
 'be': 22,
 'conceived': 23,
 'consider': 24,
 'continuum': 25,
 'dimensions': 26,
 'four-dimensional': 27,
 'it': 28,
 'known': 29,
 'linear': 30,
 'modern': 31,
 'of': 32,
 'often': 33,
 'part': 34,
 'physical': 35,
 'physicists': 36,
 'spacetime': 37,
 'three': 38,
 'time': 39,
 'to': 40,
 'usually': 41,
 'with': 42,
 'an': 43,
 'concept': 44,
 'considered': 45,
 'fundamental': 46,
 'importance': 47,
 'understanding': 48,
 'universe': 49,
 'between': 50,
 'conceptual': 51,
 'continues': 52,
 'disagreement': 53,
 'entities': 54,
 'entity': 55,
 'framework': 56,
 'however': 57,
 'itself': 58,
 'or': 59,
 'over': 60,
 'philosophers': 61,
 'relationship': 62,
 'whether': 63}

In [52]:

# Select the id for 'concept': 'concept'_id, 
# get():returns tokens from ids
concept_id = dictionary.token2id.get('concept')
concept_id

44

In [53]:
# Use computer_id with the dictionary to print the word
print(dictionary.get(concept_id))

concept


In [62]:
# Create a MmCorpus: corpus, 
# doc2bow(article): create a list of tokens' id and their frequency in each article. 
# tokens in each article sort alphabeticlaly and take id increasingly. If one token has taken id in the last articles, 
# it used the same id
corpus = [dictionary.doc2bow(article) for article in articles]

In [95]:
# Print the first 5 word ids with their frequency counts from the fifth document
print(corpus[3])


[(7, 1), (15, 3), (19, 2), (28, 1), (32, 1), (34, 1), (43, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1)]


In [98]:
bow_doc=sorted(corpus[3],key=lambda w:w[1],reverse=True)
bow_doc[:5]

[(15, 3), (19, 2), (50, 2), (7, 1), (28, 1)]

In [94]:

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)



, 3
a 2
between 2
is 1
it 1


In [119]:
    
# Create the defaultdict: word_id,total_word_count
import itertools
from collections import defaultdict
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

In [123]:
total_word_count

defaultdict(int,
            {0: 2,
             1: 2,
             2: 1,
             3: 1,
             4: 1,
             5: 1,
             6: 2,
             7: 4,
             8: 1,
             9: 1,
             10: 1,
             11: 3,
             12: 3,
             13: 1,
             14: 1,
             15: 6,
             16: 1,
             17: 1,
             18: 1,
             19: 3,
             20: 1,
             21: 1,
             22: 2,
             23: 1,
             24: 1,
             25: 1,
             26: 1,
             27: 1,
             28: 2,
             29: 1,
             30: 1,
             31: 1,
             32: 5,
             33: 1,
             34: 2,
             35: 2,
             36: 1,
             37: 1,
             38: 1,
             39: 1,
             40: 3,
             41: 1,
             42: 1,
             43: 2,
             44: 1,
             45: 1,
             46: 1,
             47: 1,
             48: 1,
             

In [132]:
    
# Create a sorted list from the defaultdict: sorted_word_count
# total_word_count.items(): return the key-value pair of a dictionary as tuples of a list
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 
sorted_word_count[:5]

[(15, 6), (32, 5), (7, 4), (11, 3), (12, 3)]

In [134]:
# Print the top 5 words across all documents alongside the count
for word_id, word_totalcount  in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_totalcount)

, 6
of 5
is 4
space 3
the 3
