<a href="https://colab.research.google.com/github/SidharthBhakth/Gensim-Tutorials/blob/master/2_Corpora_and_Vector_Spaces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Corpora and Vector Spaces

In [0]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import open  # for transparently opening remote files
from gensim import corpora
from pprint import pprint
from collections import defaultdict

#### **From Strings to Vectors**

In [0]:
# Toy corpus: nine short titles, one string per document
documents = [
  "Human machine interface for lab abc computer applications",
  "A survey of user opinion of computer system response time",
  "The EPS user interface management system",
  "System and human system engineering testing of EPS",
  "Relation of user perceived response time to error measurement",
  "The generation of random binary unordered trees",
  "The intersection graph of paths in trees",
  "Graph minors IV Widths of trees and well quasi ordering",
  "Graph minors A survey",
]

In [3]:
# Common function words that are dropped during tokenization
stopwords = set('for a of the and to in'.split())

# Lowercase every document, split it on whitespace and discard the stopwords
texts = []
for document in documents:
  texts.append([word for word in document.lower().split() if word not in stopwords])

# Count how often each remaining token occurs across all documents
frequency = defaultdict(int)
for text in texts:
  for token in text:
    frequency[token] += 1

# Keep only the tokens that occur more than once in the whole corpus
texts = [
  [token for token in text if frequency[token] > 1]
  for text in texts
]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [4]:
# Build the token -> integer ID mapping from the tokenized, filtered documents.
# Every distinct token in `texts` is assigned its own ID.
dictionary = corpora.Dictionary(texts)
# The printed summary reports the number of unique tokens and a short sample of them
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [5]:
# Show the full token -> integer ID mapping held by the dictionary
# (pprint displays the entries sorted by token, not by ID)
pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [6]:
# Convert an unseen document into a bag-of-words vector: a list of (token_id, count) pairs
new_doc = "Human computer interaction"
new_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_tokens)
# the word "interaction" does not appear in the dictionary and is ignored
print(new_vec)

[(0, 1), (1, 1)]


In [7]:
# Convert every tokenized document into its bag-of-words vector, keeping document order
corpus = []
for text in texts:
  corpus.append(dictionary.doc2bow(text))

pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


#### **Corpus Streaming – One Document at a Time**

In [0]:
class MyCorpus(object):
  """
  Streaming corpus: yields one bag-of-words vector per line of a text source.

  This flexibility allows you to create your own corpus classes that stream the
  documents directly from disk, network, database, dataframes...
  The models in Gensim are implemented such that they don't require all vectors to reside
  in RAM at once. You can even create the documents on the fly!

  Parameters:
    source: path or URL handed to `open`; expected to hold one document per line
      with tokens separated by whitespace. Defaults to the sample corpus file.
    dictionary: object providing `doc2bow(tokens)`. When left as None, the
      notebook-level `dictionary` is looked up each time the corpus is iterated.
  """
  # Sample corpus used when no explicit source is given
  DEFAULT_SOURCE = 'https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/docs/notebooks/datasets/mycorpus.txt'

  def __init__(self, source=DEFAULT_SOURCE, dictionary=None):
    self.source = source
    self.dictionary = dictionary

  def __iter__(self):
    """Yield the bag-of-words vector of each line, reading the source lazily."""
    # Resolve the mapping at iteration time so that MyCorpus() keeps following
    # whatever `dictionary` currently refers to in the notebook.
    id_mapping = dictionary if self.dictionary is None else self.dictionary
    # The context manager closes the file/connection once iteration finishes
    # (or the generator is discarded) instead of leaking the handle.
    with open(self.source) as lines:
      for line in lines:
        # Assume there is one document per line, tokens separated by whitespace
        yield id_mapping.doc2bow(line.lower().split())

In [9]:
# Instantiate the streaming corpus; nothing is read until it is iterated
corpus_memory_friendly = MyCorpus()

# Iterate the corpus: each step yields the bag-of-words vector of one line,
# so only one vector is held at a time
for vector in corpus_memory_friendly:
  print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [10]:
# Construct the dictionary without loading all texts into memory: lines are
# lowercased and whitespace-tokenized lazily as the dictionary consumes them.
# The `with` block closes the remote file once the dictionary has been built,
# rather than leaving the handle open.
with open('https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/docs/notebooks/datasets/mycorpus.txt') as corpus_file:
  dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# Token : integer ID mapping (nothing has been filtered yet, so stopwords are included)
pprint(dictionary.token2id)

{'a': 8,
 'abc': 0,
 'and': 19,
 'applications': 1,
 'binary': 27,
 'computer': 2,
 'engineering': 20,
 'eps': 16,
 'error': 22,
 'for': 3,
 'generation': 28,
 'graph': 32,
 'human': 4,
 'in': 33,
 'interface': 5,
 'intersection': 34,
 'iv': 36,
 'lab': 6,
 'machine': 7,
 'management': 17,
 'measurement': 23,
 'minors': 37,
 'of': 9,
 'opinion': 10,
 'ordering': 38,
 'paths': 35,
 'perceived': 24,
 'quasi': 39,
 'random': 29,
 'relation': 25,
 'response': 11,
 'survey': 12,
 'system': 13,
 'testing': 21,
 'the': 18,
 'time': 14,
 'to': 26,
 'trees': 30,
 'unordered': 31,
 'user': 15,
 'well': 40,
 'widths': 41}


In [11]:
# IDs of the stopwords that actually made it into the dictionary
stop_ids = []
for stopword in stopwords:
  if stopword in dictionary.token2id:
    stop_ids.append(dictionary.token2id[stopword])

# IDs of tokens whose document frequency is 1, i.e. that appear in a single document
once_ids = []
for tokenid, docfreq in dictionary.dfs.items():
  if docfreq == 1:
    once_ids.append(tokenid)

# Drop both groups from the dictionary in one call
dictionary.filter_tokens(stop_ids + once_ids)

# Reassign IDs so the sequence has no gaps left by the removed tokens
dictionary.compactify()

print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


#### **Corpus Formats**

In [12]:
# Toy corpus of 2 documents, the second of which is empty
corpus = [[(1, 0.5)], []]

# Each entry pairs a corpus format class with the file it is serialized to.
# NOTE(review): the targets sit at the filesystem root; presumably fine on Colab,
# but other environments may not allow writing there -- confirm before reuse.
serialization_targets = [
  (corpora.MmCorpus, '/corpus.mm'),             # Matrix Market format
  (corpora.SvmLightCorpus, '/corpus.svmlight'),  # Joachim's SVMlight format
  (corpora.BleiCorpus, '/corpus.lda-c'),        # Blei's LDA-C format
  (corpora.LowCorpus, '/corpus.low'),           # Gibbs LDA++ format
]

# Write the same corpus once per format, in the order listed above
for corpus_format, target_path in serialization_targets:
  corpus_format.serialize(target_path, corpus)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [13]:
# Open the Matrix Market file written earlier as a corpus object;
# `corpus` now refers to this file-backed corpus instead of the in-memory list
corpus = corpora.MmCorpus('/corpus.mm')

# Print one document at a time making use of the streaming interface
for doc in corpus:
  print(doc)

[(1, 0.5)]
[]


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
import gensim
import numpy as np
import scipy.sparse

#### **Compatibility with NumPy and SciPy**

In [0]:
# Dense 5x2 integer matrix with entries drawn from [0, 10).
# A dedicated seeded generator makes the values identical on every
# Restart & Run All without touching NumPy's global random state.
numpy_matrix = np.random.RandomState(0).randint(10, size=(5, 2))

# Wrap the dense array in gensim's corpus adapter for NumPy matrices
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)

In [0]:
# Random sparse 5x2 matrix. The density must be given explicitly: with the default
# density of 0.01 a 10-cell matrix rounds to zero non-zero entries, so the
# "random" matrix would always be empty. The fixed random_state keeps the
# values reproducible across runs.
scipy_sparse_matrix = scipy.sparse.random(5, 2, density=0.5, random_state=0)

# Wrap the sparse matrix as a gensim corpus, then convert that corpus back
# into a SciPy CSC matrix
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)