In [1]:
! pip install numpy

Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/58/b0/034eb5d5ba12d66ab658ff3455a31f20add0b78df8203c6a7451bd1bee21/numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl.metadata
  Downloading numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.2.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import numpy as np
from collections import Counter
from itertools import product

In [10]:
# Step 1: Input Documents
documents = [
    "I like deep learning",
    "I like NLP and deep learning"
]


In [11]:
# Step 2: Tokenize the documents and create the vocabulary
print("all the words that is contained by the document")
tokenized_docs = [doc.lower().split() for doc in documents]
print(tokenized_docs)
vocabulary = sorted(set(word for doc in tokenized_docs for word in doc))
print("Unique words in the docs")
print(vocabulary)
vocab_size = len(vocabulary)

all the words that is contained by the document
[['i', 'like', 'deep', 'learning'], ['i', 'like', 'nlp', 'and', 'deep', 'learning']]
Unique words in the docs
['and', 'deep', 'i', 'learning', 'like', 'nlp']


In [12]:
# Map each word to an index
word_to_index = {word: i for i, word in enumerate(vocabulary)}
print("Word to index mapping")
print(word_to_index)


Word to index mapping
{'and': 0, 'deep': 1, 'i': 2, 'learning': 3, 'like': 4, 'nlp': 5}


In [13]:
# Step 3: Create an empty co-occurrence matrix
co_occurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
print("Empty Co-occurrence matrix")
print(co_occurrence_matrix)

Empty Co-occurrence matrix
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [18]:
# Step 4: Count co-occurrences
for doc in tokenized_docs:
    # Count words in the document
    word_counts = Counter(doc)
    
    # Update the matrix for each pair of words
    for (word1, word2) in product(doc, repeat=2):
        print(f"Word 1: {word1}, Word 2: {word2}")
        
        i, j = word_to_index[word1], word_to_index[word2]
        co_occurrence_matrix[i, j] += word_counts[word2]
print("Co-occurrence matrix")
print(co_occurrence_matrix)

Word 1: i, Word 2: i
Word 1: i, Word 2: like
Word 1: i, Word 2: deep
Word 1: i, Word 2: learning
Word 1: like, Word 2: i
Word 1: like, Word 2: like
Word 1: like, Word 2: deep
Word 1: like, Word 2: learning
Word 1: deep, Word 2: i
Word 1: deep, Word 2: like
Word 1: deep, Word 2: deep
Word 1: deep, Word 2: learning
Word 1: learning, Word 2: i
Word 1: learning, Word 2: like
Word 1: learning, Word 2: deep
Word 1: learning, Word 2: learning
Word 1: i, Word 2: i
Word 1: i, Word 2: like
Word 1: i, Word 2: nlp
Word 1: i, Word 2: and
Word 1: i, Word 2: deep
Word 1: i, Word 2: learning
Word 1: like, Word 2: i
Word 1: like, Word 2: like
Word 1: like, Word 2: nlp
Word 1: like, Word 2: and
Word 1: like, Word 2: deep
Word 1: like, Word 2: learning
Word 1: nlp, Word 2: i
Word 1: nlp, Word 2: like
Word 1: nlp, Word 2: nlp
Word 1: nlp, Word 2: and
Word 1: nlp, Word 2: deep
Word 1: nlp, Word 2: learning
Word 1: and, Word 2: i
Word 1: and, Word 2: like
Word 1: and, Word 2: nlp
Word 1: and, Word 2: and
Wo

In [19]:
# Step 5: Normalize rows by row sum
row_sums = co_occurrence_matrix.sum(axis=1, keepdims=True)
normalized_matrix = co_occurrence_matrix / row_sums

# Display results
print("Vocabulary:", vocabulary)
print("\nCo-occurrence Matrix (Raw Counts):")
print(co_occurrence_matrix)
print("\nNormalized Co-occurrence Matrix:")
print(normalized_matrix)

Vocabulary: ['and', 'deep', 'i', 'learning', 'like', 'nlp']

Co-occurrence Matrix (Raw Counts):
[[4. 4. 4. 4. 4. 4.]
 [4. 8. 8. 8. 8. 4.]
 [4. 8. 8. 8. 8. 4.]
 [4. 8. 8. 8. 8. 4.]
 [4. 8. 8. 8. 8. 4.]
 [4. 4. 4. 4. 4. 4.]]

Normalized Co-occurrence Matrix:
[[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.1        0.2        0.2        0.2        0.2        0.1       ]
 [0.1        0.2        0.2        0.2        0.2        0.1       ]
 [0.1        0.2        0.2        0.2        0.2        0.1       ]
 [0.1        0.2        0.2        0.2        0.2        0.1       ]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]]
