In [4]:
import nltk
from nltk.corpus import reuters

nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')

doc_ids = reuters.fileids()[:10]
docs = [" ".join(reuters.words(fid)) for fid in doc_ids]

print("Number of documents:", len(docs))
print("Sample document:\n", docs[0][:500])


Number of documents: 10
Sample document:
 ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPAN RIFT Mounting trade friction between the U . S . And Japan has raised fears among many of Asia ' s exporting nations that the row could inflict far - reaching economic damage , businessmen and officials said . They told Reuter correspondents in Asian capitals a U . S . Move against Japan might boost protectionist sentiment in the U . S . And lead to curbs on American imports of their products . But some exporters said that while the conflict would 


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Q2a: Word-Document Matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Clean the documents
docs_cleaned = []
for d in docs:
    tokens = nltk.word_tokenize(d.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    docs_cleaned.append(" ".join(tokens))

# Build TF-IDF Word-Document Matrix
vectorizer = TfidfVectorizer()
WDM = vectorizer.fit_transform(docs_cleaned)

wdm_df = pd.DataFrame(WDM.toarray(),
                      index=[f"Doc{i}" for i in range(len(docs_cleaned))],
                      columns=vectorizer.get_feature_names_out())
print("Word-Document Matrix (first 10 words):")
print(wdm_df.iloc[:, :10])


Word-Document Matrix (first 10 words):
          able  absorbing  according  accounting      act    action     added  \
Doc0  0.034599   0.000000   0.000000    0.034599  0.00000  0.029412  0.000000   
Doc1  0.000000   0.000000   0.000000    0.000000  0.00000  0.000000  0.000000   
Doc2  0.000000   0.000000   0.000000    0.000000  0.00000  0.000000  0.000000   
Doc3  0.000000   0.000000   0.000000    0.000000  0.00000  0.000000  0.000000   
Doc4  0.000000   0.000000   0.087781    0.000000  0.00000  0.000000  0.000000   
Doc5  0.000000   0.000000   0.000000    0.000000  0.00000  0.125977  0.000000   
Doc6  0.000000   0.000000   0.000000    0.000000  0.00000  0.000000  0.039887   
Doc7  0.000000   0.000000   0.000000    0.000000  0.00000  0.000000  0.000000   
Doc8  0.000000   0.000000   0.000000    0.000000  0.00000  0.000000  0.000000   
Doc9  0.000000   0.031805   0.000000    0.000000  0.06361  0.000000  0.027037   

      additives  adelaide  advantage  
Doc0   0.000000  0.000000   0.

In [18]:
# Q2b: Word-Context Matrix
import numpy as np

window_size = 2
tokens = " ".join(docs_cleaned).split()
vocab = list(set(tokens))
vocab_index = {word: i for i, word in enumerate(vocab)}

cooc_matrix = np.zeros((len(vocab), len(vocab)))

for i in range(len(tokens)):
    target = tokens[i]
    start = max(0, i - window_size)
    end = min(len(tokens), i + window_size + 1)
    context = tokens[start:i] + tokens[i+1:end]
    for ctx in context:
        cooc_matrix[vocab_index[target], vocab_index[ctx]] += 1

wcm_df = pd.DataFrame(cooc_matrix, index=vocab, columns=vocab)
print("Word-Context Matrix (10x10 slice):")
print(wcm_df.iloc[:10, :10])


Word-Context Matrix (10x10 slice):
           trying  mexico  movement  steagall  europe  visit  confident  \
trying        0.0     0.0       0.0       0.0     0.0    0.0        0.0   
mexico        0.0     0.0       0.0       0.0     1.0    1.0        0.0   
movement      0.0     0.0       0.0       0.0     0.0    0.0        0.0   
steagall      0.0     0.0       0.0       0.0     0.0    0.0        0.0   
europe        0.0     1.0       0.0       0.0     0.0    1.0        0.0   
visit         0.0     1.0       0.0       0.0     1.0    0.0        0.0   
confident     0.0     0.0       0.0       0.0     0.0    0.0        0.0   
possible      0.0     0.0       0.0       0.0     0.0    0.0        0.0   
stepped       0.0     0.0       0.0       0.0     0.0    0.0        0.0   
pit           0.0     0.0       0.0       0.0     0.0    0.0        0.0   

           possible  stepped  pit  
trying          0.0      0.0  0.0  
mexico          0.0      0.0  0.0  
movement        0.0      0.0  0

In [19]:
# Q2c: Pair-Pattern Matrix (using trigrams)
import nltk
from collections import Counter

bigrams = list(nltk.bigrams(tokens))
bigram_counts = Counter(bigrams)

# Collect trigram patterns
patterns = []
for i in range(len(tokens) - 2):
    pair = (tokens[i], tokens[i+1])
    pattern = tokens[i+2]
    patterns.append((pair, pattern))

pair_vocab = list(set([p for p, _ in patterns]))
pattern_vocab = list(set([pat for _, pat in patterns]))

pair_index = {p: i for i, p in enumerate(pair_vocab)}
pattern_index = {p: i for i, p in enumerate(pattern_vocab)}

ppm = np.zeros((len(pair_vocab), len(pattern_vocab)))
for pair, pat in patterns:
    ppm[pair_index[pair], pattern_index[pat]] += 1

ppm_df = pd.DataFrame(ppm, index=[str(p) for p in pair_vocab], columns=pattern_vocab)
print("Pair-Pattern Matrix (10x10 slice):")
print(ppm_df.iloc[:10, :10])


Pair-Pattern Matrix (10x10 slice):
                          trying  mexico  movement  steagall  europe  visit  \
('confident', 'bank')        0.0     0.0       0.0       0.0     0.0    0.0   
('corp', 'holdings')         0.0     0.0       0.0       0.0     0.0    0.0   
('japan', 'electric')        0.0     0.0       0.0       0.0     0.0    0.0   
('half', 'financial')        0.0     0.0       0.0       0.0     0.0    0.0   
('harahap', 'said')          0.0     0.0       0.0       0.0     0.0    0.0   
('contacts', 'end')          0.0     0.0       0.0       0.0     0.0    0.0   
('spokesman', 'leading')     0.0     0.0       0.0       0.0     0.0    0.0   
('industry', 'miti')         0.0     0.0       0.0       0.0     0.0    0.0   
('vermin', 'consume')        0.0     0.0       0.0       0.0     0.0    0.0   
('oil', 'malaysia')          0.0     0.0       0.0       0.0     0.0    0.0   

                          confident  possible  stepped  pit  
('confident', 'bank')           0

In [20]:
# Q3a: Word Similarity (using cosine similarity on WCM)
from sklearn.metrics.pairwise import cosine_similarity

word_sim = cosine_similarity(wcm_df)
word_sim_df = pd.DataFrame(word_sim, index=vocab, columns=vocab)
print("Word Similarity Matrix (10x10 slice):")
print(word_sim_df.iloc[:10, :10])


Word Similarity Matrix (10x10 slice):
           trying  mexico  movement  steagall  europe  visit  confident  \
trying       1.00    0.00       0.0  0.000000    0.00   0.00        0.0   
mexico       0.00    1.00       0.0  0.000000    0.50   0.25        0.0   
movement     0.00    0.00       1.0  0.000000    0.00   0.00        0.0   
steagall     0.00    0.00       0.0  1.000000    0.00   0.00        0.0   
europe       0.00    0.50       0.0  0.000000    1.00   0.50        0.0   
visit        0.00    0.25       0.0  0.000000    0.50   1.00        0.0   
confident    0.00    0.00       0.0  0.000000    0.00   0.00        1.0   
possible     0.00    0.00       0.0  0.176777    0.00   0.00        0.0   
stepped      0.00    0.00       0.0  0.000000    0.25   0.25        0.0   
pit          0.25    0.00       0.0  0.000000    0.00   0.00        0.0   

           possible  stepped   pit  
trying     0.000000     0.00  0.25  
mexico     0.000000     0.00  0.00  
movement   0.000000     0

In [21]:
# Q3b: Document Similarity (using cosine similarity on WDM)
doc_sim = cosine_similarity(WDM)
doc_sim_df = pd.DataFrame(doc_sim,
                          index=[f"Doc{i}" for i in range(len(docs_cleaned))],
                          columns=[f"Doc{i}" for i in range(len(docs_cleaned))])
print("Document Similarity Matrix:")
print(doc_sim_df)


Document Similarity Matrix:
          Doc0      Doc1      Doc2      Doc3      Doc4      Doc5      Doc6  \
Doc0  1.000000  0.046860  0.152604  0.176819  0.094952  0.083057  0.155819   
Doc1  0.046860  1.000000  0.055018  0.134465  0.027890  0.019660  0.040943   
Doc2  0.152604  0.055018  1.000000  0.098421  0.065358  0.035638  0.056133   
Doc3  0.176819  0.134465  0.098421  1.000000  0.074997  0.018972  0.072322   
Doc4  0.094952  0.027890  0.065358  0.074997  1.000000  0.019601  0.174631   
Doc5  0.083057  0.019660  0.035638  0.018972  0.019601  1.000000  0.053095   
Doc6  0.155819  0.040943  0.056133  0.072322  0.174631  0.053095  1.000000   
Doc7  0.055734  0.061395  0.026799  0.057203  0.075203  0.023293  0.056301   
Doc8  0.073403  0.066644  0.037367  0.080330  0.022781  0.040575  0.038981   
Doc9  0.115423  0.041729  0.057818  0.079580  0.063633  0.049493  0.092785   

          Doc7      Doc8      Doc9  
Doc0  0.055734  0.073403  0.115423  
Doc1  0.061395  0.066644  0.041729  
Do

In [27]:
# Q4: WDM × WCM (fixed with aligned vocabulary)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Sample corpus
docs = [
    "Machine learning is fun and powerful",
    "Deep learning is a subset of machine learning",
    "Natural language processing uses machine learning"
]

# Preprocess
docs_cleaned = [d.lower() for d in docs]

# 1) Word-Document Matrix (TF-IDF)
vectorizer = TfidfVectorizer()
WDM = vectorizer.fit_transform(docs_cleaned)
words = vectorizer.get_feature_names_out()

wdm_df = pd.DataFrame(WDM.toarray(),
                      index=[f"Doc{i}" for i in range(len(docs_cleaned))],
                      columns=words)

print("===== Original Word-Document Matrix (WDM) =====")
print(wdm_df, "\n")

# 2) Word-Context Matrix (co-occurrence)
tokens = " ".join(docs_cleaned).split()
vocab = list(set(tokens))
vocab_index = {word: i for i, word in enumerate(vocab)}
cooc_matrix = np.zeros((len(vocab), len(vocab)))
window_size = 2

for i in range(len(tokens)):
    target = tokens[i]
    start = max(0, i - window_size)
    end = min(len(tokens), i + window_size + 1)
    context = tokens[start:i] + tokens[i+1:end]
    for ctx in context:
        cooc_matrix[vocab_index[target], vocab_index[ctx]] += 1

wcm_df = pd.DataFrame(cooc_matrix, index=vocab, columns=vocab)

print("===== Word-Context Matrix (WCM) [before alignment] =====")
print("Shape:", wcm_df.shape, "\n")

# 3) Align vocabularies: keep only WDM words in WCM
wcm_aligned = wcm_df.loc[words, words]

print("===== Word-Context Matrix (WCM) [aligned to WDM vocab] =====")
print("Shape:", wcm_aligned.shape, "\n")

# 4) Multiply WDM × WCM_aligned
q4_result = WDM @ wcm_aligned.values
q4_df = pd.DataFrame(q4_result,
                     index=[f"Doc{i}" for i in range(len(docs_cleaned))],
                     columns=words)

print("===== Q4: Resultant Word-Document Matrix (WDM × WCM) =====")
print(q4_df, "\n")

# 5) Comparison: Original vs New (Doc0, all words)
print("===== Comparison (Doc0): Original vs Q4 Result =====")
comp_q4 = pd.DataFrame({
    "Original WDM": wdm_df.iloc[0].values,
    "Q4 Result": q4_df.iloc[0].values
}, index=words)
print(comp_q4)


===== Original Word-Document Matrix (WDM) =====
           and      deep       fun        is  language  learning   machine  \
Doc0  0.483591  0.000000  0.483591  0.367784  0.000000  0.285617  0.285617   
Doc1  0.000000  0.433452  0.000000  0.329651  0.000000  0.512007  0.256004   
Doc2  0.000000  0.000000  0.000000  0.000000  0.461381  0.272499  0.272499   

       natural        of  powerful  processing    subset      uses  
Doc0  0.000000  0.000000  0.483591    0.000000  0.000000  0.000000  
Doc1  0.000000  0.433452  0.000000    0.000000  0.433452  0.000000  
Doc2  0.461381  0.000000  0.000000    0.461381  0.000000  0.461381   

===== Word-Context Matrix (WCM) [before alignment] =====
Shape: (14, 14) 

===== Word-Context Matrix (WCM) [aligned to WDM vocab] =====
Shape: (13, 13) 

===== Q4: Resultant Word-Document Matrix (WDM × WCM) =====
           and      deep       fun        is  language  learning   machine  \
Doc0  1.334966  1.620583  1.620583  1.824033  0.285617  2.559600  1.22

In [28]:
# Q5: WDM^T × WDM = Word-Word Matrix
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus
docs = [
    "Machine learning is fun and powerful",
    "Deep learning is a subset of machine learning",
    "Natural language processing uses machine learning"
]

# Preprocess
docs_cleaned = [d.lower() for d in docs]

# 1) Word-Document Matrix
vectorizer = TfidfVectorizer()
WDM = vectorizer.fit_transform(docs_cleaned)
words = vectorizer.get_feature_names_out()
wdm_df = pd.DataFrame(WDM.toarray(),
                      index=[f"Doc{i}" for i in range(len(docs_cleaned))],
                      columns=words)

print("===== Original Word-Document Matrix (WDM) =====")
print(wdm_df, "\n")

# 2) Multiply WDM^T × WDM → Word-Word Matrix
q5_result = WDM.T @ WDM
q5_df = pd.DataFrame(q5_result.toarray(), index=words, columns=words)

print("===== Q5: Word-Word Matrix (from WDM^T × WDM) =====")
print(q5_df.iloc[:10, :10], "\n")  # first 10x10 slice

# 3) Compare with Original WCM (local co-occurrence)
# For fair comparison, we can reuse WCM (from Q4)
# NOTE: This requires running Q4 first, because wcm_df is built there.

try:
    print("===== Comparison: WCM vs Q5 Word-Word Matrix (10x10) =====")
    print("WCM (local co-occurrence):")
    print(wcm_df.iloc[:10, :10], "\n")
    print("Q5 Word-Word Matrix (document-based):")
    print(q5_df.iloc[:10, :10])
except:
    print("Run Q4 first to generate WCM for comparison.")


===== Original Word-Document Matrix (WDM) =====
           and      deep       fun        is  language  learning   machine  \
Doc0  0.483591  0.000000  0.483591  0.367784  0.000000  0.285617  0.285617   
Doc1  0.000000  0.433452  0.000000  0.329651  0.000000  0.512007  0.256004   
Doc2  0.000000  0.000000  0.000000  0.000000  0.461381  0.272499  0.272499   

       natural        of  powerful  processing    subset      uses  
Doc0  0.000000  0.000000  0.483591    0.000000  0.000000  0.000000  
Doc1  0.000000  0.433452  0.000000    0.000000  0.433452  0.000000  
Doc2  0.461381  0.000000  0.000000    0.461381  0.000000  0.461381   

===== Q5: Word-Word Matrix (from WDM^T × WDM) =====
               and      deep       fun        is  language  learning  \
and       0.233860  0.000000  0.233860  0.177857  0.000000  0.138122   
deep      0.000000  0.187880  0.000000  0.142888  0.000000  0.221930   
fun       0.233860  0.000000  0.233860  0.177857  0.000000  0.138122   
is        0.177857  0