# **Assignment 3 - Handling Unstructured Textual Data**

**By Siddhant - 102203023**

Importing Libraries

In [72]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Initialize stopwords and stemmer

In [73]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SIDDHANT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocess the Text

In [74]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|[^a-zA-Z\s]", "", text)
    words = text.split()
    words = [ps.stem(word) for word in words if word not in stop_words]
    return words  # Return as a list of words

**Q1: Based on Loading corpus**

Download the IMDB movie review corpus from the following link:

https://drive.google.com/file/d/1DnRiVquqJ-IrUnBfO4i7ilnCO6mKHD_e/view?usp=sharing

(a) Load the dataset into a Pandas DataFrame

(b) Load first 100 reviews from the review column of DataFrame into a list of strings called as ‘corpus’

In [75]:
file_path = 'IMDB_Dataset.csv'
df = pd.read_csv(file_path)
corpus = df['review'].head(100).tolist()

**Q2: Based on Pre-processing Corpus**

Pre-process each document (i.e. each string of corpus list) so that all words are in lower case, there is no special symbols, url, numbers, or stopwords. Also, each word of the document is stemmed to its root word using Porter Stemmer.

Preprocessing each document in the corpus

In [76]:
preprocessed_corpus = [preprocess_text(doc) for doc in corpus]

Create vocabulary from the preprocessed corpus (all unique words)

In [77]:
vocab = sorted(set([word for doc in preprocessed_corpus for word in doc]))
vocab_dict = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)
num_docs = len(preprocessed_corpus)

Initialize matrices for Binary, Term Frequency, Normalized Term Frequency, and TF-IDF

In [78]:
binary_matrix_scratch = np.zeros((num_docs, vocab_size), dtype=int)
term_frequency_matrix_scratch = np.zeros((num_docs, vocab_size), dtype=int)
normalized_tf_matrix_scratch = np.zeros((num_docs, vocab_size), dtype=float)
tfidf_matrix_scratch = np.zeros((num_docs, vocab_size), dtype=float)

Document Identifiers for DataFrame display

In [79]:
document_identifiers = [f"doc_{i+1}" for i in range(num_docs)]

Calculate Binary and Actual Term Frequency Matrices (from scratch)

In [80]:
for doc_idx, doc in enumerate(preprocessed_corpus):
    for word in doc:
        word_idx = vocab_dict[word]
        term_frequency_matrix_scratch[doc_idx][word_idx] += 1
        binary_matrix_scratch[doc_idx][word_idx] = 1

Calculate Normalized Term Frequency Matrix (L2 normalization)

In [81]:
for doc_idx in range(num_docs):
    doc_length = np.linalg.norm(term_frequency_matrix_scratch[doc_idx])
    if doc_length > 0:
        normalized_tf_matrix_scratch[doc_idx] = term_frequency_matrix_scratch[doc_idx] / doc_length

Calculating TF-IDF Matrix (from scratch)

In [82]:
df_vector = np.sum(term_frequency_matrix_scratch > 0, axis=0)

idf_vector = np.log((num_docs + 1) / (df_vector + 1)) + 1  # Add 1 to avoid division by zero

tfidf_matrix_scratch = term_frequency_matrix_scratch * idf_vector

Creating Dataframe

In [83]:
binary_tdm_scratch = pd.DataFrame(binary_matrix_scratch, index=document_identifiers, columns=vocab)
term_frequency_tdm_scratch = pd.DataFrame(term_frequency_matrix_scratch, index=document_identifiers, columns=vocab)
normalized_tf_tdm_scratch = pd.DataFrame(normalized_tf_matrix_scratch, index=document_identifiers, columns=vocab)
tfidf_tdm_scratch = pd.DataFrame(tfidf_matrix_scratch, index=document_identifiers, columns=vocab)

Binary Term Document Matrix (from scratch)

In [84]:
print("Binary Term Document Matrix (from scratch):")
binary_tdm_scratch

Binary Term Document Matrix (from scratch):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
doc_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_97,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
doc_98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_99,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Actual Term Frequency Matrix (from scratch)

In [85]:
print("\nActual Term Frequency Matrix (from scratch):")
term_frequency_tdm_scratch


Actual Term Frequency Matrix (from scratch):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
doc_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_97,0,1,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
doc_98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_99,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Normalized Term Frequency Matrix (from scratch)

In [86]:
print("\nNormalized Term Frequency Matrix (from scratch):")
normalized_tf_tdm_scratch


Normalized Term Frequency Matrix (from scratch):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.092848,0.092848,0.0,0.0,0.0,0.0
doc_5,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_97,0.0,0.082199,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.164399,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_98,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_99,0.0,0.000000,0.0,0.0,0.0,0.065938,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


TF-IDF Matrix (from scratch)

In [87]:
print("\nTF-IDF Matrix (from scratch):")
tfidf_tdm_scratch


TF-IDF Matrix (from scratch):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,4.921973,4.921973,0.0,0.0,0.0,0.0
doc_5,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_97,0.0,4.921973,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.843947,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_98,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_99,0.0,0.000000,0.0,0.0,0.0,4.516508,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


Binary Term Document Matrix (inbuilt method)

In [88]:
vectorizer_binary = CountVectorizer(binary=True)
binary_matrix_inbuilt = vectorizer_binary.fit_transform([" ".join(doc) for doc in preprocessed_corpus]).toarray()

binary_tdm_inbuilt = pd.DataFrame(binary_matrix_inbuilt, index=document_identifiers, columns=vectorizer_binary.get_feature_names_out())
print("\nBinary Term Document Matrix (inbuilt method):")
binary_tdm_inbuilt


Binary Term Document Matrix (inbuilt method):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
doc_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_97,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
doc_98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_99,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Actual Term Frequency Matrix (inbuilt method)

In [89]:
vectorizer_freq = CountVectorizer()
term_frequency_matrix_inbuilt = vectorizer_freq.fit_transform([" ".join(doc) for doc in preprocessed_corpus]).toarray()

term_frequency_tdm_inbuilt = pd.DataFrame(term_frequency_matrix_inbuilt, index=document_identifiers, columns=vectorizer_freq.get_feature_names_out())
print("\nActual Term Frequency Matrix (inbuilt method):")
term_frequency_tdm_inbuilt


Actual Term Frequency Matrix (inbuilt method):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
doc_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_97,0,1,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
doc_98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
doc_99,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Normalized Term Frequency Matrix (inbuilt method)

In [90]:
from sklearn.preprocessing import normalize
normalized_tf_matrix_inbuilt = normalize(term_frequency_matrix_inbuilt, norm='l2')

normalized_tf_tdm_inbuilt = pd.DataFrame(normalized_tf_matrix_inbuilt, index=document_identifiers, columns=vectorizer_freq.get_feature_names_out())
print("\nNormalized Term Frequency Matrix (inbuilt method):")
normalized_tf_tdm_inbuilt


Normalized Term Frequency Matrix (inbuilt method):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.092848,0.092848,0.0,0.0,0.0,0.0
doc_5,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_97,0.0,0.082199,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.164399,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_98,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_99,0.0,0.000000,0.0,0.0,0.0,0.066082,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


TF-IDF Matrix (inbuilt method)

In [91]:
vectorizer_tfidf = TfidfVectorizer()
tfidf_matrix_inbuilt = vectorizer_tfidf.fit_transform([" ".join(doc) for doc in preprocessed_corpus]).toarray()

tfidf_tdm_inbuilt = pd.DataFrame(tfidf_matrix_inbuilt, index=document_identifiers, columns=vectorizer_tfidf.get_feature_names_out())
print("\nTF-IDF Matrix (inbuilt method):")
tfidf_tdm_inbuilt


TF-IDF Matrix (inbuilt method):


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
doc_1,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_2,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_3,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_4,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.122963,0.122963,0.0,0.0,0.0,0.0
doc_5,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_96,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_97,0.0,0.12214,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.244279,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_98,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
doc_99,0.0,0.00000,0.0,0.0,0.0,0.092912,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


**Q4: Based on Co-occurrence Matrix**

(a) Construct a term-term co-occurrence matrix from the pre-processed list of documents (obtained from Q2) whose each (ij)th entry is number of documents in which both ith and jth terms co-occur.

(b) Also, construct a Positive Pointwise Mutual Information (PPMI) whose (ij)th entry is computed

In [92]:
co_occurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=int)

for doc in preprocessed_corpus:
    unique_words = list(set(doc))
    for i in range(len(unique_words)):
        for j in range(i + 1, len(unique_words)):
            word_i = unique_words[i]
            word_j = unique_words[j]
            idx_i = vocab_dict[word_i]
            idx_j = vocab_dict[word_j]
            co_occurrence_matrix[idx_i][idx_j] += 1
            co_occurrence_matrix[idx_j][idx_i] += 1

co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=vocab, columns=vocab)
print("\nCo-occurrence Matrix:")
co_occurrence_df


Co-occurrence Matrix:


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
abbot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abbrevi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
abet,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abid,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
abil,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombiebr,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
zone,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
total_docs = num_docs
term_frequencies = np.sum(binary_matrix_scratch, axis=0)

def ppmi(i, j):
    co_occurrence = co_occurrence_matrix[i, j]
    if co_occurrence == 0:
        return 0
    p_i_j = co_occurrence / total_docs
    p_i = term_frequencies[i] / total_docs
    p_j = term_frequencies[j] / total_docs
    ppmi_value = np.log2(p_i_j / (p_i * p_j))
    return max(ppmi_value, 0)

ppmi_matrix = np.zeros_like(co_occurrence_matrix, dtype=float)

for i in range(vocab_size):
    for j in range(vocab_size):
        ppmi_matrix[i, j] = ppmi(i, j)

ppmi_df = pd.DataFrame(ppmi_matrix, index=vocab, columns=vocab)
print("\nPPMI Matrix:")
ppmi_df


PPMI Matrix:


Unnamed: 0,abbot,abbrevi,abet,abid,abil,abl,aboveaverag,abraham,absolut,absorb,...,zani,zellweg,zerog,zeu,zombi,zombiebr,zone,zoo,zoom,zwick
abbot,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
abbrevi,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,6.643856,0.000000,0.0,0.000000,0.0,0.0,0.0
abet,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
abid,0.0,0.0,0.0,0.000000,5.643856,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,5.643856,0.0,0.0,0.0
abil,0.0,0.0,0.0,5.643856,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,6.643856,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombiebr,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,6.643856,0.0,0.000000,0.0,0.0,0.0
zone,0.0,0.0,0.0,5.643856,6.643856,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
zoo,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
zoom,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
