Synset based topic modeling

In [1]:
#install libraries
pip install nltk scikit-learn gensim
pip install PyPDF2


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━

Syntopextract-Context based topic extraction for agricultural documents

In [None]:
#import libraries
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.wsd import lesk

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

import PyPDF2

In [None]:
# ==== NLTK Download (run once) ====
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')  # For enhanced WordNet
nltk.download('punkt_tab') # Download the missing resource
nltk.download('averaged_perceptron_tagger_eng') # Download the missing resource

In [14]:

#synset based topic modeling
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    return text

def preprocess_document(text):
    sentences = sent_tokenize(text)
    sentence_data = []
    for sent in sentences:
        tokens = word_tokenize(sent)
        filtered_tokens = [t for t in tokens if t.isalpha() and t.lower() not in stop_words]
        tagged_tokens = nltk.pos_tag(filtered_tokens)
        lemmas = [lemmatizer.lemmatize(token.lower(), pos=get_wordnet_pos(pos_tag))
                  for token, pos_tag in tagged_tokens]
        sentence_data.append({
            'original_sentence': sent,
            'lemmas': lemmas,
            'tokens': filtered_tokens
        })
    return sentence_data

def assign_synsets_to_lemmas(doc_sentence_data):
    synset_map = {}
    for entry in doc_sentence_data:
        sent_tokens = word_tokenize(entry['original_sentence'])
        for lemma in entry['lemmas']:
            if lemma not in synset_map:
                syn = lesk(sent_tokens, lemma)
                if syn is None:
                    synsets = wordnet.synsets(lemma)
                    # Assign None if no synset is found
                    syn = synsets[0] if synsets else None
                synset_map[lemma] = syn
    return synset_map

def group_lemmas_by_synset(synset_map):
    synset_groups = defaultdict(set)
    for lemma, syn in synset_map.items():
        if syn: # Check if syn is not None
            synset_groups[syn.name()].add(lemma)
        else:
            synset_groups[None].add(lemma)
    return synset_groups

def convert_docs_lemmas_to_synsets(documents_lemmas, synset_map):
    synset_documents = []
    for doc_lemmas in documents_lemmas:
        synset_doc = []
        for lemma in doc_lemmas:
            syn = synset_map.get(lemma)
            if syn: # Check if syn is not None
                synset_doc.append(syn.name())
        synset_documents.append(synset_doc)
    return synset_documents

def get_all_lemmas_from_doc_sentence_data(all_doc_sentence_data):
    all_docs_lemmas = []
    for doc_sentences in all_doc_sentence_data:
        doc_lemmas = []
        for entry in doc_sentences:
            doc_lemmas.extend(entry['lemmas'])
        all_docs_lemmas.append(doc_lemmas)
    return all_docs_lemmas

# ===== Main processing =====

# List your two PDF documents here
pdf_corpus_files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf', 'doc5.pdf', 'doc6.pdf', 'doc7.pdf', 'doc8.pdf', 'doc9.pdf',
 'doc10.pdf', 'doc11.pdf', 'doc12.pdf', 'doc13.pdf', 'doc14.pdf', 'doc15.pdf', 'doc16.pdf', 'doc17.pdf',
 'doc18.pdf', 'doc19.pdf', 'doc20.pdf', 'doc21.pdf', 'doc22.pdf', 'doc23.pdf', 'doc24.pdf']


print(f"Reading and processing {len(pdf_corpus_files)} PDF documents...")

all_doc_sentence_data = []
for pdf_file in pdf_corpus_files:
    text = read_pdf(pdf_file)
    sentence_data = preprocess_document(text)
    all_doc_sentence_data.append(sentence_data)

# Collect all lemmas from all documents
all_lemmas = set()
for doc_sentences in all_doc_sentence_data:
    for sent in doc_sentences:
        all_lemmas.update(sent['lemmas'])
print(f"Unique lemmas across the corpus: {len(all_lemmas)}")

# Assign synsets for all lemmas (aggregated from corpus)
# For better accuracy, ideally WSD should be per sentence + lemma,
# but here we assign one synset per lemma using the first context encountered
corpus_synset_map = {}
for doc_sentences in all_doc_sentence_data:
    synset_map = assign_synsets_to_lemmas(doc_sentences)
    corpus_synset_map.update(synset_map)
print(f"Lemmas with assigned synsets: {len(corpus_synset_map)}")


# Group lemmas by synset
lemma_groups_by_synset = group_lemmas_by_synset(corpus_synset_map)

print("\nSample synset groups (showing up to 5):")
for idx, (synset_name, lemmas) in enumerate(lemma_groups_by_synset.items()):
    if synset_name is not None:
        print(f"{synset_name}: {sorted(list(lemmas))}")
    if idx >= 4:
        break

# Print total number of unique synsets (excluding None)
num_real_synsets = sum(1 for syn in lemma_groups_by_synset if syn is not None)
print(f"\nNumber of unique synsets after grouping: {num_real_synsets}")


# Convert entire documents to synset-based representation
all_docs_lemmas = get_all_lemmas_from_doc_sentence_data(all_doc_sentence_data)
synset_docs = convert_docs_lemmas_to_synsets(all_docs_lemmas, corpus_synset_map)

# TF-IDF matrix creation
synset_docs_str = [' '.join(doc) for doc in synset_docs]

vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), preprocessor=lambda x: x, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(synset_docs_str)
feature_names = vectorizer.get_feature_names_out()

print(f"\nTF-IDF matrix (documents x synsets) shape = {tfidf_matrix.shape}")

# LDA topic modeling
num_topics = 4
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(tfidf_matrix)

topic_strengths = lda.components_.mean(axis=1)
top_topic_indices = topic_strengths.argsort()[::-1][:num_topics]

print(f"\nTop {num_topics} topic indices by average beta:")
print(top_topic_indices)

# Prepare data for coherence calculation
corpus_tokens = [doc.split() for doc in synset_docs_str]
dictionary = Dictionary(corpus_tokens)
gensim_corpus = [dictionary.doc2bow(text) for text in corpus_tokens]

print("\nTopic coherence scores:")

for topic_idx in top_topic_indices:
    top_indices = lda.components_[topic_idx].argsort()[-100:][::-1]
    top_synsets = [feature_names[i] for i in top_indices if i < len(feature_names)]

    cm = CoherenceModel(
        topics=[top_synsets],
        texts=corpus_tokens,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_score = cm.get_coherence()
    print(f"Topic {topic_idx + 1} coherence score: {coherence_score:.4f}")

# Display top 10 synsets per topic
print("\nTop 10 synsets per topic:")
for topic_idx in range(num_topics):
    print(f"Topic {topic_idx + 1}:")
    top_indices = lda.components_[topic_idx].argsort()[-10:][::-1]
    top_synsets = [feature_names[i] for i in top_indices if i < len(feature_names)]
    print(", ".join(top_synsets))
    print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Reading and processing 24 PDF documents...
Unique lemmas across the corpus: 13177
Lemmas with assigned synsets: 13177

Sample synset groups (showing up to 5):
citizen.n.01: ['citizen']
client.n.01: ['client']
charter.n.01: ['charter']
department.n.03: ['department']
agricultural.a.01: ['agricultural']

Number of unique synsets after grouping: 5888

TF-IDF matrix (documents x synsets) shape = (24, 5888)





Top 4 topic indices by average beta:
[0 2 3 1]

Topic coherence scores:
Topic 1 coherence score: 0.3392
Topic 3 coherence score: 0.4788
Topic 4 coherence score: 0.8736
Topic 2 coherence score: 0.6423

Top 10 synsets per topic:
Topic 1:
research.v.02, agricultural.a.01, system.n.02, institute.v.02, education.n.05, university.n.03, total.v.03, grow.v.08, india.n.01, besides.r.02

Topic 2:
gravitational_constant.n.01, radius.n.01, barn.n.02, two.n.01, degree_centigrade.n.01, vitamin_e.n.01, farad.n.01, west.n.02, idaho.n.01, planck's_constant.n.01

Topic 3:
servicing.n.01, agricultural.a.01, research.v.02, director.n.03, sow.v.01, indentation.n.03, client.n.01, substantial.a.03, part.n.09, reception.n.04

Topic 4:
liter.n.01, cystic_fibrosis.n.01, farad.n.01, terrestrial_time.n.01, degree_centigrade.n.01, joule.n.01, radius.n.01, q.n.01, florida.n.01, computerized_tomography.n.01



In [15]:
#morphological Analysis
num_sentences = sum(len(doc_sentences) for doc_sentences in all_doc_sentence_data)

# Number of unique lemmas in the entire corpus
unique_lemmas = set()
for doc_sentences in all_doc_sentence_data:
    for sent in doc_sentences:
        unique_lemmas.update(sent['lemmas'])
num_lemmas = len(unique_lemmas)

# Number of unique synsets (excluding None)
num_synsets = sum(1 for syn in lemma_groups_by_synset if syn is not None)

print(f"Number of sentences: {num_sentences}")
print(f"Number of unique lemmas: {num_lemmas}")
print(f"Number of unique synsets: {num_synsets}")

Number of sentences: 12908
Number of unique lemmas: 13177
Number of unique synsets: 5888


In [18]:
#sysets formed for similar lemmas
for synset_name, lemmas in lemma_groups_by_synset.items():
    if synset_name is not None and len(lemmas) > 1:
        print(f"Synset: {synset_name} | Lemmas: {sorted(lemmas)}")

Synset: farming.n.01 | Lemmas: ['agriculture', 'husbandry']
Synset: web_site.n.01 | Lemmas: ['site', 'website']
Synset: hypertext_transfer_protocol.n.01 | Lemmas: ['http', 'https']
Synset: offspring.n.01 | Lemmas: ['issue', 'progeny']
Synset: imagination.n.01 | Lemmas: ['imagination', 'vision']
Synset: first_step.n.01 | Lemmas: ['initiative', 'initiatives']
Synset: transaction.n.01 | Lemmas: ['dealing', 'transaction']
Synset: sow.v.01 | Lemmas: ['seed', 'seeded', 'sow', 'sowing', 'sown']
Synset: snip.v.02 | Lemmas: ['crop', 'cropping', 'prune', 'pruning']
Synset: cosmopolitan.s.03 | Lemmas: ['general', 'worldwide']
Synset: allotment.n.02 | Lemmas: ['allocation', 'allotment']
Synset: supply.v.01 | Lemmas: ['furnish', 'furnishes', 'provide', 'supplied']
Synset: october.n.01 | Lemmas: ['oct', 'october']
Synset: order.v.03 | Lemmas: ['prescribe', 'prescribes']
Synset: tip.v.03 | Lemmas: ['fee', 'feed']
Synset: responsibility.n.03 | Lemmas: ['responsibilities', 'responsibility']
Synset: ker