In [17]:
#import libraries
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
import random
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [18]:
documents_dir='../input/CORD-19-research-challenge/document_parses/pdf_json/'
filenames = os.listdir(documents_dir)
print("Number of documents :", len(filenames))

Number of documents : 401214


In [19]:
filenames[:5]

['8f97e16f3842e4bbd2d5d1c0c95ac1e31993ec68.json',
 '8187ea360c53a56ca2c579d758a5d6aa67716836.json',
 'a0d063dca746b135afe0451ce0b3bb1e06cf15ae.json',
 'edb294108440787c9f074483fd3c953a83e53622.json',
 'e0777fb5df224525ee1b06008582b084c1b6b13b.json']

In [20]:
random.shuffle(filenames)

In [21]:
file = json.load(open('../input/CORD-19-research-challenge/document_parses/pdf_json/0000028b5cc154f68b8a269f6578f21e31f62977.json', 'rb'))

In [22]:
# pprint(file)
pprint(file["metadata"]["title"])

'"Multi-faceted" COVID-19: Russian experience'


# Step 1 : Data Cleaning¶

In [23]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    return text

In [24]:
def remove_stopwords_and_tokenize(text):
    my_stopwords = set(stopwords.words("english"))
    tokens = word_tokenize(text)  # tokenize 
    tokens = [t for t in tokens if not t in my_stopwords]  # Remove stopwords
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [25]:
def parse_body_text(body_text):
    body =""
    for item in body_text:
        body += item["section"]
        body += "\n\n"
        body += item["text"]
        body += "\n\n"
    body=clean(body)
    tokens=remove_stopwords_and_tokenize(body)
    return body,tokens

In [26]:
all_text = []
all_tokens=[]
all_titles=[]
for i,filename in enumerate(filenames[:1000]):
    filepath = documents_dir + filename
    file = json.load(open(filepath, 'rb'))
    text,tokens=parse_body_text(file["body_text"])
    all_text.append(text)
    all_tokens.append(tokens)
    all_titles.append(file["metadata"]["title"])

In [27]:
data=pd.DataFrame()
data['text']=all_text
data['tokens']=all_tokens
data['doc_id']=filenames[:1000]
data['title']=all_titles
del all_text,all_tokens,all_titles
data.head(2)

Unnamed: 0,text,tokens,doc_id,title
0,introduction the call to train physicians knowledgeable about and proficient in performance impr...,"[introduction, call, train, physicians, knowledgeable, proficient, performance, improvement, inc...",ad73627f68e8df0a4b0230b638f62f09456f167c.json,A longitudinal course pilot to improve surgical resident acquisition of quality improvement skills
1,introduction proteins are macromolecules playing vital roles in most biological processes under...,"[introduction, proteins, macromolecules, playing, vital, roles, biological, processes, understan...",ae861828dd1f0d6a79f8e9bf1d207d705047069a.json,MUfoldQA_G: High-accuracy protein model QA via retraining and transformation-NC-ND license (http...


# Step 2 : Apply LDA model¶

In [28]:
# Create a dictionary representation of the documents.
# from gensim.corpora import Dictionary
dictionary = Dictionary(data["tokens"])

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [29]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in data["tokens"]]

In [33]:
from gensim.models import LdaModel
# Build LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, random_state=100, chunksize=200, passes=100)
lda_model

<gensim.models.ldamodel.LdaModel at 0x7d96a81c7310>

In [34]:
lda_model.print_topics()[:5]

[(0,
  '0.017*"rna" + 0.013*"viral" + 0.012*"protein" + 0.011*"viruses" + 0.011*"virus" + 0.010*"cells" + 0.008*"proteins" + 0.007*"assay" + 0.006*"samples" + 0.005*"antibodies"'),
 (1,
  '0.011*"social" + 0.007*"users" + 0.006*"people" + 0.005*"process" + 0.005*"work" + 0.004*"like" + 0.004*"researchers" + 0.004*"knowledge" + 0.004*"public" + 0.004*"community"'),
 (2,
  '0.025*"model" + 0.008*"set" + 0.008*"function" + 0.008*"models" + 0.007*"problem" + 0.006*"section" + 0.005*"epidemic" + 0.005*"solution" + 0.005*"optimal" + 0.005*"parameters"'),
 (3,
  '0.250*"de" + 0.131*"la" + 0.088*"en" + 0.058*"na" + 0.037*"et" + 0.031*"si" + 0.028*"un" + 0.027*"se" + 0.026*"con" + 0.019*"ha"'),
 (4,
  '0.021*"protein" + 0.017*"sarscov" + 0.016*"binding" + 0.014*"proteins" + 0.014*"structure" + 0.010*"activity" + 0.010*"compounds" + 0.009*"molecular" + 0.008*"drugs" + 0.008*"interactions"')]

In [35]:
lda_model[corpus][0]

[(1, 0.010122193),
 (5, 0.16011707),
 (6, 0.034634273),
 (10, 0.2700027),
 (12, 0.07439547),
 (13, 0.095658265),
 (16, 0.35018814)]

In [36]:
lda_model.show_topic(1)

[('social', 0.010722387),
 ('users', 0.0068944558),
 ('people', 0.0057363007),
 ('process', 0.005140736),
 ('work', 0.0047215866),
 ('like', 0.0043636975),
 ('researchers', 0.004256799),
 ('knowledge', 0.004256412),
 ('public', 0.004189861),
 ('community', 0.003940274)]

# Step 4: Results¶


* Document - Topic Table¶



In [38]:
def get_document_topic_table(lda_model, corpus, texts=data):  # This function organizes and identifies topics for each document
    # Initialize an empty table to store the results
    document_topic_df = pd.DataFrame()  # Create an empty DataFrame to store topics for documents

    # Loop through each document in the corpus
    for i, row_list in enumerate(lda_model[corpus]):  # Go through each document and its topic distribution
        # Sort topics for the document by their contribution in descending order
        row = sorted(row_list, key=lambda x: (x[1]), reverse=True)  # Arrange topics by importance (highest first)

        # Pick the most important topic for the current document
        topic_num = row[0][0]  # The topic number with the highest contribution
        prop_topic = row[0][1]  # The proportion of the topic in the document

        # Get the top words associated with the most important topic
        wp = lda_model.show_topic(topic_num)  # Fetch the words defining this topic
        topic_keywords = ", ".join([word for word, prop in wp])  # Combine the topic's top words into a string

        # Save the details to the table
        document_topic_df.at[i, 'best_topic'] = topic_num  # Save the topic number
        document_topic_df.at[i, 'prop_topic'] = prop_topic  # Save the proportion of the topic
        document_topic_df.at[i, 'topic_keywords'] = topic_keywords  # Save the words defining the topic
        document_topic_df.at[i, 'document_num'] = i  # Save the document number

    # Return the completed table with topics for all documents
    return document_topic_df  # Give back the table with the topic details

# Call the function to get the table of topics for all documents
document_topic_df = get_document_topic_table(lda_model=lda_model, corpus=corpus, texts=data["tokens"])  # Analyze topics for the tokenized text data

In [51]:
document_topic_df.head()

Unnamed: 0,best_topic,prop_topic,topic_keywords,document_num
0,16.0,0.350181,"students, learning, education, school, quality, online, remote, academic, course, skills",0.0
1,6.0,0.726115,"model, methods, method, models, performance, features, dataset, learning, network, detection",1.0
2,5.0,0.37849,"treatment, outcomes, intervention, mortality, outcome, age, cohort, years, review, statistical",2.0
3,0.0,0.482342,"rna, viral, protein, viruses, virus, cells, proteins, assay, samples, antibodies",3.0
4,0.0,0.521417,"rna, viral, protein, viruses, virus, cells, proteins, assay, samples, antibodies",4.0


# Recommend k topics¶

In [52]:
def get_topic_id(doc_id):  # This function finds the topic ID for a given document ID
    for i, row in data.iterrows():  # Loop through each row in the dataset
        if row["doc_id"] == doc_id:  # Check if the current document ID matches the input document ID
            # Return the topic ID for the matching document
            return document_topic_df["best_topic"][i]  # Fetch the best topic for the matching document
    return -1  # If no match is found, return -1 to indicate failure

def get_matching_topics_docs(topic_id):  # This function finds all documents with the same topic ID
    matched_topics = []  # Initialize a list to store matching topics and documents
    for i, row in document_topic_df.iterrows():  # Loop through each row in the topic table
        if row["best_topic"] == topic_id:  # Check if the topic ID matches the input topic ID
            # Create a tuple with topic ID, proportion, and document number
            topic_prop_doc = (topic_id, row["prop_topic"], i)  
            matched_topics.append(topic_prop_doc)  # Add the tuple to the matched topics list
    return matched_topics  # Return the list of matched topics

def get_top_k_topics(matched_topics, k):  # This function retrieves the top K documents for a given topic
    # Sort the matched topics by their topic proportion in descending order
    top_k = sorted(matched_topics, key=lambda x: [x[1]], reverse=True)  
    
    print(top_k[:k])  # Display the top K matched topics for verification
    
    # Create an empty DataFrame to store details of the top K topics
    k_topics_df = pd.DataFrame(columns=["doc_id", "topic_id", "topic_prop", "title"])  
    
    i = 0  # Counter for the DataFrame rows
    for topic_id, topic_prop, doc_num in top_k[:k]:  # Loop through the top K matched topics
        # Populate the DataFrame with details of each topic
        k_topics_df.at[i, 'doc_id'] = data["doc_id"][doc_num]  # Save the document ID
        k_topics_df.at[i, 'topic_id'] = topic_id  # Save the topic ID
        k_topics_df.at[i, 'topic_prop'] = topic_prop  # Save the topic proportion
        k_topics_df.at[i, 'title'] = data["title"][doc_num]  # Save the document title
        i += 1  # Move to the next row
    return k_topics_df  # Return the DataFrame with top K topics

In [53]:
def recommend_k_topics(doc_id, k):  # This function recommends the top K topics for a given document
    topic_id = get_topic_id(doc_id)  # Get the topic ID for the input document
    if topic_id != -1:  # Check if the topic ID is valid
        # Get all documents that match the topic ID
        matched_topics = get_matching_topics_docs(topic_id)  
        # Get the top K topics from the matched topics
        return get_top_k_topics(matched_topics, k)  

In [54]:
# Example usage: Recommend the top 5 topics for a given document
k_topics_df = recommend_k_topics('/kaggle/input/CORD-19-research-challenge/document_parses/pdf_json/0000028b5cc154f68b8a269f6578f21e31f62977.json', 5)  
k_topics_df  # Display the resulting DataFrame with recommendations