# Using [Unstructured.io](https://www.unstructured.io/) to process arXiv Papers and Perform Topic Modelling! 

### Import General Use Packages

In [1]:
import arxiv # Interact with arXiv api to scrape papers
from sentence_transformers import SentenceTransformer # Use Hugging Face Embedding for Topic Modelling
from bertopic import BERTopic # Package for Topic Modelling
from tqdm import tqdm #Progress Bar When Iterating
import glob #Identify Files in Directory
import os #Delete Files in Directory
import pandas as pd #Dataframe Manipulation

### Import [Unstructured](https://unstructured-io.github.io/unstructured/installing.html) Bricks

In [2]:
from unstructured.partition.auto import partition #Base Function to Partition PDF
from unstructured.staging.base import convert_to_dict #Convert List Unstructured Elements Into List of Dicts for Easy Parsing
from unstructured.cleaners.core import clean, remove_punctuation, clean_non_ascii_chars #Cleaning Bricks
import re #Create Custom Cleaning Brick
import nltk #Toolkit for more advanced pre-processing
from nltk.corpus import stopwords #list of stopwords to remove
from typing import List #Type Hinting

### Setup NLTK

In [3]:
#Fetch the NLTK Stop-Word Corpus (Needed Before stopwords.words Can Be Used Below)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pravinsanthanam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Create Function to Fetch arXiv Papers Matching a Query and Extract Their Narrative Text

In [6]:
def get_arxiv_paper_texts(query: str, max_results: int = 100) -> List[str]:
    """Fetch arXiv papers matching a query and return their narrative text.

    Each matching paper's PDF is downloaded, partitioned with Unstructured,
    reduced to its ``NarrativeText`` elements and then deleted again.

    Args:
        query (str): Query string passed to the arXiv API.
        max_results (int, optional): Maximum number of papers to fetch. Defaults to 100.

    Returns:
        paper_texts (list[str]): Flat list with one entry per narrative text
            element, in paper order (not one entry per paper).
    """
    #Get List of Arxiv Papers Matching Our Query
    arxiv_papers = list(
        arxiv.Search(
            query = query,
            max_results = max_results,
            sort_by = arxiv.SortCriterion.Relevance,
            sort_order = arxiv.SortOrder.Descending
        )
        .results()
    )

    #Loop Through PDFs, Download and Pre-Process and Then Delete
    paper_texts = []
    for paper in tqdm(arxiv_papers):
        #Use the Path Returned by download_pdf Instead of Globbing for '*.pdf':
        #a Glob Can Match an Unrelated PDF Already in the Directory, Which Would
        #Then Be Parsed and Deleted in Place of the Downloaded Paper
        pdf_file = paper.download_pdf()
        try:
            elements = partition(pdf_file) #Partition PDF Using Unstructured
            isd = convert_to_dict(elements) #Convert List of Elements to List of Dictionaries
            #Only Keep Narrative Text Elements (Each One Stays a Separate List Entry)
            paper_texts.extend(
                element['text'] for element in isd if element['type'] == 'NarrativeText'
            )
        finally:
            os.remove(pdf_file) #Delete PDF Even If Partitioning Failed
    return paper_texts


### Run Scrape + PreProcess Function to Get List of Paper Text To Feed Through Topic Modelling Algorithm

In [7]:
#Fetch and Pre-Process the Papers Matching Our Query
paper_texts = get_arxiv_paper_texts(
    query='natural language processing',
    max_results=10,
)

100%|██████████| 10/10 [04:59<00:00, 29.92s/it]


### Run Narrative Texts Through Custom Cleaner Brick Using Unstructured

In [8]:
#English Stop Words From NLTK, Held in a Set for Membership Checks During Cleaning
stop_words = set(stopwords.words('english'))

#Cleaning Pipeline Applied to Every Narrative Text Element
def custom_clean_brick(narrative_text: str) -> str:
    """Normalise One Narrative Text Element With Unstructured Bricks Plus Custom Steps

    The steps run in this order: strip every run of digits, apply the
    Unstructured ``clean`` brick (extra whitespace, dashes, bullets, trailing
    punctuation, lowercasing), remove punctuation, and finally drop every
    whitespace-separated word found in the module-level ``stop_words`` set.

    Args:
        narrative_text (str): Narrative Text or Any Other Sentence

    Returns:
        cleaned_text (str): The remaining words joined by single spaces
    """
    #Options Forwarded to the Basic Clean Brick (All Enabled)
    clean_options = dict(
        extra_whitespace=True,
        dashes=True,
        bullets=True,
        trailing_punctuation=True,
        lowercase=True,
    )
    without_digits = re.sub(r'\d+', "", narrative_text) #Drop All Digit Runs
    normalised = remove_punctuation(clean(without_digits, **clean_options)) #Clean Brick, Then Strip Punctuation
    kept_words = filter(lambda token: token not in stop_words, normalised.split()) #Discard Stop Words
    return ' '.join(kept_words)

#Run Every Narrative Text Element Through the Cleaning Function
cleaned_paper_texts = list(map(custom_clean_brick, paper_texts))

#Count Narrative Texts
print("Number of Narrative Texts to Run Through Topic Modelling: {}".format(len(cleaned_paper_texts)))

Number of Narrative Texts to Run Through Topic Modelling: 1711


### Set Up [BERTopic](https://maartengr.github.io/BERTopic/index.html)

In [9]:
#Sentence Embedding Model From Hugging Face (Swap the Name to Try Another One)
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

#Topic Model Built on Those Embeddings
#NOTE(review): no random seed is set here, so topics may differ between runs - confirm whether that matters
topic_model = BERTopic(
    embedding_model=sentence_model,
    top_n_words=10,
    nr_topics=10,
    verbose=True,
)

### Run Document Text Through Topic Model To Get Major Topics Discussed in Narrative Texts

In [10]:
#Fit Topic Model on the Cleaned Narrative Texts
topic_model.fit(cleaned_paper_texts)

#Store Document-Topic Info
doc_topic_info = topic_model.get_document_info(cleaned_paper_texts)

#Store Topic Info: One Column per Topic, One Row per Keyword
#get_topics() maps each topic id to a list of tuples whose first item is the keyword.
#The table is built directly instead of via DataFrame.applymap, which is deprecated in pandas >= 2.1.
#Column labels are topic id + 1, so the lowest id (-1 in the run shown below, presumably
#BERTopic's outlier topic - confirm) is labelled topic_0.
topic_info = pd.DataFrame({
    'topic_{}'.format(topic_id + 1): [keyword[0] for keyword in keywords]
    for topic_id, keywords in topic_model.get_topics().items()
})

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

2023-04-14 14:27:29,129 - BERTopic - Transformed documents to Embeddings
2023-04-14 14:27:33,621 - BERTopic - Reduced dimensionality
2023-04-14 14:27:33,647 - BERTopic - Clustered reduced embeddings
2023-04-14 14:27:34,255 - BERTopic - Reduced number of topics from 32 to 10


### Check Out Keywords for Each Topic

In [11]:
#Show the Keyword Table (One Column per Topic)
display(topic_info)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,neural,language,state,function,cost,publication,graph,llama,tangkhul,want
1,network,natural,rnn,distribution,function,april,computation,like,compound,edu
2,function,model,memory,output,sgd,syst,node,south,root,dsontagcoursesinferenceslidespseudolikelihoodn...
3,networks,word,vector,class,training,technol,nodes,animal,morphological,regardlessly
4,one,planning,input,tanh,expected,date,backward,america,verbs,satisfied
5,input,words,network,data,optimization,vol,function,translation,noun,november
6,vector,based,recurrent,yˆ,algorithm,intell,backpropagation,french,roots,tune
7,language,processing,sequence,loss,set,acm,algorithm,cute,adjectives,return
8,model,models,neural,activation,validation,article,parameters,google,formation,fully
9,training,data,lstm,softmax,rate,trans,output,domesticated,language,results


### Visualize Topics

In [13]:
#Render the Fitted Topic Model's Built-In Topic Visualization as the Cell Output
topic_model.visualize_topics()