# Initial exploration of the raw XML data files

In [1]:
import glob
import os

from pathlib import Path
import xml.etree.ElementTree as ET
# get list of files in data/raw


# Locate the data directory relative to where the kernel was started.
# realpath('.') is the working directory, so `current_dir` is actually its
# parent, and `project_dir` is one level above that.
current_dir = os.path.dirname(os.path.realpath('.'))
project_dir = Path(current_dir).parents[0]
data_dir = os.path.join(project_dir, 'data')

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# that slices such as list_of_files[:3] select the same files on every run.
list_of_files = sorted(glob.glob(os.path.join(data_dir, 'raw', '*.xml')))
len(list_of_files)

13300

In [2]:
def get_abstract(filename, verbose=True):
    """Return the abstract text stored in one XML file.

    Only elements two levels below the root are inspected: a grandchild of the
    root whose tag contains ``abstract`` (case-insensitive) is treated as the
    abstract. If several elements match, the last one wins.

    Parameters
    ----------
    filename : str or path-like
        Absolute path to the XML file (as returned by ``glob``), or a bare
        file name, which is resolved relative to ``data_dir/raw``.
    verbose : bool, default True
        When true, print every matching tag together with its text.

    Returns
    -------
    str
        The abstract text, or ``''`` when no abstract element is found or the
        element has no text.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    OSError
        If the file cannot be opened.
    """
    # Absolute paths are used as-is (os.path.join would discard the prefix
    # anyway); only bare file names need the data directory prepended.
    if os.path.isabs(filename):
        path = filename
    else:
        path = os.path.join(data_dir, 'raw', filename)

    # Let the parser read the file itself: it closes the handle when done and
    # decodes the bytes using the encoding declared in the XML, instead of the
    # locale default that a text-mode open() would apply.
    root = ET.parse(path).getroot()

    abstract = ''
    for child in root:
        for subchild in child:
            if 'abstract' in str(subchild.tag).lower():
                # An empty element has text None; normalise it to '' so that
                # callers can always treat the result as a string.
                abstract = subchild.text or ''
                if verbose:
                    print(f'  Sub-tag: {subchild.tag}, Text: {abstract}')
    return abstract


In [3]:
# Collect the abstracts of the first three files, keyed by the file name up to
# its first dot (e.g. '2035358' for '.../2035358.xml').
dict_abstracts = {}
for f in list_of_files[:3]:
    try:
        abstract = get_abstract(f)
        # Path(...).name strips the directory part with whatever separator the
        # platform uses; splitting on '/' would return the whole path on Windows.
        paper = Path(f).name.split('.')[0]

        dict_abstracts[paper] = abstract
    except Exception as e:
        # Best-effort exploration: report the failing file and keep going.
        print(f'Error with {f}')
        print(e)

dict_abstracts

  Sub-tag: AbstractNarration, Text: The NSF Convergence Accelerator supports team-based, multidisciplinary efforts that address challenges of national importance and show potential for deliverables in the near future. The objective of this workshop is to explore topics for potential NSF Convergence Accelerator tracks for FY 2021.&lt;br/&gt;&lt;br/&gt;The Future of Privacy Forum (FPF) will convene a workshop on “The Responsible Use of Data During Times of Crisis”, for pandemics as well as other crisis situations. The workshop will bring together US government leaders, international data protection authorities, corporate leaders, technologists, academic researchers and public health experts to examine benefits, risks, and strategies for the collection and protection of data in support of public health initiatives during crises, including for COVID-19. &lt;br/&gt;&lt;br/&gt;Ensuring that US and global stakeholders leading emergency efforts have the data-based knowledge, tools and governan

{'2035358': "The NSF Convergence Accelerator supports team-based, multidisciplinary efforts that address challenges of national importance and show potential for deliverables in the near future. The objective of this workshop is to explore topics for potential NSF Convergence Accelerator tracks for FY 2021.&lt;br/&gt;&lt;br/&gt;The Future of Privacy Forum (FPF) will convene a workshop on “The Responsible Use of Data During Times of Crisis”, for pandemics as well as other crisis situations. The workshop will bring together US government leaders, international data protection authorities, corporate leaders, technologists, academic researchers and public health experts to examine benefits, risks, and strategies for the collection and protection of data in support of public health initiatives during crises, including for COVID-19. &lt;br/&gt;&lt;br/&gt;Ensuring that US and global stakeholders leading emergency efforts have the data-based knowledge, tools and governance structures to naviga

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources requested by this notebook, in the same order as
# before: 'punkt', 'stopwords' and 'wordnet'.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize; confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Extend NLTK's English stop words with punctuation and with the 'lt', 'br'
# and 'gt' fragments left behind by HTML-escaped markup ('&lt;br/&gt;') in the
# XML text.
# The word list is read through nltk.corpus rather than through the imported
# `stopwords` name, because this assignment rebinds `stopwords` to a set:
# using the imported name here would make a second run of the cell fail with
# "'set' object has no attribute 'words'".
stopwords = set(
    nltk.corpus.stopwords.words('english')
    + ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']
    + ['lt', 'br', 'gt']
)


def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased, every character outside ``a-z`` is replaced by a
    space, the result is tokenised, tokens present in the module-level
    ``stopwords`` set are dropped, and the remaining tokens are lemmatised
    with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str or None
        Raw text; ``None`` and ``''`` are treated as "no text".

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` for empty input.
    """
    # A missing abstract should yield an empty document, not an AttributeError
    # from calling .lower() on None.
    if not text:
        return ''
    # Remove non-alphabetic characters and tokenize
    tokens = word_tokenize(re.sub(r'[^a-zA-Z]', ' ', text.lower()))
    # Remove stopwords and lemmatize
    lemmatizer = WordNetLemmatizer()
    processed = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return ' '.join(processed)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rogergalindo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rogergalindo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rogergalindo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Run every collected abstract through preprocess_text, keeping the same keys.
processed_abstracts = {
    paper_id: preprocess_text(raw_abstract)
    for paper_id, raw_abstract in dict_abstracts.items()
}
processed_abstracts

{'2035358': 'nsf convergence accelerator support team based multidisciplinary effort address challenge national importance show potential deliverable near future objective workshop explore topic potential nsf convergence accelerator track fy future privacy forum fpf convene workshop responsible use data time crisis pandemic well crisis situation workshop bring together u government leader international data protection authority corporate leader technologist academic researcher public health expert examine benefit risk strategy collection protection data support public health initiative crisis including covid ensuring u global stakeholder leading emergency effort data based knowledge tool governance structure navigate pandemic challenge one defining public policy issue time workshop explore proposed convergence accelerator track accelerate collaboration among government industry academic researcher make health data effective usable achieve positive lasting impact future preparedness pan

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the preprocessed abstracts into a TF-IDF matrix.
# Rows follow the iteration order of processed_abstracts.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tfidf_matrix = vectorizer.fit_transform(processed_abstracts.values())

ModuleNotFoundError: No module named 'sklearn'