# Script for parsing XML files 

This script reads XML files exported from IU One Search and builds a network graph from them. The graph is saved to disk as graph.gexf so it can be opened in Gephi for visualization and Louvain modularity grouping.

R. Fischer
rkfische@iu.edu

## packages

In [5]:
import xml.etree.ElementTree as ET
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
import math
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.cm as cm

import networkx as nx
#import community.community_louvain
from networkx.algorithms.community.centrality import girvan_newman

from collections import Counter

## Microbiome exports with second tier search for bacteria in the subject lists

In [6]:


# Disabled configuration for the microbiome searches.  Because it is wrapped
# in a bare triple-quoted string, none of the names inside it (xml_files,
# file_tags, bacteria_list) are actually assigned by this cell.
# NOTE(review): records_with_bacteria_subj() reads a global named
# bacteria_list, so this block must be re-enabled (or the list defined
# elsewhere) before the two-tier search can be used.
'''
xml_files = ["microbiome_amplicon.xml", "microbiome_wholegenome.xml", "microbiome_deNovo.xml", 
             "microbiome_index.xml",'microbiome_consortium.xml',
             "microbiome_parkinson.xml", "microbiome_cancer.xml", 
             "microbiome_diabetes.xml", "microbiome_alzheimer.xml",
             "ibs.xml"]
file_tags = ['amp', 'wg', 'dn', 'ind', 'con','p', 'c', 'd', 'a','ib']

bacteria_list = ['escherichia', 'coli', 'staphylococcus', 'aureus',
                 'enterococcus', 'faecalis',
                'staphylococcus', 'epidermidis', 'carbapenems',
                'salmonella', 'lactobacillus', 'streptococcus',
                'prevotella', 'coxiella', 'burnetii', 'legionella',
                'akkermansia', 'bifidobacterium',
                'lactobacillus', 'escherichia', 
                'roseburia', 'blautia', 'faecalibacterium']
'''

'\nxml_files = ["microbiome_amplicon.xml", "microbiome_wholegenome.xml", "microbiome_deNovo.xml", \n             "microbiome_index.xml",\'microbiome_consortium.xml\',\n             "microbiome_parkinson.xml", "microbiome_cancer.xml", \n             "microbiome_diabetes.xml", "microbiome_alzheimer.xml",\n             "ibs.xml"]\nfile_tags = [\'amp\', \'wg\', \'dn\', \'ind\', \'con\',\'p\', \'c\', \'d\', \'a\',\'ib\']\n\nbacteria_list = [\'escherichia\', \'coli\', \'staphylococcus\', \'aureus\',\n                 \'enterococcus\', \'faecalis\',\n                \'staphylococcus\', \'epidermidis\', \'carbapenems\',\n                \'salmonella\', \'lactobacillus\', \'streptococcus\',\n                \'prevotella\', \'coxiella\', \'burnetii\', \'legionella\',\n                \'akkermansia\', \'bifidobacterium\',\n                \'lactobacillus\', \'escherichia\', \n                \'roseburia\', \'blautia\', \'faecalibacterium\']\n'

## Tele-critical care search output

In [7]:
# Search exports to process.  The two lists are consumed pairwise, so the
# n-th tag belongs to the n-th file.
xml_files = [
    "tcc.xml",
]
file_tags = [
    't',
]


## Functions for parsing the XML files, creating record lists, tokenizing abstracts and creating the network graph file

In [8]:
def create_root(file):
    """Parse the XML document *file* and return its root element.

    *file* is passed straight to ``ET.parse``.
    """
    return ET.parse(file).getroot()

In [9]:
def records_no_sub_subj(root):
    """Collect the ``resultID`` of every ``rec`` element directly under *root*.

    Returns a two-item list ``[result_ids, article_count]``, where
    ``article_count`` is the number of ``rec`` elements found.  A ``rec``
    that lacks a ``resultID`` attribute raises ``KeyError``.
    """
    recs = root.findall("rec")
    result_ids = [rec.attrib['resultID'] for rec in recs]
    return [result_ids, len(recs)]

In [10]:
def records_with_bacteria_subj(root, bacteria=None):
    """Select the records whose subject headings mention a listed bacterium.

    Every ``rec`` element directly under *root* is counted.  A record is
    selected when any whitespace-separated, lower-cased word of any of its
    subject elements is one of the *bacteria* names.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed search export.
    bacteria : iterable of str, optional
        Names to look for; they are compared against lower-cased subject
        words.  When omitted, the module-level ``bacteria_list`` is used;
        if that global has not been defined a ``NameError`` is raised
        instead of silently selecting nothing.

    Returns
    -------
    list
        ``[record_list, article_count]``: the ``resultID`` values of the
        selected records and the total number of ``rec`` elements seen.
    """
    if bacteria is None:
        bacteria = bacteria_list
    # Build the lookup once; membership is tested for every subject word.
    wanted = set(bacteria)

    record_list = []
    article_count = 0
    for record in root.findall("rec"):
        article_count += 1
        result_id = record.attrib.get('resultID')
        if result_id is None:
            # A record without an ID cannot be referenced later; skip it.
            continue
        # The path is kept exactly as before; its trailing "/" selects the
        # child elements of ``sug``.
        for subj in record.findall("header/controlInfo/artinfo/sug/"):
            if subj.text is None:
                # Empty subject element: nothing to match, and it must not
                # stop the remaining subjects from being checked.
                continue
            if not wanted.isdisjoint(subj.text.lower().split()):
                record_list.append(result_id)
                break
    return [record_list, article_count]


In [11]:

def get_tag(find_code, record_list, root, tag, G, flag=0 ):
    """Extract one value per selected record and link it to the record in *G*.

    For every ``rec`` element under *root* whose ``resultID`` is selected, a
    node named ``tag + resultID`` is added to *G* with ``type='recordID'``.
    One value is then extracted and, unless it starts with ``"NA"``, an edge
    from the record node to that value is added.

    Parameters
    ----------
    find_code : str
        ElementTree path evaluated relative to each record.
    record_list : str or iterable of str
        The ``resultID`` values to process.  A single ID string is treated
        as one ID and matched exactly (it used to be substring-matched, so
        ``"10"`` also selected records ``"1"`` and ``"0"``).
    root : xml.etree.ElementTree.Element
        Root element of a parsed search export.
    tag : str
        Prefix put in front of each ``resultID`` to form the node name.
    G : graph object providing ``add_node`` and ``add_edge``
    flag : int
        ``0`` (or less): read the ``year`` attribute of the first element
        matching *find_code*.
        ``1``: use the first lower-cased word of the matched element text.
        ``2`` or more: use the first 30 lower-cased characters of the text.

    Returns
    -------
    (info_list, rec_list)
        Parallel lists with the extracted value and the record node name of
        each processed record.  The value is ``"NAB"`` when no matched
        element supplied any text and ``"NAT"`` when the lookup itself
        failed.

    NOTE(review): for ``flag > 0`` only ONE value survives per record: the
    one from the last matching element that has text.  This is what the
    original code did; confirm whether every subject was meant to be linked.
    """
    if isinstance(record_list, str):
        wanted_ids = {record_list}
    else:
        wanted_ids = set(record_list)

    info_list = []
    rec_list = []
    for record in root.findall("rec"):
        if record.attrib['resultID'] not in wanted_ids:
            continue
        recID = tag + record.attrib['resultID']
        G.add_node(recID, type='recordID')
        rec_list.append(recID)

        if flag > 0:
            tag_text = "NAB"
            try:
                matches = record.findall(find_code)
            except (SyntaxError, KeyError):
                # ElementTree rejects malformed paths with these exceptions.
                tag_text = "NAT"
                matches = []
            for match in matches:
                if match.text is None:
                    continue
                text = match.text.lower()
                if flag > 1:
                    tag_text = text[0:30]
                else:
                    words = text.split()
                    if words:
                        tag_text = words[0]
        else:
            try:
                tag_text = record.find(find_code).attrib['year']
            except (AttributeError, KeyError, SyntaxError):
                # No matching element, no ``year`` attribute, or a bad path.
                tag_text = "NAT"

        info_list.append(tag_text)
        if not tag_text.startswith("NA"):
            G.add_edge(recID, tag_text)
    return info_list, rec_list


In [12]:

def create_graph(node1,node2,label1, label2):
    """Add paired nodes and their connecting edges to the global graph ``G``.

    The two sequences are walked in lockstep.  A value equal to ``"NAT"`` is
    not added as a node; an edge is added only when neither value of the
    pair is ``"NAT"``.  Nodes from *node1* get ``type=label1`` and nodes from
    *node2* get ``type=label2``.
    """
    for left, right in zip(node1, node2):
        left_ok = left != "NAT"
        right_ok = right != "NAT"
        if left_ok:
            G.add_node(left, type=label1)
        if right_ok:
            G.add_node(right, type=label2)
        if left_ok and right_ok:
            G.add_edge(left, right)

def create_edge(list1, list2):
    """Add an edge to the global graph ``G`` from each item of *list1* to each item of *list2*."""
    pairs = ((src, dst) for src in list1 for dst in list2)
    for src, dst in pairs:
        G.add_edge(src, dst)

In [13]:
def get_abstracts(record_list, root):
    """Return the distinct abstracts of the selected records.

    Parameters
    ----------
    record_list : str or iterable of str
        ``resultID`` values of the records to read.  A single ID string is
        treated as one ID.
    root : xml.etree.ElementTree.Element
        Root element of a parsed search export.

    Returns
    -------
    list
        ``[ab_text_list, rec_nbr]``: the abstract texts and, in the same
        order, the ``resultID`` of the record each one came from.

    Records with no ``ab`` element, or with an empty one, are skipped so the
    result never contains ``None``.  An abstract whose text has already been
    collected from an earlier record is skipped as well.  Only the first
    ``ab`` element of a record is read.
    """
    if isinstance(record_list, str):
        wanted_ids = {record_list}
    else:
        wanted_ids = set(record_list)

    ab_text_list = []
    rec_nbr = []
    seen = set()  # constant-time duplicate check for abstract texts
    for record in root.findall("rec"):
        if record.attrib['resultID'] not in wanted_ids:
            continue
        ab = record.find("header/controlInfo/artinfo/ab")
        if ab is None or ab.text is None:
            continue
        if ab.text in seen:
            continue
        seen.add(ab.text)
        ab_text_list.append(ab.text)
        rec_nbr.append(record.attrib['resultID'])
    return [ab_text_list, rec_nbr]
    

In [35]:
def topic_generation(text_list, tag, rec_nbr, number_of_topics, number_of_tokens):
    """Vectorise the abstracts and derive NMF topics from the word counts.

    Parameters
    ----------
    text_list : list of str
        Abstract texts, one per record.
    tag : str
        Search-file tag; prefixed to record numbers and topic numbers.
    rec_nbr : list
        Record numbers, in the same order as *text_list*.
    number_of_topics : int
        NOTE(review): accepted for compatibility but currently ignored; the
        topic count is always recomputed from the matrix below.  Confirm
        whether callers should be able to override it.
    number_of_tokens : int
        Number of highest-weighted words kept per topic.

    Returns
    -------
    (topic_list, search_list, tf_idf_df)
        ``topic_list`` holds, per topic, its top words in ascending weight
        order; ``search_list`` holds the matching labels ``tag + "1"``,
        ``tag + "2"``, ...; ``tf_idf_df`` is the document-by-word matrix
        with ``tag + record number`` as row index.  Despite its name the
        frame holds raw counts, because a ``CountVectorizer`` is used.
    """
    vect = CountVectorizer(analyzer='word',
                           max_features=250, min_df=2, ngram_range=(1, 1),
                           stop_words='english', strip_accents='ascii')
    X = vect.fit_transform(text_list)

    # ``get_feature_names`` was removed from recent scikit-learn releases;
    # prefer its replacement and fall back only where it is not available.
    get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
    feature_names = np.asarray(get_names())

    # merge the search file tag to the record number to link back to a specific article
    index_rec = [tag + str(x) for x in rec_nbr]
    # word tokens as columns and file record number as row index
    tf_idf_df = pd.DataFrame(X.toarray(), columns=feature_names, index=index_rec)

    # The number of non-zero cells drives the topic-count calculation:
    # (documents * vocabulary size) / non-zero cells.
    non_zero_values = X.count_nonzero()
    number_of_topics = round((len(text_list) * len(feature_names)) / non_zero_values)

    nmf = NMF(n_components=number_of_topics, solver="mu")
    nmf.fit(X)

    topic_list = []
    search_list = []
    for i, topic in enumerate(nmf.components_):
        top_words = feature_names[topic.argsort()[-number_of_tokens:]]
        topic_list.append(list(top_words))
        search_list.append(tag + str(i + 1))
    return topic_list, search_list, tf_idf_df

## Main script calling the functions above

A note on hacking vs. coding: what gets exported to the graph file is limited by commenting out the corresponding tag calls in the cell below. The final product is the network graph "graph.gexf", which is exported for reading into Gephi for visualization and modularity grouping.

In [45]:
# Build one directed graph from all configured search exports and write it to
# 'graph.gexf'.
#
# Changes compared with the earlier version of this cell:
#   * abstracts, topics and tags are computed once per file instead of once
#     per record (every pass produced the same result);
#   * get_tag() receives the whole record list rather than a single ID
#     string, which was being substring-matched against the IDs;
#   * word counts are read by column label instead of by integer position;
#   * the "NA..." placeholders returned by get_tag() are no longer added to
#     the graph as subject nodes.
G = nx.DiGraph()

# 0 = single-tier search (all records); > 0 = keep only records whose
# subjects mention a bacterium.
sub_subj_flag = 0

for file, tag in zip(xml_files, file_tags):
    # for linking results to the search tag vs individual articles
    # G.add_node(tag, type='keyword')

    print("\n", file, "tag", tag)
    # parse the file
    root = create_root(file)

    if sub_subj_flag > 0:
        record_list, article_count = records_with_bacteria_subj(root)
    else:
        record_list, article_count = records_no_sub_subj(root)

    if not record_list:
        # Nothing selected in this file; as before, it contributes nothing.
        continue

    # NMF topics, plus the article-by-word count matrix used for the graph
    ab_text_list, rec_nbr = get_abstracts(record_list, root)
    topic_list, search_list, tf_idf_df = topic_generation(ab_text_list, tag, rec_nbr, 10, 5)

    # Optional article components.  Uncomment a pair to add it to the graph.
    #authors, recs1 = get_tag("header/controlInfo/artinfo/aug/", record_list, root, tag, G, flag=2)
    #create_graph(authors, recs1, "author", "rec")

    #journals, recs2 = get_tag(".//jtl", record_list, root, tag, G, flag=2)
    #create_graph(journals, recs2, "journal", "rec")

    #year, recs3 = get_tag(".//dt", record_list, root, tag, G, flag=0)
    #create_graph(year, recs3, "year", "rec")

    #place, recs5 = get_tag(".//place", record_list, root, tag, G, flag=2)
    #create_graph(place, recs5, "place", "rec")

    subjects, recs4 = get_tag("header/controlInfo/artinfo/sug/", record_list, root, tag, G, flag=1)
    #create_graph(subjects, recs4, "subject", "rec")

    # Link every article to each word that occurs in its abstract.
    for rec_id, counts in tf_idf_df.iterrows():
        for word, count in counts.items():
            if count > 0:
                G.add_node(word, type='topic')
                G.add_edge(rec_id, word)

    # Type the subject nodes.  This runs after the topic typing, so a word
    # that is both an abstract token and a subject ends up as 'subject'.
    df4 = pd.DataFrame(subjects, columns=['node'])
    for node in df4['node']:
        if not node.startswith("NA"):
            G.add_node(node, type='subject')

    # Kept for inspection in later cells; holds the last processed file.
    df_nodes = df4.copy(deep=True)

nx.write_gexf(G, 'graph.gexf')


 tcc.xml tag t


In [27]:
# Preview the first rows of the node table assigned to df_nodes by the main cell.
df_nodes.head()


Unnamed: 0,node
0,"department of surgery, naval m"
1,NAB
0,female
1,NAB
