In [32]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import requests
import xml.dom.minidom as m
import xml.etree.ElementTree as et
import time as time
import json
import plotnine as plt

In [19]:
# import data for Alzheimer articles
al_dict = {}
with open("Alzheimer.json") as f:
    al_dict = json.load(f)

In [20]:
# import data for cancer articles
cn_dict = {}
with open("Alzheimer.json") as f:
    cn_dict = json.load(f)

In [21]:
# create dataframe from al_dict
al_df = pd.DataFrame.from_dict(al_dict, orient='index')
al_df.head()

Unnamed: 0,title,abstract,query,mesh
33939349,Electroconvulsive Therapy for the Treatment of...,Dementia refers to a state of cognitive impair...,,
33841007,Gintonin facilitates brain delivery of donepez...,Gintonin is a ginseng-derived exogenous G-prot...,,
33627920,Bayesian Scalar on Image Regression With Nonig...,Medical imaging has become an increasingly imp...,,
33463291,The Structural Basis of Amyloid Strains in Alz...,Amyloid fibrils represent one of the defining ...,,
33323224,Healthy ageing through internet counselling in...,Although web-based interventions have been pro...,,


In [22]:
# create dataframe from cn_dict
cn_df = pd.DataFrame.from_dict(cn_dict, orient='index')
cn_df.head()

Unnamed: 0,title,abstract,query,mesh
33939349,Electroconvulsive Therapy for the Treatment of...,Dementia refers to a state of cognitive impair...,,
33841007,Gintonin facilitates brain delivery of donepez...,Gintonin is a ginseng-derived exogenous G-prot...,,
33627920,Bayesian Scalar on Image Regression With Nonig...,Medical imaging has become an increasingly imp...,,
33463291,The Structural Basis of Amyloid Strains in Alz...,Amyloid fibrils represent one of the defining ...,,
33323224,Healthy ageing through internet counselling in...,Although web-based interventions have been pro...,,


In [26]:
# count number of Alzheimer papers that have no MeSH terms
# note that the finding of mesh terms in Q1 is incorrect hence no mesh terms exist in dataframes, but this is inaccurate
al_no_mesh = len(al_df[al_df['mesh'] == "N/A"])
print("There are", al_no_mesh, "Alzheimer papers that have no MeSH terms.")

There are 1000 Alzheimer papers that have no MeSH terms.


In [27]:
# count number of cancer papers that have no MeSH terms
cn_no_mesh = len(cn_df[cn_df['mesh'] == "N/A"])
print("There are", cn_no_mesh, "cancer papers that have no MeSH terms.")

There are 1000 cancer papers that have no MeSH terms.


In [None]:
# function for finding 10 most frequent mesh terms
# returns a dictionary sorted by keys in descending order
def mesh_frequency(df):
    mesh_counts = {}
    meshes = df['mesh']
    for m in meshes:
        for i in m:
            if i not in mesh_counts: mesh_counts[i] = 0
            mesh_counts[i] += 1
    dict(sorted(mesh_counts.items(), key=lambda item: item[1], ascending=False))
    return mesh_counts

In [None]:
# 10 most frequent mesh terms in al_df
# due to error in extracting mesh and query information in Q1, below results and graph are incorrect but code could work for another dataset
al_top_10_mesh = mesh_frequency(al_df)

In [None]:
# plot frequency of 10 most common al_df mesh terms
al_x = list(al_top_10_mesh.keys()[:10])
al_y = list(al_top_10_mesh.values()[:10])

plt.bar(range(len(al_top_10_mesh)), al_y, tick_label=al_x)
plt.show()

In [31]:
# 10 most frequent mesh terms in cn_df
# due to error in extracting mesh and query information in Q1, below results and graph are incorrect but code could work for another dataset
cn_top_10_mesh = mesh_frequency(cn_df)

In [None]:
# plot frequency of 10 most common cn_df mesh terms
cn_x = list(cn_top_10_mesh.keys()[:10])
cn_y = list(cn_top_10_mesh.values()[:10])

plt.bar(range(len(cn_top_10_mesh)), cn_y, tick_label=cn_x)
plt.show()

In [None]:
# table comparing number of articles from cn_df and al_df that have both matching mesh terms as specified in homework manual
al_top_5_mesh = al_x[:5]
cn_top_5_mesh = cn_x[:5]

al_cn_mesh = pd.DataFrame(index=al_top_5_mesh, columns=cn_top_5_mesh)

for row in al_top_5_mesh:
    for col in cn_top_5_mesh:
        al_cn_mesh.loc[row][col] = al_top_10_mesh[row] + cn_top_10_mesh[col]

al_cn_mesh.head()