In [1]:
from Bio import Entrez
from pprint import pprint
import pandas as pd 
from collections import Counter



### get mesh structure

In [2]:
import re
meshFile = '/home/adrian/PhD/Data/Pubmed/MeshCodes/d2020.bin'
uiMeshList = {}
NumberMeshList = {}
terms = {}
with open(meshFile, mode='r') as file:
    mesh = file.readlines()

    for i, line in enumerate(mesh):
        if "NEWRECORD" in line and i > 0 and ui is not "" and term is not "" and number is not "s":
            uiMeshList[UI] = {"MH" : term, "MN" : number }
            NumberMeshList[number] = {"MH" : term, "UI" : UI }
            ui = ""
            term = ""
            number = ""
        
                
        meshTerm = re.search('MH = (.+)$', line)
        if meshTerm:
            term = meshTerm.group(1)

        meshNumber = re.search('MN = (.+)$', line)
        if meshNumber:
            number = meshNumber.group(1)
            
        ui = re.search("UI = (.+)", line)
        if ui:
            UI = ui.group(1)

        
print(len(NumberMeshList.keys()))
diabetesMesh = {k:v for k,v in NumberMeshList.items() if k.startswith("C19.246")}
print(len(diabetesMesh.keys()))
diabetesMesh

29638
16


{'C19.246': {'MH': 'Diabetes Mellitus', 'UI': 'D003920'},
 'C19.246.300.500': {'MH': 'Diabetes Mellitus, Lipoatrophic', 'UI': 'D003923'},
 'C19.246.300': {'MH': 'Diabetes Mellitus, Type 2', 'UI': 'D003924'},
 'C19.246.099.500': {'MH': 'Diabetic Angiopathies', 'UI': 'D003925'},
 'C19.246.099.750': {'MH': 'Diabetic Coma', 'UI': 'D003926'},
 'C19.246.099.875': {'MH': 'Diabetic Nephropathies', 'UI': 'D003928'},
 'C19.246.099.937': {'MH': 'Diabetic Neuropathies', 'UI': 'D003929'},
 'C19.246.099.500.382': {'MH': 'Diabetic Retinopathy', 'UI': 'D003930'},
 'C19.246.099.750.490': {'MH': 'Hyperglycemic Hyperosmolar Nonketotic Coma',
  'UI': 'D006944'},
 'C19.246.774': {'MH': 'Prediabetic State', 'UI': 'D011236'},
 'C19.246.200': {'MH': 'Diabetes, Gestational', 'UI': 'D016640'},
 'C19.246.099.812': {'MH': 'Diabetic Ketoacidosis', 'UI': 'D016883'},
 'C19.246.099.937.250': {'MH': 'Diabetic Foot', 'UI': 'D017719'},
 'C19.246.099': {'MH': 'Diabetes Complications', 'UI': 'D048909'},
 'C19.246.537': {'

In [3]:
# Add missing diabetes mesh manually : https://meshb.nlm.nih.gov/treeView
diabetesMeshTotal = diabetesMesh.copy()
diabetesMeshTotal["C19.246.656"] = {"MH" :"Latent Autoimmune Diabetes in Adults", "UI":"D000071698"}
diabetesMeshTotal["C19.246.267.960"] = {"MH" :"Wolfram Syndrome", "UI":"D014929"}
diabetesMeshTotal["C19.246.267"] = {"MH" :"Diabetes Mellitus, Type 1", "UI":"D003922"}
diabetesMeshTotal["C19.246.240"] = {"MH" :"Diabetes Mellitus, Experimental", "UI":"D003921"}
diabetesMeshTotal["C19.246.099.968"] = {"MH" :"Fetal Macrosomia", "UI":"D005320"}
diabetesMeshTotal["C19.246.099.500.191"] = {"MH" :"Diabetic Foot", "UI":"D017719"}

print(len(diabetesMeshTotal))
diabetesMeshTotal

22


{'C19.246': {'MH': 'Diabetes Mellitus', 'UI': 'D003920'},
 'C19.246.300.500': {'MH': 'Diabetes Mellitus, Lipoatrophic', 'UI': 'D003923'},
 'C19.246.300': {'MH': 'Diabetes Mellitus, Type 2', 'UI': 'D003924'},
 'C19.246.099.500': {'MH': 'Diabetic Angiopathies', 'UI': 'D003925'},
 'C19.246.099.750': {'MH': 'Diabetic Coma', 'UI': 'D003926'},
 'C19.246.099.875': {'MH': 'Diabetic Nephropathies', 'UI': 'D003928'},
 'C19.246.099.937': {'MH': 'Diabetic Neuropathies', 'UI': 'D003929'},
 'C19.246.099.500.382': {'MH': 'Diabetic Retinopathy', 'UI': 'D003930'},
 'C19.246.099.750.490': {'MH': 'Hyperglycemic Hyperosmolar Nonketotic Coma',
  'UI': 'D006944'},
 'C19.246.774': {'MH': 'Prediabetic State', 'UI': 'D011236'},
 'C19.246.200': {'MH': 'Diabetes, Gestational', 'UI': 'D016640'},
 'C19.246.099.812': {'MH': 'Diabetic Ketoacidosis', 'UI': 'D016883'},
 'C19.246.099.937.250': {'MH': 'Diabetic Foot', 'UI': 'D017719'},
 'C19.246.099': {'MH': 'Diabetes Complications', 'UI': 'D048909'},
 'C19.246.537': {'

In [4]:
diabetesMeshTotalInv = {v["UI"] : {"MN" : k, "MH" : v["MH"]} for k, v in diabetesMeshTotal.items()}
diabetesMeshTotalInv

{'D003920': {'MN': 'C19.246', 'MH': 'Diabetes Mellitus'},
 'D003923': {'MN': 'C19.246.300.500', 'MH': 'Diabetes Mellitus, Lipoatrophic'},
 'D003924': {'MN': 'C19.246.300', 'MH': 'Diabetes Mellitus, Type 2'},
 'D003925': {'MN': 'C19.246.099.500', 'MH': 'Diabetic Angiopathies'},
 'D003926': {'MN': 'C19.246.099.750', 'MH': 'Diabetic Coma'},
 'D003928': {'MN': 'C19.246.099.875', 'MH': 'Diabetic Nephropathies'},
 'D003929': {'MN': 'C19.246.099.937', 'MH': 'Diabetic Neuropathies'},
 'D003930': {'MN': 'C19.246.099.500.382', 'MH': 'Diabetic Retinopathy'},
 'D006944': {'MN': 'C19.246.099.750.490',
  'MH': 'Hyperglycemic Hyperosmolar Nonketotic Coma'},
 'D011236': {'MN': 'C19.246.774', 'MH': 'Prediabetic State'},
 'D016640': {'MN': 'C19.246.200', 'MH': 'Diabetes, Gestational'},
 'D016883': {'MN': 'C19.246.099.812', 'MH': 'Diabetic Ketoacidosis'},
 'D017719': {'MN': 'C19.246.099.500.191', 'MH': 'Diabetic Foot'},
 'D048909': {'MN': 'C19.246.099', 'MH': 'Diabetes Complications'},
 'D056731': {'MN':

## Get pubmed articles

In [5]:
# get articles 
def search(query, Nhits=20):
    Entrez.email = 'a.ahne@epiconcept.fr'
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax=Nhits,
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle) # Parse an XML file from the NCBI Entrez Utilities into python objects
    return results


def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'a.ahne@epiconcept.fr'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle) # Parse an XML file from the NCBI Entrez Utilities into python objects
    return results


def getMeshListFromDoc(doc): 
    matches = [match["DescriptorName"].attributes["UI"] 
               for match in doc["MedlineCitation"]["MeshHeadingList"]
               if match["DescriptorName"].attributes["UI"] in diabetesMeshTotalInv.keys()
              ]

    if len(matches) > 0 : 
        return matches
    else:
        return None

def parseDate(doc):
    if "DateCompleted" in doc.keys():
        return "{}-{}-{}".format(doc["DateCompleted"]["Year"], doc["DateCompleted"]["Month"], doc["DateCompleted"]["Day"])
    elif "DateRevised" in doc.keys():
        return "{}-{}-{}".format(doc["DateRevised"]["Year"], doc["DateRevised"]["Month"], doc["DateRevised"]["Day"])
    else:
        return None

def clean(doc):
    if "[" == doc[0]:
        doc = doc[1:]
    if "." == doc[-1]:
        doc = doc[0:-1]
    if "]" == doc[-1]:
        doc = doc[0:-1]
    return doc



Nhits = 10000
results = search('diabetes', Nhits)
print(len(results['IdList']))

id_list = results['IdList']
papers = fetch_details(id_list)
print(len(papers["PubmedArticle"]))

rows = []
for i, doc in enumerate(papers["PubmedArticle"]):
    
    if "MeshHeadingList" in doc["MedlineCitation"] and "Abstract" in doc["MedlineCitation"]["Article"]: 
        meshList = getMeshListFromDoc(doc)
        if meshList:
            rows.append({"PMID" : doc["MedlineCitation"]["PMID"]
                        ,"DateCompleted" : parseDate(doc["MedlineCitation"])
                        ,"Title" : clean(doc["MedlineCitation"]["Article"]["ArticleTitle"])
                        ,"Abstract" : doc["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]
                        ,"MeshList" : meshList
                        })

diabetesArticles = pd.DataFrame(rows, columns=["PMID", "DateCompleted", "Title", "Abstract", "MeshList"])
diabetesArticles['DateCompleted'] =  pd.to_datetime(diabetesArticles['DateCompleted'], format='%Y-%m-%d')

print(diabetesArticles.shape)
diabetesArticles.head(50)


10000
9997
(7815, 5)


Unnamed: 0,PMID,DateCompleted,Title,Abstract,MeshList
0,29659364,2019-02-05,Association between central diabetes insipidus...,Central diabetes insipidus is a rare disease o...,[D003924]
1,30071825,2019-05-21,Fatal Fournier's gangrene caused by Clostridiu...,Clostridium ramosum is a generally non-pathoge...,[D003922]
2,21842608,2011-11-03,Case of distal renal tubular acidosis complica...,We report herein a 27-year-old male case of in...,[D048909]
3,17026722,2007-01-01,The value of urine specific gravity in detecti...,When a patient with diabetes mellitus presents...,[D003924]
4,25255707,2016-01-12,Congenital central diabetes insipidus and opti...,Wolfram syndrome (WS) is an autosomal recessiv...,[D014929]
5,28588004,2018-05-23,Long-Term Follow-up of a Case with Proprotein ...,Proprotein convertase 1/3 (PC1/3) deficiency i...,"[D003920, D016883]"
6,26783152,2016-10-17,Early central diabetes insipidus: An ominous s...,Central diabetes insipidus (CDI) after cardiac...,"[D058065, D003928]"
7,25301491,2016-10-31,False diagnosis of type 1 diabetes mellitus an...,"Wolfram syndrome (WS), also known as DIDMOAD (...","[D003922, D014929]"
8,17536072,2007-09-06,Microvascular diabetes complications in Wolfra...,Some previous studies suggested that patients ...,"[D003925, D014929]"
9,25330715,2015-10-01,"The spectrum of clinical presentation, diagnos...",Primary mitochondrial diseases refer to a grou...,[D003920]


In [6]:
flat_list = [code for meshlist in diabetesArticles["MeshList"].values for code in meshlist]
print(len(flat_list))
Counter(flat_list)

10378


Counter({'D003924': 4486,
         'D003922': 1186,
         'D048909': 622,
         'D014929': 13,
         'D003920': 1908,
         'D016883': 68,
         'D058065': 74,
         'D003928': 216,
         'D003925': 231,
         'D016640': 700,
         'D003923': 2,
         'D011236': 154,
         'D003921': 374,
         'D003929': 123,
         'D017719': 33,
         'D006944': 4,
         'D003930': 141,
         'D005320': 38,
         'D056731': 1,
         'D000071698': 4})

### keep only records with one diabetes mesh code (need to reduce the dataset to be able to run it)

In [7]:
# keep only records with one diabetes Mesh code

diabArtOneMesh = diabetesArticles[diabetesArticles.apply(lambda row: len(row["MeshList"]) == 1
                                                                      , axis=1)]
diabArtOneMesh["MeshUI"] = diabArtOneMesh["MeshList"].map(lambda x: x[0]) # list entry to string
diabArtOneMesh["MeshNumber"] = diabArtOneMesh.apply(lambda row: diabetesMeshTotalInv[row["MeshUI"]]["MN"], axis=1)
diabArtOneMesh["MeshHeading"] = diabArtOneMesh.apply(lambda row: diabetesMeshTotalInv[row["MeshUI"]]["MH"], axis=1)
del diabArtOneMesh["MeshList"]
print(diabArtOneMesh.shape)
diabArtOneMesh.head(50)

(5665, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,PMID,DateCompleted,Title,Abstract,MeshUI,MeshNumber,MeshHeading
0,29659364,2019-02-05,Association between central diabetes insipidus...,Central diabetes insipidus is a rare disease o...,D003924,C19.246.300,"Diabetes Mellitus, Type 2"
1,30071825,2019-05-21,Fatal Fournier's gangrene caused by Clostridiu...,Clostridium ramosum is a generally non-pathoge...,D003922,C19.246.267,"Diabetes Mellitus, Type 1"
2,21842608,2011-11-03,Case of distal renal tubular acidosis complica...,We report herein a 27-year-old male case of in...,D048909,C19.246.099,Diabetes Complications
3,17026722,2007-01-01,The value of urine specific gravity in detecti...,When a patient with diabetes mellitus presents...,D003924,C19.246.300,"Diabetes Mellitus, Type 2"
4,25255707,2016-01-12,Congenital central diabetes insipidus and opti...,Wolfram syndrome (WS) is an autosomal recessiv...,D014929,C19.246.267.960,Wolfram Syndrome
9,25330715,2015-10-01,"The spectrum of clinical presentation, diagnos...",Primary mitochondrial diseases refer to a grou...,D003920,C19.246,Diabetes Mellitus
10,20500966,2010-07-22,Gestational diabetes insipidus: a review of an...,"To review the etiology, diagnosis, and managem...",D016640,C19.246.200,"Diabetes, Gestational"
11,23380272,2013-08-26,Gestational diabetes insipidus during a twin p...,Gestational diabetes insipidus is an uncommon ...,D016640,C19.246.200,"Diabetes, Gestational"
12,31038066,2020-04-08,Relationship Between Hormonal Mechanisms of Di...,"Obesity, diabetes mellitus may be related to t...",D003920,C19.246,Diabetes Mellitus
13,31884765,2020-01-23,The mortality of diabetes mellitus from the pe...,The statistics of causes of death is the infor...,D003920,C19.246,Diabetes Mellitus


In [8]:
Counter(diabArtOneMesh["MeshHeading"].values)


Counter({'Diabetes Mellitus, Type 2': 3065,
         'Diabetes Mellitus, Type 1': 531,
         'Diabetes Complications': 92,
         'Wolfram Syndrome': 3,
         'Diabetes Mellitus': 1390,
         'Diabetes, Gestational': 449,
         'Diabetic Nephropathies': 5,
         'Prediabetic State': 1,
         'Diabetes Mellitus, Experimental': 112,
         'Diabetic Angiopathies': 5,
         'Diabetic Cardiomyopathies': 1,
         'Diabetic Neuropathies': 6,
         'Diabetic Foot': 3,
         'Diabetic Retinopathy': 1,
         'Fetal Macrosomia': 1})

In [9]:

#train = diabArtOneMesh[(diabArtOneMesh["MeshHeading"] =="Diabetes Mellitus, Type 1") | (diabArtOneMesh["MeshHeading"] == "Diabetes Complications") ]
train = diabArtOneMesh[(diabArtOneMesh["MeshHeading"] == "Diabetes Complications")
                       | (diabArtOneMesh["MeshHeading"] == "Diabetes, Gestational")
                       | (diabArtOneMesh["MeshHeading"] == "Diabetes Mellitus, Experimental")] \
        .append(diabArtOneMesh[diabArtOneMesh["MeshHeading"] == "Diabetes Mellitus, Type 2"].sample(n=200)) \
        .append(diabArtOneMesh[diabArtOneMesh["MeshHeading"] == "Diabetes Mellitus"].sample(n=200)) \
        .append(diabArtOneMesh[diabArtOneMesh["MeshHeading"] == "Diabetes Mellitus, Type 1"].sample(n=200))
 
#train = train.append(diabArtOneMesh[diabArtOneMeshtOneMesh["MeshHeading"] == "Diabetes Mellitus, Type 2"].sample(n=500))
print(train.shape)
train.head()

(1253, 7)


Unnamed: 0,PMID,DateCompleted,Title,Abstract,MeshUI,MeshNumber,MeshHeading
2,21842608,2011-11-03,Case of distal renal tubular acidosis complica...,We report herein a 27-year-old male case of in...,D048909,C19.246.099,Diabetes Complications
10,20500966,2010-07-22,Gestational diabetes insipidus: a review of an...,"To review the etiology, diagnosis, and managem...",D016640,C19.246.200,"Diabetes, Gestational"
11,23380272,2013-08-26,Gestational diabetes insipidus during a twin p...,Gestational diabetes insipidus is an uncommon ...,D016640,C19.246.200,"Diabetes, Gestational"
41,25646595,2015-12-17,Transsphenoidal surgery and diabetes mellitus:...,Transsphenoidal surgery (TSS) has emerged as t...,D048909,C19.246.099,Diabetes Complications
171,30866836,2019-04-26,Prevalence of diabetes mellitus among tubercul...,Tuberculosis and diabetes mellitus are signifi...,D048909,C19.246.099,Diabetes Complications


In [59]:
train.to_parquet("/home/adrian/PhD/Data/Pubmed/diabetes_test_abstracts.parquet")

## Remove documents who have the root class :  "D003920", "Diabetes Mellitus", "C19.246"

In [7]:
import pandas as pd
data = pd.read_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000.parquet")
data.head(2)

Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep
0,28800712,Outcomes Achieved With Use of a Prefabricated ...,BACKGROUND\nThe total contact cast (TCC) is co...,2017-10,"D000328,D000367,D000368,D000369,D002370,D01533...","Adult,Age Factors,Aged,Aged, 80 and over,Casts...",D017719,Diabetic Foot,outcomes achieved with use of a prefabricated ...
1,6989594,Investigation of insulin sensitivity in early ...,Twenty-three normal weight subjects without an...,1980-01,"D001786,D001835,D005230,D005951,D006801,D00732...","Blood Glucose,Body Weight,Fatty Acids, Noneste...",D011236,Prediabetic State,investigation of insulin sensitivity in early ...


In [8]:
print(data.shape)
data2 = data[data["mesh_mh_diab"] != "Diabetes Mellitus"]
print(data2.shape)

(55911, 9)
(50911, 9)


In [9]:
data2.to_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus.parquet")


In [1]:
import pandas as pd 
data = pd.read_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus.parquet")
print(data.shape)
data.head(2)

(50911, 11)


Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep,vec,mesh_ui_diab_with_childs
0,28800712,Outcomes Achieved With Use of a Prefabricated ...,BACKGROUND\nThe total contact cast (TCC) is co...,2017-10,"D000328,D000367,D000368,D000369,D002370,D01533...","Adult,Age Factors,Aged,Aged, 80 and over,Casts...",D017719,Diabetic Foot,outcomes achieved with use of a prefabricated ...,"[-0.1456865, 0.10184939, 0.08282469, 0.0834905...","[D003925, D003929, D048909]"
1,6989594,Investigation of insulin sensitivity in early ...,Twenty-three normal weight subjects without an...,1980-01,"D001786,D001835,D005230,D005951,D006801,D00732...","Blood Glucose,Body Weight,Fatty Acids, Noneste...",D011236,Prediabetic State,investigation of insulin sensitivity in early ...,"[-0.13539475, 0.06387955, 0.0704756, 0.1377651...",[]


In [26]:
diab_gestational = data[data["mesh_ui_diab"] == "D016640"].sample(n=1000, random_state=0)
diab_nonGestational = data[data["mesh_ui_diab"] != "D016640"].sample(n=1000, random_state=0)
diab_complications = data[data["mesh_ui_diab"] == "D048909"].sample(n=1000, random_state=0)
diab_nonComplications = data[data["mesh_ui_diab"] != "D048909"].sample(n=1000, random_state=0)
data_2000_random = data.sample(n=2000, random_state=0).reset_index(drop=True)
data_5000_random = data.sample(n=5000, random_state=0).reset_index(drop=True)
data_10000_random = data.sample(n=10000, random_state=0).reset_index(drop=True)
data_20000_random = data.sample(n=20000, random_state=0).reset_index(drop=True)
data_30000_random = data.sample(n=30000, random_state=0).reset_index(drop=True)
data_40000_random = data.sample(n=40000, random_state=0).reset_index(drop=True)
data_50000_random = data.sample(n=50000, random_state=0).reset_index(drop=True)

In [None]:
data_2000_random = data.sample(n=2000, random_state=0).reset_index(drop=True)


In [9]:
diab_gestational_nonGestational = diab_gestational.append(diab_nonGestational).sample(frac=1).reset_index(drop=True)
diab_complications_nonComplications = diab_complications.append(diab_nonComplications).sample(frac=1).reset_index(drop=True)


In [27]:
dirpath = "/home/adrian/workspace/ActiveLearning/Active-Learning-for-Neural-Networks/data/diabetes"
#subdir_gest = dirpath+"/gestational_1000pos_and_neg_samples/"
subdir_complications = dirpath+"/complications_100pos_and_neg_samples/"
#(diab_gestational.title + " " + diab_gestational.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_gest+"diab_gestational_1000samples.txt", index=False, header=False)
#(diab_nonGestational.title + " " + diab_nonGestational.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_gest+"diab_Nongestational_1000samples.txt", index=False, header=False)
#(diab_gestational.title ).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/gestational_1000pos_and_neg_samples_onlyTitles/diab_gestational_1000samples_onlyTitles.txt", index=False, header=False)
#(diab_nonGestational.title).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/gestational_1000pos_and_neg_samples_onlyTitles/diab_Nongestational_1000samples_onlyTitles.txt", index=False, header=False)
#(diab_complications.title + " " + diab_complications.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_complications+"/diab_complications_1000samples.txt", index=False, header=False)
#(diab_nonComplications.title + " " + diab_nonComplications.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_complications+"/diab_nonComplications_1000samples.txt", index=False, header=False)
#(diab_complications.title ).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/complications_1000pos_and_neg_samples_onlyTitles/diab_complications_1000samples_onlyTitles.txt", index=False, header=False)
#(diab_nonComplications.title).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/complications_1000pos_and_neg_samples_onlyTitles/diab_nonComplications_1000samples_onlyTitles.txt", index=False, header=False)

#diab_gestational_nonGestational.to_parquet(subdir_gest+"data.parquet")
#diab_complications_nonComplications.to_parquet(subdir_complications+"data.parquet")
#diab_complications_nonComplications.to_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/tmp/complications_1000pos_and_neg_samples.parquet")
data_2000_random.to_parquet(dirpath+"/2000_random/2000_random_abstracts.parquet")
data_5000_random.to_parquet(dirpath+"/5000_random/5000_random_abstracts.parquet")
#dirpath = "/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus"
#data_5000_random.to_parquet(dirpath+"_sample5000.parquet")
#data_10000_random.to_parquet(dirpath+"_sample10000.parquet")
#data_20000_random.to_parquet(dirpath+"_sample20000.parquet")
#data_30000_random.to_parquet(dirpath+"_sample30000.parquet")
#data_40000_random.to_parquet(dirpath+"_sample40000.parquet")
#data_50000_random.to_parquet(dirpath+"_sample50000.parquet")

### Get occurrences of abstracts per class

In [2]:
import pandas as pd
from collections import Counter

test = pd.read_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus.parquet")
test.head(2)
Counter(test["mesh_mh_diab"].values)

Counter({'Diabetic Foot': 4424,
         'Prediabetic State': 1261,
         'Diabetic Neuropathies': 3662,
         'Diabetes, Gestational': 5000,
         'Diabetes Mellitus, Experimental': 5000,
         'Diabetes Complications': 5000,
         'Diabetes Mellitus, Type 2': 5000,
         'Diabetic Retinopathy': 5000,
         'Diabetes Mellitus, Type 1': 5000,
         'Diabetic Angiopathies': 3026,
         'Diabetic Ketoacidosis': 1308,
         'Diabetic Nephropathies': 5000,
         'Hyperglycemic Hyperosmolar Nonketotic Coma': 97,
         'Fetal Macrosomia': 1282,
         'Diabetic Cardiomyopathies': 386,
         'Diabetic Coma': 97,
         'Diabetes Mellitus, Lipoatrophic': 85,
         'Wolfram Syndrome': 228,
         'Donohue Syndrome': 39,
         'Latent Autoimmune Diabetes in Adults': 16})

### Get sample of diabetes tweets

In [9]:
import pandas as pd
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
data = spark.read.parquet("/home/adrian/PhD/Data/Tweets20190708/matching-tweets_diab_noRT-noBots_personal_noJokes_locationUS_geoCityCodeNotNull_emotions_gender-typeDiab_repartition6.parquet")
print(data.count())
print(data.show())
#data = pd.read_parquet("/home/adrian/PhD/Data/Tweets20190708/matching-tweets_diab_noRT-noBots_personal_noJokes_locationUS_geoCityCodeNotNull_emotions_gender-typeDiab_repartition6.parquet")
#print(data.shape)
#data.head(2)

46407
+--------------------+--------------------+----+--------------------+--------------------+----------------+--------------------+------------------+--------------------+--------------------+---------------+-------------------+--------------------------+------------------------------+----------------------------+-----------------------+--------------------------+-------------------------+--------------+------------+--------------------+--------------------+-------+----------------+--------+--------+----------------+-------------+-------------+-------------+-------------+-------------+------------------+----------+--------------------+------------+----------+------+------------+
|                  id|          created_at|lang|                text|           user_name|user_screen_name|user_followers_count|user_friends_count|       user_location|    user_description|place_full_name|retweeted_user_name|retweeted_user_screen_name|retweeted_user_followers_count|retweeted_user_friends_cou

In [12]:
#data.sample(n=1000).head()
sample_data = data.sample(0.021)
print(sample_data.count())
sample_data.write.mode("overwrite").parquet("/home/adrian/PhD/Data/Tweets20190708/matching-tweets_diab_noRT-noBots_personal_noJokes_locationUS_geoCityCodeNotNull_emotions_gender-typeDiab_repartition6_sample1000.parquet")


1002


## add column with child mesh classes to dataset

In [3]:
meshcodes= {'D003920': {'MN': 'C19.246', 'MH': 'Diabetes Mellitus'},
 'D003923': {'MN': 'C19.246.300.500', 'MH': 'Diabetes Mellitus, Lipoatrophic'},
 'D003924': {'MN': 'C19.246.300', 'MH': 'Diabetes Mellitus, Type 2'},
 'D003925': {'MN': 'C19.246.099.500', 'MH': 'Diabetic Angiopathies'},
 'D003926': {'MN': 'C19.246.099.750', 'MH': 'Diabetic Coma'},
 'D003928': {'MN': 'C19.246.099.875', 'MH': 'Diabetic Nephropathies'},
 'D003929': {'MN': 'C19.246.099.937', 'MH': 'Diabetic Neuropathies'},
 'D003930': {'MN': 'C19.246.099.500.382', 'MH': 'Diabetic Retinopathy'},
 'D006944': {'MN': 'C19.246.099.750.490',
  'MH': 'Hyperglycemic Hyperosmolar Nonketotic Coma'},
 'D011236': {'MN': 'C19.246.774', 'MH': 'Prediabetic State'},
 'D016640': {'MN': 'C19.246.200', 'MH': 'Diabetes, Gestational'},
 'D016883': {'MN': 'C19.246.099.812', 'MH': 'Diabetic Ketoacidosis'},
 'D017719': {'MN': 'C19.246.099.500.191', 'MH': 'Diabetic Foot'},
 'D048909': {'MN': 'C19.246.099', 'MH': 'Diabetes Complications'},
 'D056731': {'MN': 'C19.246.537', 'MH': 'Donohue Syndrome'},
 'D058065': {'MN': 'C19.246.099.625', 'MH': 'Diabetic Cardiomyopathies'},
 'D000071698': {'MN': 'C19.246.656',
  'MH': 'Latent Autoimmune Diabetes in Adults'},
 'D014929': {'MN': 'C19.246.267.960', 'MH': 'Wolfram Syndrome'},
 'D003922': {'MN': 'C19.246.267', 'MH': 'Diabetes Mellitus, Type 1'},
 'D003921': {'MN': 'C19.246.240', 'MH': 'Diabetes Mellitus, Experimental'},
 'D005320': {'MN': 'C19.246.099.968', 'MH': 'Fetal Macrosomia'}}

In [4]:
mesh_hierarchy = {
    "D003920" : {"MH" : 'Diabetes Mellitus',"MN" : 'C19.246', "children": [
            {"D048909" : {'MN': 'C19.246.099', 'MH': 'Diabetes Complications', "children": [{
                "D003925" : {'MN': 'C19.246.099.500', 'MH': 'Diabetic Angiopathies', "children" : [{
                    "D017719" : {'MN': 'C19.246.099.500.191', 'MH': 'Diabetic Foot', "children" : []}
                    , "D003930" : {'MN': 'C19.246.099.500.382', 'MH': 'Diabetic Retinopathy', "children" : []}
                }]}
                , "D058065" : {'MN': 'C19.246.099.625', 'MH': 'Diabetic Cardiomyopathies', "children":[]}
                , "D003926" : {'MN': 'C19.246.099.750', 'MH': 'Diabetic Coma', "children":[{
                    "D006944" : {'MN': 'C19.246.099.750.490', 'MH': 'Hyperglycemic Hyperosmolar Nonketotic Coma', "children": []}
                }]}
                , "D016883" : {'MN': 'C19.246.099.812', 'MH': 'Diabetic Ketoacidosis', "children": []}
                , "D003928" : {'MN': 'C19.246.099.875', 'MH': 'Diabetic Nephropathies', "children": []}
                , "D003929" : {'MN': 'C19.246.099.937', 'MH': 'Diabetic Neuropathies', "children": [{
                    "D017719" : {'MN': 'C19.246.099.937.250', 'MH': 'Diabetic Foot', "children" : []}
                }]}
                , "D005320" : {'MN': 'C19.246.099.968', 'MH': 'Fetal Macrosomia', "children":[]}
            }] }
             , "D016640" : {'MN': 'C19.246.200', 'MH': 'Diabetes, Gestational', "children": []}
             , "D003921" : {'MN': 'C19.246.240', 'MH': 'Diabetes Mellitus, Experimental', "children":[]}
             , "D003922" : {'MN': 'C19.246.267', 'MH': 'Diabetes Mellitus, Type 1', "children":[{
                 "D014929" : {'MN': 'C19.246.267.960', 'MH': 'Wolfram Syndrome', "children":[]}
             }]}
             , "D003924" : {'MN': 'C19.246.300', 'MH': 'Diabetes Mellitus, Type 2', "children":[{
                 'D003923': {'MN': 'C19.246.300.500', 'MH': 'Diabetes Mellitus, Lipoatrophic', "children":[]}
             }]}
             , "D056731" : {'MN': 'C19.246.537', 'MH': 'Donohue Syndrome', "children":[]}
             , "D000071698" : {'MN': 'C19.246.656', 'MH': 'Latent Autoimmune Diabetes in Adults', "children": []}
             , "D011236" : {'MN': 'C19.246.774', 'MH': 'Prediabetic State', "children" : []}
             
            }
        ]
    }
}
#mesh_hierarchy

In [200]:

mesh_with_parents = {
 'D003923': ["D003924"],
 'D003924': [],
 'D003925': ["D048909"],
 'D003926': ["D048909"],
 'D003928': ["D048909"],
 'D003929': ["D048909"],
 'D003930': ["D048909","D003925"],
 'D006944': ["D003926", "D048909"],
 'D011236': [],
 'D016640': [],
 'D016883': ["D048909"],
 'D017719': ["D003925", "D003929", "D048909"],
 'D048909': [],
 'D056731': [],
 'D058065': ["D048909"],
 'D000071698': [],
 'D014929': ["D003922"],
 'D003922': [],
 'D003921': [],
 'D005320': ["D048909"]
}
"""
mesh_childs = {
 'D003923': [],
 'D003924': ["D003923"],
 'D003925': ["D017719", "D003930"],
 'D003926': ["D006944"],
 'D003928': [],
 'D003929': ["D017719"],
 'D003930': [],
 'D006944': [],
 'D011236': [],
 'D016640': [],
 'D016883': [],
 'D017719': [],
 'D048909': ["D003925", "D017719", "D003930", "D058065", "D003926", "D006944", "D016883", "D003928", "D003929", "D017719", "D005320" ],
 'D056731': [],
 'D058065': [],
 'D000071698': [],
 'D014929': [],
 'D003922': ["D014929"],
 'D003921': [],
 'D005320': []
}
"""



'\nmesh_childs = {\n \'D003923\': [],\n \'D003924\': ["D003923"],\n \'D003925\': ["D017719", "D003930"],\n \'D003926\': ["D006944"],\n \'D003928\': [],\n \'D003929\': ["D017719"],\n \'D003930\': [],\n \'D006944\': [],\n \'D011236\': [],\n \'D016640\': [],\n \'D016883\': [],\n \'D017719\': [],\n \'D048909\': ["D003925", "D017719", "D003930", "D058065", "D003926", "D006944", "D016883", "D003928", "D003929", "D017719", "D005320" ],\n \'D056731\': [],\n \'D058065\': [],\n \'D000071698\': [],\n \'D014929\': [],\n \'D003922\': ["D014929"],\n \'D003921\': [],\n \'D005320\': []\n}\n'

In [201]:
import pandas as pd
data = pd.read_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus.parquet")
data.head()

Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep,vec,mesh_ui_diab_with_childs
0,28800712,Outcomes Achieved With Use of a Prefabricated ...,BACKGROUND\nThe total contact cast (TCC) is co...,2017-10,"D000328,D000367,D000368,D000369,D002370,D01533...","Adult,Age Factors,Aged,Aged, 80 and over,Casts...",D017719,Diabetic Foot,outcomes achieved with use of a prefabricated ...,"[-0.1456865, 0.10184939, 0.08282469, 0.0834905...","[D003925, D003929, D048909]"
1,6989594,Investigation of insulin sensitivity in early ...,Twenty-three normal weight subjects without an...,1980-01,"D001786,D001835,D005230,D005951,D006801,D00732...","Blood Glucose,Body Weight,Fatty Acids, Noneste...",D011236,Prediabetic State,investigation of insulin sensitivity in early ...,"[-0.13539475, 0.06387955, 0.0704756, 0.1377651...",[]
2,524360,Ultrastructural pathology of peripheral nerves...,Sural nerve lesions in patients with clinicall...,1979-12,"D000328,D001369,D003929,D006801,D008297,D00887...","Adult,Axons,Diabetic Neuropathies,Humans,Male,...",D003929,Diabetic Neuropathies,ultrastructural pathology of peripheral nerves...,"[-0.13219672, 0.107427344, 0.012001918, 0.0821...",[D048909]
3,21199315,Evidence-based interventional pain medicine ac...,"In the industrialized world, polyneuropathy in...",2011,"D000698,D000700,D003929,D019317,D006801","Analgesia,Analgesics,Diabetic Neuropathies,Evi...",D003929,Diabetic Neuropathies,evidence-based interventional pain medicine ac...,"[-0.15465583, 0.120985754, 0.11113819, 0.06654...",[D048909]
4,24607755,Delivery timing and cesarean delivery risk in ...,OBJECTIVE\nThe purpose of this study was to ev...,2014-09,"D000328,D002585,D016640,D005260,D005865,D00680...","Adult,Cesarean Section,Diabetes, Gestational,F...",D016640,"Diabetes, Gestational",delivery timing and cesarean delivery risk in ...,"[-0.080492444, 0.1405143, 0.00491666, 0.153142...",[]


In [10]:
#data.to_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus_backup.parquet")


In [204]:
data["mesh_ui_diab_with_childs"] = data["mesh_ui_diab"].map(lambda mesh: mesh_with_parents[mesh])
print(data.shape)
data.head(5)

(50911, 11)


Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep,vec,mesh_ui_diab_with_childs
0,28800712,Outcomes Achieved With Use of a Prefabricated ...,BACKGROUND\nThe total contact cast (TCC) is co...,2017-10,"D000328,D000367,D000368,D000369,D002370,D01533...","Adult,Age Factors,Aged,Aged, 80 and over,Casts...",D017719,Diabetic Foot,outcomes achieved with use of a prefabricated ...,"[-0.1456865, 0.10184939, 0.08282469, 0.0834905...","[D003925, D003929, D048909]"
1,6989594,Investigation of insulin sensitivity in early ...,Twenty-three normal weight subjects without an...,1980-01,"D001786,D001835,D005230,D005951,D006801,D00732...","Blood Glucose,Body Weight,Fatty Acids, Noneste...",D011236,Prediabetic State,investigation of insulin sensitivity in early ...,"[-0.13539475, 0.06387955, 0.0704756, 0.1377651...",[]
2,524360,Ultrastructural pathology of peripheral nerves...,Sural nerve lesions in patients with clinicall...,1979-12,"D000328,D001369,D003929,D006801,D008297,D00887...","Adult,Axons,Diabetic Neuropathies,Humans,Male,...",D003929,Diabetic Neuropathies,ultrastructural pathology of peripheral nerves...,"[-0.13219672, 0.107427344, 0.012001918, 0.0821...",[D048909]
3,21199315,Evidence-based interventional pain medicine ac...,"In the industrialized world, polyneuropathy in...",2011,"D000698,D000700,D003929,D019317,D006801","Analgesia,Analgesics,Diabetic Neuropathies,Evi...",D003929,Diabetic Neuropathies,evidence-based interventional pain medicine ac...,"[-0.15465583, 0.120985754, 0.11113819, 0.06654...",[D048909]
4,24607755,Delivery timing and cesarean delivery risk in ...,OBJECTIVE\nThe purpose of this study was to ev...,2014-09,"D000328,D002585,D016640,D005260,D005865,D00680...","Adult,Cesarean Section,Diabetes, Gestational,F...",D016640,"Diabetes, Gestational",delivery timing and cesarean delivery risk in ...,"[-0.080492444, 0.1405143, 0.00491666, 0.153142...",[]


In [205]:
data.to_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus.parquet")


## Get subsets of data

In [206]:
import pandas as pd 
data = pd.read_parquet("/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus.parquet")
print(data.shape)
data.head(1)

(50911, 11)


Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep,vec,mesh_ui_diab_with_childs
0,28800712,Outcomes Achieved With Use of a Prefabricated ...,BACKGROUND\nThe total contact cast (TCC) is co...,2017-10,"D000328,D000367,D000368,D000369,D002370,D01533...","Adult,Age Factors,Aged,Aged, 80 and over,Casts...",D017719,Diabetic Foot,outcomes achieved with use of a prefabricated ...,"[-0.1456865, 0.10184939, 0.08282469, 0.0834905...","[D003925, D003929, D048909]"


In [207]:
diab_gestational = data[data["mesh_ui_diab"] == "D016640"].sample(n=1000, random_state=0)
diab_nonGestational = data[data["mesh_ui_diab"] != "D016640"].sample(n=1000, random_state=0)
diab_gestational_nonGestational = diab_gestational.append(diab_nonGestational).sample(frac=1).reset_index(drop=True)

diab_complications = data[data["mesh_ui_diab"] == "D048909"].sample(n=1000, random_state=0)
diab_nonComplications = data[data["mesh_ui_diab"] != "D048909"].sample(n=1000, random_state=0)
diab_complications_nonComplications = diab_complications.append(diab_nonComplications).sample(frac=1).reset_index(drop=True)

data_2000_random = data.sample(n=2000, random_state=0).reset_index(drop=True) # 62
data_5000_random = data.sample(n=5000, random_state=0).reset_index(drop=True)
data_10000_random = data.sample(n=10000, random_state=0).reset_index(drop=True)
data_20000_random = data.sample(n=20000, random_state=0).reset_index(drop=True)
data_30000_random = data.sample(n=30000, random_state=0).reset_index(drop=True)
data_40000_random = data.sample(n=40000, random_state=0).reset_index(drop=True)
data_50000_random = data.sample(n=50000, random_state=0).reset_index(drop=True)

In [212]:
posClass = "D003930"

def mesh_to_boolean(x):
    mesh, meshChilds = x
    if (mesh == posClass or posClass in meshChilds): 
        return True
    else:
        return False

def get_N_posAndNeg(data):
    dataToBool = data[["mesh_ui_diab", "mesh_ui_diab_with_childs"]].apply(mesh_to_boolean , axis=1) 
    return dataToBool.groupby(dataToBool).count()

#print("N pos + N neg abstracts:")
#dataToBool = data_2000_random[["mesh_ui_diab", "mesh_ui_diab_with_childs"]].apply(mesh_to_boolean , axis=1) 
#print(dataToBool.groupby(dataToBool).count())
print(get_N_posAndNeg(data_2000_random))
get_N_posAndNeg(data_2000_random).loc[True]


False    1815
True      185
dtype: int64


185

In [213]:
from sklearn.model_selection import train_test_split
#y_test = test["mesh_ui_diab"].map(lambda mesh: 1 if mesh == "D048909" else 0).values
#X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.75, random_state=0)

train, test = train_test_split(data_2000_random, test_size=0.5, random_state=0) # 17:130
print("train:")
print(get_N_posAndNeg(train))
print("\ntest:")
print(get_N_posAndNeg(test))

train:
False    899
True     101
dtype: int64

test:
False    916
True      84
dtype: int64


In [217]:
#train = X_train
#train["true_label"] = y_train
train.reset_index(drop=True).to_parquet("/home/adrian/workspace/ActiveLearning/Active-Learning-for-Neural-Networks/data/diabetes/2000_random/2000random_1000train.parquet")
#test = X_test
#test["true_label"] = y_test
test.reset_index(drop=True).to_parquet("/home/adrian/workspace/ActiveLearning/Active-Learning-for-Neural-Networks/data/diabetes/2000_random/2000random_1000train.parquet")

In [214]:
dirpath = "/home/adrian/workspace/ActiveLearning/Active-Learning-for-Neural-Networks/data/diabetes"
#subdir_gest = dirpath+"/gestational_1000pos_and_neg_samples/"
#subdir_complications = dirpath+"/complications_100pos_and_neg_samples/"
#(diab_gestational.title + " " + diab_gestational.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_gest+"diab_gestational_1000samples.txt", index=False, header=False)
#(diab_nonGestational.title + " " + diab_nonGestational.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_gest+"diab_Nongestational_1000samples.txt", index=False, header=False)
#(diab_gestational.title ).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/gestational_1000pos_and_neg_samples_onlyTitles/diab_gestational_1000samples_onlyTitles.txt", index=False, header=False)
#(diab_nonGestational.title).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/gestational_1000pos_and_neg_samples_onlyTitles/diab_Nongestational_1000samples_onlyTitles.txt", index=False, header=False)
#(diab_complications.title + " " + diab_complications.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_complications+"/diab_complications_1000samples.txt", index=False, header=False)
#(diab_nonComplications.title + " " + diab_nonComplications.abstract).map(lambda s: s.replace('\n', '')).to_csv(subdir_complications+"/diab_nonComplications_1000samples.txt", index=False, header=False)
#(diab_complications.title ).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/complications_1000pos_and_neg_samples_onlyTitles/diab_complications_1000samples_onlyTitles.txt", index=False, header=False)
#(diab_nonComplications.title).map(lambda s: s.replace('\n', '')).to_csv(dirpath+"/complications_1000pos_and_neg_samples_onlyTitles/diab_nonComplications_1000samples_onlyTitles.txt", index=False, header=False)

#diab_gestational_nonGestational.to_parquet(subdir_gest+"data.parquet")
#diab_complications_nonComplications.to_parquet(subdir_complications+"data.parquet")
#diab_complications_nonComplications.to_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/tmp/complications_1000pos_and_neg_samples.parquet")
data_2000_random.to_parquet(dirpath+"/2000_random/2000_random_abstracts.parquet")
#data_5000_random.to_parquet(dirpath+"/5000_random/5000_random_abstracts.parquet")
#dirpath = "/home/adrian/PhD/Data/Pubmed/baseline_diabetes_unique_maxNperClass5000_withoutRootClassDiabetesMellitus"
#data_5000_random.to_parquet(dirpath+"_sample5000.parquet")
#data_10000_random.to_parquet(dirpath+"_sample10000.parquet")
#data_20000_random.to_parquet(dirpath+"_sample20000.parquet")
#data_30000_random.to_parquet(dirpath+"_sample30000.parquet")
#data_40000_random.to_parquet(dirpath+"_sample40000.parquet")
#data_50000_random.to_parquet(dirpath+"_sample50000.parquet")