# BC5CDR Dataset Data Transformations & EDA

## Setup

In [76]:
# Libraries 

import pandas as pd 
import numpy as np

import ast 

from collections import Counter

In [113]:
# Global settings

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Path for datasets

datapath = '../data/'

In [88]:
# Load datasets

df_train = pd.read_csv(f'{datapath}' + 'OfficialTrainingSet1.csv')
df_val = pd.read_csv(f'{datapath}' + 'OfficialValidationSet1.csv')
df_test = pd.read_csv(f'{datapath}' + 'OfficialTestSet1.csv')

print("Shape of train dataset:", df_train.shape)
print("Shape of validation dataset:", df_val.shape)
print("Shape of test dataset:", df_test.shape)

df_train.head(3)

Shape of train dataset: (500, 13)
Shape of validation dataset: (500, 13)
Shape of test dataset: (500, 13)


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","['Naloxone', 'clonidine', 'clonidine', 'nalozo...","['hypertensive', 'hypotensive', 'hypertensive'...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","['D009270', 'D003000', 'D003000', '-1', 'D0087...","['D006973', 'D007022', 'D006973', 'D006973']",['D008750'],['D007022']
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"['Lidocaine', 'lidocaine', 'lidocaine']","['cardiac asystole', 'depression', 'bradyarrhy...","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","['D008012', 'D008012', 'D008012']","['D006323', 'D003866', 'D001919']",['D008012'],['D006323']
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"['Suxamethonium', 'Suxamethonium chloride', 'S...","['fasciculations', 'tetanic', 'Fasciculations'...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","['D013390', 'D013390', 'D013390', 'D013390']","['D005207', 'D013746', 'D005207', 'D005207', '...",['D013390'],['D005207']


## Data Transformations (Pre-EDA)

In [89]:
# Data transformation functions (Pre-EDA)

# Deprecated because we want to keep cols as lists
# def delist(data, start_col=3):
#     """
#     Removes brackets '[' ']' to put contents in a de-list format
#     Items are still separated by commas, be careful when reading comma separated files 
#     """
#     data.iloc[:, start_col:] = data.iloc[:, start_col:].apply(
#         lambda x: x.apply(
#             lambda item: ', '.join([i.strip() for i in item.replace('[', '').replace(']', '').replace("'", "").split(',')])
#             if isinstance(item, str) else item
#         )
#     )
#     return data

def convert_col_to_list(string):
    """
    Converts all string columns that look like lists (col index 3 to end) into actual lists 
    """
    return ast.literal_eval(string)
    

def map_cid_to_chemical_name(row):
    """
    Maps CID of chemical in the CID_chemical column into the actual name of the chemical
    """
    cid_chemicals = row['CID_chemical']
    chemical_ids = row['chemical_ids']
    chemicals = row['chemicals']
    
    chemical_names = []
    
    for cid in cid_chemicals:
        if cid in chemical_ids:
            idx = chemical_ids.index(cid)
            chemical_names.append(chemicals[idx])
        else:
            chemical_names.append('Unknown')
    
    return chemical_names


def map_cid_to_disease_name(row):
    """
    Maps CID of disease in the CID_disease column into the actual name of the disease
    """
    cid_diseases = row['CID_disease']
    disease_ids = row['disease_ids']
    diseases = row['diseases']
    
    disease_names = []
    
    for cid in cid_diseases:
        if cid in disease_ids:
            idx = disease_ids.index(cid) 
            disease_names.append(diseases[idx]) 
        else:
            disease_names.append('Unknown')
    
    return disease_names


In [90]:
# Apply the data transformations functions to all three datasets

list_columns = ['chemicals', 'diseases', 'chemical_ids', 'disease_ids', 'CID_chemical', 'CID_disease']
for col in list_columns:
    df_train[col] = df_train[col].apply(convert_col_to_list) 
    df_val[col] = df_val[col].apply(convert_col_to_list) 
    df_test[col] = df_test[col].apply(convert_col_to_list) 

df_train['CID_chemical_name'] = df_train.apply(map_cid_to_chemical_name, axis=1)
df_train['CID_disease_name'] = df_train.apply(map_cid_to_disease_name, axis=1)
df_val['CID_chemical_name'] = df_val.apply(map_cid_to_chemical_name, axis=1)
df_val['CID_disease_name'] = df_val.apply(map_cid_to_disease_name, axis=1)
df_test['CID_chemical_name'] = df_test.apply(map_cid_to_chemical_name, axis=1)
df_test['CID_disease_name'] = df_test.apply(map_cid_to_disease_name, axis=1)

In [101]:
df_train.head(50)

Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[Naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive]
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"[Lidocaine, lidocaine, lidocaine]","[cardiac asystole, depression, bradyarrhythmias]","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","[D008012, D008012, D008012]","[D006323, D003866, D001919]",[D008012],[D006323],[Lidocaine],[cardiac asystole]
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"[Suxamethonium, Suxamethonium chloride, Sch, Sch]","[fasciculations, tetanic, Fasciculations, fasc...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","[D013390, D013390, D013390, D013390]","[D005207, D013746, D005207, D005207, D005207, ...",[D013390],[D005207],[Suxamethonium],[fasciculations]
3,603022,"Galanthamine hydrobromide, a longer acting ant...","Galanthamine hydrobromide, an anticholinestera...","[Galanthamine hydrobromide, scopolamine, Hyosc...",[overdosage],"['0', '111', '124', '135', '292', '305', '352'...","['25', '122', '132', '160', '303', '313', '365...",['315'],['325'],"[D005702, D012601, D012601, D005702, D012601, ...",[D062787],[D012601],[D062787],[scopolamine],[overdosage]
4,1378968,Effects of uninephrectomy and high protein fee...,Rats with lithium-induced nephropathy were sub...,"[lithium, lithium, lithium, lithium, lithium, ...","[chronic renal failure, nephropathy, renal fai...","['54', '111', '362', '520', '581', '608', '632...","['61', '118', '369', '527', '588', '615', '639...","['70', '127', '309', '975', '1000', '1027', '1...","['91', '138', '322', '986', '1012', '1045', '1...","[D008094, D008094, D008094, D008094, D008094, ...","[D007676, D007674, D051437, D011507, D006973, ...","[D008094, D008094, D008094]","[D006973, D011507, D007676]","[lithium, lithium, lithium]","[hypertension, proteinuria, chronic renal fail..."
5,1420741,Treatment of Crohn's disease with fusidic acid...,Fusidic acid is an antibiotic with T-cell spec...,"[fusidic acid, cyclosporin, cyclosporin, fusid...","[Crohn's disease, Crohn's disease, Crohn's dis...","['34', '107', '217', '391', '507', '743', '120...","['46', '118', '228', '403', '519', '755', '121...","['13', '292', '467', '910', '1263', '1440']","['28', '307', '482', '916', '1278', '1466']","[D005672, D016572, D016572, D005672, D005672, ...","[D003424, D003424, D003424, D009325, D003424, ...",[D005672],[D009325],[fusidic acid],[nausea]
6,1601297,Electrocardiographic evidence of myocardial in...,The electrocardiograms (ECG) of 99 cocaine-abu...,"[cocaine, cocaine, cocaine]","[myocardial injury, schizophrenic, myocardial ...","['83', '135', '232']","['90', '142', '239']","['33', '194', '305', '334', '357', '371']","['50', '207', '322', '355', '365', '390']","[D003042, D003042, D003042]","[D009202, D012559, D009202, D009203, D007511, ...","[D003042, D003042]","[D009203, D002037]","[cocaine, cocaine]","[myocardial infarction, bundle branch block]"
7,1967484,Sulpiride-induced tardive dystonia.,Sulpiride is a selective D2-receptor antagonis...,"[Sulpiride, Sulpiride, antidepressant, sulpiri...","[tardive dystonia, tardive dyskinesia, parkins...","['0', '36', '107', '204', '395', '456']","['9', '45', '121', '213', '404', '465']","['18', '222', '245', '355', '474']","['34', '240', '257', '363', '490']","[D013469, D013469, D000928, D013469, D013469, ...","[D004421, D004409, D010302, D004421, D004421]",[D013469],[D004421],[Sulpiride],[tardive dystonia]
8,2234245,Ocular and auditory toxicity in hemodialyzed p...,During an 18-month period of study 41 hemodial...,"[desferrioxamine, desferrioxamine, Desferrioxa...","[Ocular and auditory toxicity, audiovisual tox...","['64', '151', '766', '1030', '1097', '1234']","['79', '166', '781', '1045', '1106', '1249']","['0', '250', '314', '457', '534', '576', '604'...","['28', '270', '341', '472', '548', '599', '631...","[D003676, D003676, D003676, D003676, -1, D003676]","[D014786|D006311, D014786|D006311, D014786|D00...","[D003676, D003676, D003676]","[D012164, D014786, D006319]","[desferrioxamine, desferrioxamine, desferrioxa...","[pigmentary retinal deposits, Visual toxicity,..."
9,2385256,Myasthenia gravis presenting as weakness after...,We studied a patient with no prior history of ...,"[magnesium, magnesium, magnesium, magnesium, a...","[Myasthenia gravis, neuromuscular disease, qua...","['47', '192', '245', '321', '691', '777', '1024']","['56', '201', '254', '330', '704', '786', '1033']","['0', '119', '162', '221', '525', '761', '844'...","['17', '140', '174', '233', '560', '770', '861...","[D008274, D008274, D008274, D008274, D000109, ...","[D009157, D009468, D011782, D011225, D009468, ...",[D008274],[D009157],[magnesium],[Myasthenia gravis]


## EDA

What are the most commonly found chemicals and diseases that have relationships? 

In [100]:
# Value counts of CID_chemical_name and CID_disease_name

df_train_exploded_chemical = df_train.explode('CID_chemical_name')
df_train_exploded_disease = df_train.explode('CID_disease_name')

print("Chemical name value counts:")
print(df_train_exploded_chemical['CID_chemical_name'].value_counts())
print('\n')

print("Disease name value counts:")
print(df_train_exploded_disease['CID_disease_name'].value_counts())

Chemical name value counts:
cocaine                                                                       18
doxorubicin                                                                   17
pilocarpine                                                                   15
cisplatin                                                                     13
haloperidol                                                                   13
sulphasalazine                                                                11
fentanyl                                                                      11
nicotine                                                                      11
carbamazepine                                                                 10
cyclophosphamide                                                              10
methyldopa                                                                    10
tacrolimus                                                                    10


There are 29 "Unknown" disease relationships

In [107]:
# Unknown diseases

df_train_unknown_disease = df_train[df_train['CID_disease_name'].apply(lambda x: 'Unknown' in x)]
print(len(df_train_unknown_disease), "rows where disease relationship has unknown")
df_train_unknown_disease

29 rows where disease relationship has unknown


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
10,2505783,Chloroacetaldehyde and its contribution to uro...,"Based on clinical data, indicating that chloro...","[Chloroacetaldehyde, cyclophosphamide, ifosfam...","[hemorrhagic cystitis, bladder damage]","['0', '77', '97', '192', '212', '349', '423', ...","['18', '93', '107', '210', '215', '352', '426'...","['375', '476']","['395', '490']","[C004656, D003520, D007069, C004656, C004656, ...","[D006470|D003556, D001745]","[C004656, C004656]","[D003556, D006470]","[Chloroacetaldehyde, Chloroacetaldehyde]","[Unknown, Unknown]"
20,3412544,Does paracetamol cause urothelial cancer or re...,The risk of developing renal papillary necrosi...,"[paracetamol, phenacetin, paracetamol, phenace...","[urothelial cancer, renal papillary necrosis, ...","['5', '205', '219', '399', '580']","['16', '215', '230', '409', '591']","['23', '44', '93', '121', '327', '445', '496',...","['40', '68', '117', '166', '351', '483', '511'...","[D000082, D010615, D000082, D010615, D000082]","[D014523, D007681, D007681, D007680|D014516|D0...","[D010615, D010615, D010615]","[D007681, D001749, D007680]","[phenacetin, phenacetin, phenacetin]","[renal papillary necrosis, Unknown, Unknown]"
25,3800626,Compression neuropathy of the radial nerve due...,"Fibrous myopathy is a common, well-known side ...","[pentazocine, pentazocine, pentazocine, pentaz...","[Compression neuropathy of the radial nerve, f...","['50', '153', '243', '345']","['61', '164', '254', '356']","['0', '70', '88', '185', '263', '365', '620']","['42', '86', '104', '207', '271', '381', '636']","[D010423, D010423, D010423, D010423]","[D009408|D020425, D005355|D009135, D005355|D00...","[D010423, D010423, D010423, D010423]","[D005355, D009135, D020425, D009408]","[pentazocine, pentazocine, pentazocine, pentaz...","[Unknown, myopathy, Unknown, compression neuro..."
27,3997294,Pneumonitis with pleural and pericardial effus...,A patient with sinuatrial disease and implante...,"[amiodarone, amiodarone, amiodarone, prednisol...","[pleural and pericardial effusion, neuropathy,...","['72', '167', '462', '492', '593', '641', '678']","['82', '177', '472', '504', '603', '651', '685']","['17', '54', '107', '262', '310', '323', '378'...","['49', '64', '125', '295', '321', '356', '403'...","[D000638, D000638, D000638, D011239, D000638, ...","[D010996|D010490, D009422, D002318, D013617, D...","[D000638, D000638, D000638, D000638]","[D010490, D009468, D010996, D011014]","[amiodarone, amiodarone, amiodarone, amiodarone]","[Unknown, proximal motor neuropathy, Unknown, ..."
80,19631624,Learning and memory deficits in ecstasy users ...,It has been consistently shown that ecstasy us...,"[ecstasy, ecstasy, ecstasy, ecstasy, cannabis,...","[Learning and memory deficits, impairments in ...","['32', '139', '251', '616', '693', '713', '782...","['39', '146', '258', '623', '701', '720', '790...","['0', '161', '854', '903', '1196', '1861']","['28', '195', '885', '916', '1209', '1871']","[D018817, D018817, D018817, D018817, D002188, ...","[D007859|D008569, D007859|D008569, D007859|D00...","[D018817, D018817]","[D008569, D007859]","[ecstasy, ecstasy]","[Unknown, Unknown]"
81,20003049,Prolonged elevation of plasma argatroban in a ...,BACKGROUND: Direct thrombin inhibitors (DTIs) ...,"[argatroban, heparin, heparin, argatroban, arg...","[thrombocytopenia, thrombosis, thrombocytopeni...","['30', '101', '277', '499', '933', '1054', '11...","['40', '108', '284', '509', '943', '1064', '11...","['117', '139', '293', '311', '319', '328', '34...","['133', '149', '309', '314', '322', '338', '34...","[C031942, D006493, D006493, C031942, C031942, ...","[D013921, D013927, D013921, D013921, D013921, ...","[C031942, C031942]","[D016063, D019106]","[argatroban, argatroban]","[Unknown, Unknown]"
91,1835291,Acute bronchodilating effects of ipratropium b...,The bronchodilator effects of a single dose of...,"[ipratropium bromide, theophylline, ipratropiu...","[chronic obstructive pulmonary disease, chroni...","['33', '57', '159', '220', '643', '690', '781'...","['52', '69', '178', '232', '654', '702', '792'...","['73', '400', '997', '1150']","['110', '437', '1040', '1177']","[D009241, D013806, D009241, D013806, D009241, ...","[D029424, D029424, D002318|D005767, D029424]","[D013806, D013806]","[D002318, D005767]","[theophylline, theophylline]","[Unknown, Unknown]"
94,2054792,Effect of adriamycin combined with whole body ...,Thermal enhancement of Adriamycin-mediated ant...,"[adriamycin, Adriamycin, Adriamycin, Adriamycin]","[hyperthermia, tumor, toxicities, hyperthermia...","['10', '111', '600', '983']","['20', '121', '610', '993']","['46', '62', '168', '193', '283', '329', '347'...","['58', '67', '178', '205', '288', '339', '357'...","[D004317, D004317, D004317, D004317]","[D005334, D009369, D064420, D005334, D009369, ...",[D004317],[D006331],[adriamycin],[Unknown]
110,7269015,Busulfan-induced hemorrhagic cystitis.,A case of a busulfan-induced hemorrhage cystit...,"[Busulfan, busulfan, busulfan, cyclophosphamid...","[hemorrhagic cystitis, hemorrhage cystitis, cy...","['0', '51', '219', '260', '375']","['8', '59', '227', '276', '383']","['17', '68', '228', '285', '414']","['37', '87', '236', '293', '423']","[D002066, D002066, D002066, D003520, D002066]","[D006470|D003556, D006470|D003556, D003556, D0...","[D002066, D002066]","[D003556, D006470]","[Busulfan, Busulfan]","[cystitis, Unknown]"
121,8739323,Effect of some anticancer drugs and combined c...,The nephrotoxic action of anticancer drugs suc...,"[nitrogranulogen, NG, methotrexate, MTX, 5-flu...","[renal toxicity, nephrotoxic, hemorrhagic cyst...","['128', '145', '150', '164', '170', '186', '19...","['143', '147', '162', '167', '184', '190', '21...","['61', '81', '1175', '1506']","['75', '92', '1195', '1520']","[D008466, D008466, D008727, D008727, D005472, ...","[D007674, D007674, D006470|D003556, D007674]","[D003520, D003520]","[D006470, D003556]","[cyclophosphamide, cyclophosphamide]","[Unknown, Unknown]"


*This is due to Dxxxxxx|Dxxxxxxx notation - need to find out what this is*

What are the most commonly found chemical-disease relationships? 

In [108]:
# Value counts of chemical-disease relationship pairs

relationships = []

for chemicals, diseases in zip(df_train['CID_chemical_name'], df_train['CID_disease_name']):
    relationships.extend(list(zip(chemicals, diseases)))

relationship_counts = Counter(relationships)

most_common_relationships = relationship_counts.most_common(30) 

relationship_df = pd.DataFrame(most_common_relationships, columns=['Relationship', 'Count'])
relationship_df[['Chemical', 'Disease']] = pd.DataFrame(relationship_df['Relationship'].tolist(), index=relationship_df.index)
relationship_df.drop('Relationship', axis=1, inplace=True)

print(relationship_df)

    Count                   Chemical                 Disease
0       5                pilocarpine      status epilepticus
1       5           cyclophosphamide                 Unknown
2       5              isoproterenol   myocardial infarction
3       5                haloperidol               catalepsy
4       4  puromycin aminonucleoside             proteinuria
5       3              nitroglycerin             hypotension
6       3                   fentanyl             hypotension
7       3                scopolamine                 amnesia
8       3                 vincristin                 Unknown
9       3              dexamethasone            hypertension
10      3                    cocaine                seizures
11      3                amphetamine           hyperactivity
12      3                  capsaicin            hyperalgesia
13      2         Chloroacetaldehyde                 Unknown
14      2              nitroprusside             hypotension
15      2               

How many are affected by an unknonw concept identifier '-1'? 

In [110]:
# Rows where chemical_ids has -1

df_train_minus_one_chemical = df_train[df_train['chemical_ids'].apply(lambda x: '-1' in x)]
print(len(df_train_minus_one_chemical), "rows where chemical_ids has -1")
df_train_minus_one_chemical

12 rows where chemical_ids has -1


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[Naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive]
8,2234245,Ocular and auditory toxicity in hemodialyzed p...,During an 18-month period of study 41 hemodial...,"[desferrioxamine, desferrioxamine, Desferrioxa...","[Ocular and auditory toxicity, audiovisual tox...","['64', '151', '766', '1030', '1097', '1234']","['79', '166', '781', '1045', '1106', '1249']","['0', '250', '314', '457', '534', '576', '604'...","['28', '270', '341', '472', '548', '599', '631...","[D003676, D003676, D003676, D003676, -1, D003676]","[D014786|D006311, D014786|D006311, D014786|D00...","[D003676, D003676, D003676]","[D012164, D014786, D006319]","[desferrioxamine, desferrioxamine, desferrioxa...","[pigmentary retinal deposits, Visual toxicity,..."
84,20722491,Safety of capecitabine: a review.,"IMPORTANCE OF THE FIELD: Fluoropyrimidines, in...","[capecitabine, Fluoropyrimidines, 5-fluorourac...","[tumors, colorectal, breast and head and neck ...","['10', '59', '92', '108', '333', '467', '615',...","['22', '76', '106', '112', '345', '479', '627'...","['169', '187', '539', '913', '1061', '1071', '...","['175', '231', '563', '998', '1069', '1077', '...","[C110904, -1, D005472, D005472, C110904, C1109...","[D009369, D015179|D001943|D006258, D007674, D0...","[C110904, C110904, C110904, C110904, C110904]","[D060831, D013280, D014839, D003967, D009325]","[capecitabine, capecitabine, capecitabine, cap...","[renal cell and head and neck cancers, gastric..."
117,8092427,Immediate allergic reactions to amoxicillin.,A large group of patients with suspected aller...,"[amoxicillin, beta-lactam, beta-lactam, amoxic...","[allergic reactions, allergic reactions, aller...","['32', '108', '318', '388', '401', '439', '494...","['43', '119', '329', '399', '403', '449', '525...","['10', '86', '306', '366', '845', '931', '966'...","['28', '104', '314', '374', '853', '938', '977...","[D000658, D047090, D047090, D000658, D000658, ...","[D004342, D004342, D004342, D004342, D004342, ...","[D000658, D000658, D000658]","[D000707, D004342, D000799]","[amoxicillin, amoxicillin, amoxicillin]","[Anaphylaxis, allergic reactions, angioedema]"
118,8638206,Persistent paralysis after prolonged use of at...,Neuromuscular blocking agents (NMBAs) are ofte...,"[atracurium, vecuronium bromide, Atracurium be...","[paralysis, paralysis, paralysis, paralysis]","['44', '330', '408', '444', '646']","['54', '348', '427', '464', '656']","['11', '222', '574', '665']","['20', '231', '583', '674']","[D001279, D014673, D001279, -1, D001279]","[D010243, D010243, D010243, D010243]","[D001279, D014673]","[D010243, D010243]","[atracurium, vecuronium bromide]","[paralysis, paralysis]"
136,11426838,Conformationally restricted analogs of BD1008 ...,Cocaine's ability to interact with sigma recep...,"[BD1008, oligodeoxynucleotide, cocaine, Cocain...","[convulsions, toxicity, convulsive]","['39', '63', '124', '149', '378', '386', '453'...","['45', '83', '131', '156', '384', '450', '459'...","['981', '1400', '1822']","['992', '1408', '1832']","[C085527, D009838, D003042, D003042, -1, -1, C...","[D012640, D064420, D012640]",[D003042],[D012640],[cocaine],[convulsions]
153,16005948,Evaluation of the anticocaine monoclonal antib...,The illicit use of cocaine continues in epidem...,"[GNC92H2, cocaine, GNC92H2, cocaine, GNC92H2, ...","[cocaine overdose, cocaine overdose, cocaine o...","['50', '119', '418', '603', '615', '663', '729...","['57', '126', '425', '610', '622', '670', '736...","['82', '179', '456', '780', '897', '920', '965...","['98', '195', '472', '788', '905', '925', '970...","[-1, D003042, -1, D003042, -1, -1, D003042, D0...","[D062787, D062787, D062787, D064420, D012640, ...",[D003042],[D012640],[cocaine],[seizures]
172,19759529,The glycine transporter-1 inhibitor SSR103800 ...,Schizophrenia has been initially associated wi...,"[glycine, SSR103800, dopamine, glutamate, N-me...","[Schizophrenia, schizophrenic, hyperactivity, ...","['4', '36', '202', '279', '289', '311', '448',...","['11', '45', '210', '288', '309', '315', '452'...","['138', '334', '1020', '1209', '1322', '1402',...","['151', '347', '1033', '1222', '1335', '1415',...","[D005998, -1, D004298, D018698, D016202, D0162...","[D012559, D012559, D006948, D006948, D006948, ...","[C094645, D006220, C076029, D016291, D000661, ...","[D002375, D002375, D002375, D006948, D006948, ...","[aripiprazole, haloperidol, olanzapine, MK-801...","[catalepsy, catalepsy, catalepsy, hyperactivit..."
179,1833784,Evidence for an involvement of D1 and D2 dopam...,Previous studies have suggested that repeated ...,"[dopamine, nicotine, nicotine, nicotine, dopam...","[hyperactivity, increase in locomotor activity...","['41', '73', '247', '391', '447', '479', '583'...","['49', '81', '255', '399', '455', '487', '591'...","['90', '775', '1217', '1427', '1562']","['103', '805', '1230', '1440', '1575']","[D004298, D009538, D009538, D009538, D004298, ...","[D006948, D006948, D006948, D006948, D006948]",[D009538],[D006948],[nicotine],[hyperactivity]
214,9284778,Epidemic of liver disease caused by hydrochlor...,BACKGROUND: Hydrochlorofluorocarbons (HCFCs) a...,"[hydrochlorofluorocarbons, ozone, chlorofluoro...","[liver disease, hepatotoxicity, liver disease,...","['36', '69', '98', '131', '157', '217', '233',...","['60', '74', '117', '155', '162', '222', '252'...","['12', '307', '381', '825', '919', '1335', '14...","['25', '321', '394', '839', '932', '1343', '14...","[-1, D010126, D017402, -1, -1, D010126, D01740...","[D008107, D056486, D008107, D056486, D008107, ...","[C072959, C067411]","[D008107, D008107]","[1-chloro-1,2,2,2-tetrafluoroethane, 1,1-dichl...","[liver disease, liver disease]"


In [114]:
# Extract names of chemicals whose ids are -1

chemicals_minus_one = df_train.apply(lambda row: [row['chemicals'][i] for i, cid in enumerate(row['chemical_ids']) if cid == '-1'], axis=1)

chemicals_minus_one = chemicals_minus_one[chemicals_minus_one.apply(len) > 0]
print(chemicals_minus_one)

0        [nalozone, 3H-naloxone, 3H-dihydroergocryptine]
8                                            [aluminium]
84                                   [Fluoropyrimidines]
117    [benzylpenicilloyl-poly-L-lysine, BPO-PLL, ben...
118                               [benzylisoquinolinium]
136    [BD1018, 3S-1-2-(3,4-dichlorophenyl)ethyl-1,4-...
153    [GNC92H2, GNC92H2, GNC92H2, GNC92H2, GNC92H2, ...
172    [SSR103800, SSR103800, SSR103800, SSR103800, S...
179                                               [PHNO]
214    [hydrochlorofluorocarbons, Hydrochlorofluoroca...
379                                                [MMF]
480                            [sodium acetylsalicylate]
dtype: object


A lot of these seem to be different ways of writing the same chemical (aluminium) or some code (BD1018) or an abbreviation

In [111]:
df_train_minus_one_disease = df_train[df_train['disease_ids'].apply(lambda x: '-1' in x)]
print(len(df_train_minus_one_disease), "rows where disease_ids has -1")
df_train_minus_one_disease

15 rows where disease_ids has -1


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
8,2234245,Ocular and auditory toxicity in hemodialyzed p...,During an 18-month period of study 41 hemodial...,"[desferrioxamine, desferrioxamine, Desferrioxa...","[Ocular and auditory toxicity, audiovisual tox...","['64', '151', '766', '1030', '1097', '1234']","['79', '166', '781', '1045', '1106', '1249']","['0', '250', '314', '457', '534', '576', '604'...","['28', '270', '341', '472', '548', '599', '631...","[D003676, D003676, D003676, D003676, -1, D003676]","[D014786|D006311, D014786|D006311, D014786|D00...","[D003676, D003676, D003676]","[D012164, D014786, D006319]","[desferrioxamine, desferrioxamine, desferrioxa...","[pigmentary retinal deposits, Visual toxicity,..."
50,9625142,"Acute hepatitis, autoimmune hemolytic anemia, ...",An 80-yr-old man developed acute hepatitis sho...,"[ceftriaxone, ceftriaxone, beta lactam, biliru...","[hepatitis, autoimmune hemolytic anemia, eryth...","['84', '169', '262', '324', '474', '626']","['95', '180', '273', '333', '482', '637']","['6', '17', '50', '130', '390', '422']","['15', '44', '72', '139', '417', '444']","[D002443, D002443, D047090, D001663, D013256, ...","[D056486, D000744, -1, D056486, D000744, -1]","[D002443, D002443]","[D000744, D056486]","[ceftriaxone, ceftriaxone]","[autoimmune hemolytic anemia, hepatitis]"
64,15804801,Coronary aneurysm after implantation of a pacl...,Formation of coronary aneurysm is a rare compl...,"[paclitaxel, paclitaxel]","[Coronary aneurysm, coronary aneurysm, aneurys...","['42', '490']","['52', '500']","['0', '81', '290', '341', '355', '416', '553',...","['17', '98', '298', '351', '369', '433', '561'...","[D017239, D017239]","[D003323, D003323, D000783, D013927, -1, D0033...",[D017239],[D003323],[paclitaxel],[Coronary aneurysm]
106,6133211,Possible teratogenicity of sulphasalazine.,"Three infants, born of two mothers with inflam...","[sulphasalazine, sulphasalazine, sulphasalazine]","[inflammatory bowel disease, congenital anomal...","['27', '138', '724']","['41', '152', '738']","['83', '200', '265', '313', '344', '409', '463...","['109', '220', '283', '337', '369', '424', '49...","[D012460, D012460, D012460]","[D015212, D000013, D003093, D001017, D006345, ...","[D012460, D012460, D012460, D012460]","[D003025, D001017, D006345, D007690]","[sulphasalazine, sulphasalazine, sulphasalazin...","[talipes equinovarus, coarctation of the aorta..."
138,11587867,Fatal myeloencephalopathy due to accidental in...,We report on two fatal cases of accidental int...,"[vincristin, vincristine, vincristine, vincris...","[myeloencephalopathy, acute lymphoblastic leuc...","['56', '161', '378', '700', '970']","['66', '172', '389', '711', '981']","['6', '222', '278', '447', '505', '608', '652']","['25', '250', '300', '490', '514', '640', '679']","[D014750, D014750, D014750, D014750, D014750]","[D001927, D054198, D054198, D020258|D020258|D0...","[D014750, D014750, D014750]","[D020258, D009410, D003711]","[vincristin, vincristin, vincristin]","[Unknown, Unknown, Unknown]"
160,17241784,Progressive myopathy with up-regulation of MHC...,Statins can cause a necrotizing myopathy and h...,"[statin, Statins, statins, prednisolone, metho...","[myopathy, myopathy, hyperCKaemia, myopathy, n...","['65', '81', '233', '617', '634', '711', '923']","['71', '88', '240', '629', '646', '718', '930']","['12', '113', '126', '254', '391', '534', '751...","['20', '121', '138', '262', '399', '542', '759...","[D019821, D019821, D019821, D011239, D008727, ...","[D009135, D009135, -1, D009135, D009336, D0093...","[D019821, D019821]","[D009336, D009135]","[statin, statin]","[necrosis, myopathy]"
162,17343925,Influence of smoking on developing cochlea. Do...,OBJECTIVE: Maternal tobacco smoking has negati...,"[smoking, smoking, smoking, smoking, smoking, ...","[hearing loss, hearing impairment, decreases o...","['13', '49', '179', '242', '326', '463', '703'...","['20', '56', '186', '249', '333', '470', '710'...","['370', '612', '1331']","['382', '630', '1361']","[D012906, D012906, D012906, D012906, D012906, ...","[D034381, D034381, -1]",[D012906],[D034381],[smoking],[hearing loss]
174,20619828,"A novel, multiple symptom model of obsessive-c...",BACKGROUND: Current animal models of obsessive...,"[antidepressant, clomipramine, Clomipramine, D...","[obsessive-compulsive-like behaviors, obsessiv...","['394', '409', '514', '886', '947', '969']","['408', '421', '526', '894', '956', '978']","['35', '120', '151', '260', '319', '631', '680...","['70', '149', '154', '267', '322', '638', '704...","[D000928, D002997, D002997, D004298, D012701, ...","[D009771, D009771, D009771, D001008, D009771, ...","[D002997, D002997, D002997]","[D008569, D001008, D060845]","[clomipramine, clomipramine, clomipramine]","[memory impairment, anxiety, hoarding]"
193,3560095,Flurbiprofen in the treatment of juvenile rheu...,Thirty-four patients with juvenile rheumatoid ...,"[Flurbiprofen, flurbiprofen]","[juvenile rheumatoid arthritis, juvenile rheum...","['0', '143']","['12', '155']","['33', '90', '249', '336', '367', '380', '434'...","['62', '119', '258', '349', '375', '390', '451...","[D005480, D005480]","[D001171, D001171, D001168, -1, D004487, -1, -...","[D005480, D005480, D005480]","[D006261, D006471, D015746]","[Flurbiprofen, Flurbiprofen, Flurbiprofen]","[headache, gastrointestinal (GI) bleeding, abd..."
272,20098969,"Oral manifestations of ""meth mouth"": a case re...",AIM: The aim of the documentation of this clin...,"[Methamphetamine, methamphetamine, methampheta...","[meth mouth, meth mouth, cardiac dysrhythmias,...","['226', '467', '626', '990', '1613']","['241', '482', '641', '1005', '1629']","['24', '140', '369', '391', '405', '425', '519...","['34', '150', '389', '403', '419', '441', '529...","[D008694, D008694, D008694, D008694, D008694]","[-1, -1, D001145, D006973, D006212, D001523, D...",[D008694],[D003731],[Methamphetamine],[caries]


In [115]:
# Extract names of diseases whose ids are -1

diseases_minus_one = df_train.apply(lambda row: [row['diseases'][i] for i, cid in enumerate(row['disease_ids']) if cid == '-1'], axis=1)

diseases_minus_one = diseases_minus_one[diseases_minus_one.apply(len) > 0]
print(diseases_minus_one)

8                                       [dyschromatopsy]
50      [erythroblastocytopenia, erythroblastocytopenia]
64                                      [vessel rupture]
106    [rudimentary left uterine cornu, Potter's faci...
138                   [degeneration of myelin and axons]
160                                       [hyperCKaemia]
162                     [decreases of TEOAEs amplitudes]
174    [behavioral inflexibility, corticostriatal dys...
193    [tender joints, tenderness, morning stiffness,...
272    [meth mouth, meth mouth, meth mouth, meth mout...
277                                 [dysphoric reaction]
352                                   [CIPS, CIPS, CIPS]
381    [platypnea-orthodeoxia-like syndrome, platypne...
397           [Spontaneous recurrent seizures, SRS, SRS]
452                                      [bronchorrhoea]
dtype: object
