In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# gdown is pinned to 4.6.3; NOTE(review): rdflib is not pinned, so the version
# installed can differ from one environment to the next.
%pip install -q rdflib gdown==4.6.3

In [None]:
from pathlib import Path
from rdflib import Graph, URIRef, Literal, Namespace, BNode
from rdflib.collection import Collection
from rdflib.namespace import RDF, RDFS, OWL, XSD
from re import sub
import gdown
import numpy as np
import pandas as pd
import pickle
import string

In [None]:
data_dir = Path('./data/')
data_path = data_dir / Path('trials.pkl')
graph_path = data_dir / Path('graph.ttl')


def download_data() -> None:
  """Fetch the pickled trials file with gdown and store it at ``data_path``.

  The target folder ``data_dir`` is created first when it is missing.
  """
  # parents/exist_ok make the call safe whether or not the folder (or its
  # ancestors) already exists; a bare mkdir() raises FileExistsError otherwise.
  data_dir.mkdir(parents=True, exist_ok=True)

  gdown.download(
    id='1oi3mnz6PQVt-tEMR6IQnqC0ab9IZ1iXx',
    output=str(data_path),
    quiet=True
  )


# Key the download on the file itself: the folder may already exist without the
# pickle in it (e.g. after an interrupted download), in which case open() below
# would fail.
if not data_path.exists():
  download_data()

# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only run this on a file obtained from a source you trust.
extracted_data = []
with open(data_path, 'rb') as file:
  # The file is read as a sequence of pickled objects stored back to back:
  # keep loading until the stream is exhausted.
  while True:
    try:
      document = pickle.load(file)
    except EOFError:
      break
    # Each loaded object is iterated and its items appended as rows.
    extracted_data.extend(document)

df = pd.DataFrame(extracted_data)

In [None]:
# Drop irrelevant columns.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# KeyError on the second execution.
df = df.drop(
  columns=[
    'brief_title',
    'official_title',
    'brief_summary',
    'trials_keyword',
    'condition_browse_mesh_term',
    'detailed_description',
    'study_type',
    'study_design_info_primary_purpose',
    'intervention_intervention_type',
    'intervention_intervention_name',
    'location',
    'intervention_browse_mesh_term',
    'eligibility_criteria'
  ],
  errors='ignore'
)

In [None]:
# Display the first row of df to inspect the columns it currently holds
df.head(1)

Unnamed: 0,nct_id,condition,eligibility_inclusion_criteria,eligibility_exclusion_criteria,eligibility_gender,eligibility_minimum_age,eligibility_maximum_age,eligibility_healthy_volunteers
0,NCT00976963,Urinary Tract Infection,:\r\n\r\n - Non pregnant women in go...,":\r\n\r\n - Pregnant, lactating, or ...",Female,18 Years,45 Years,Accepts Healthy Volunteers


## Analyze data

I want to find which criteria I can retrieve from the relevant columns to insert into the Knowledge Base.

In [None]:
# For every column, report how many cells are the empty string,
# both as a count over the number of rows and as a percentage
total_rows = df.shape[0]
for column, cells in df.items():
  empty_count = (cells == '').sum()
  share = round(empty_count / total_rows * 100, 2)
  print(f"{column}: {empty_count}/{total_rows} | " + "{:.2f}%".format(share))

nct_id: 0/375580 | 0.00%
condition: 860/375580 | 0.23%
eligibility_inclusion_criteria: 1204/375580 | 0.32%
eligibility_exclusion_criteria: 7560/375580 | 2.01%
eligibility_gender: 863/375580 | 0.23%
eligibility_minimum_age: 863/375580 | 0.23%
eligibility_maximum_age: 863/375580 | 0.23%
eligibility_healthy_volunteers: 7899/375580 | 2.10%


In [None]:
# Translation table that deletes ASCII punctuation; built once instead of per row
punctuation_table = str.maketrans('', '', string.punctuation)

# A word must appear at least this many times to be added automatically
MIN_OCCURRENCES = 3


def count_criteria_words(texts, counts):
  """Add the word frequencies of ``texts`` to ``counts`` and return it.

  Each text has its punctuation removed, its whitespace runs collapsed into
  single spaces and is lowercased before being split into words.

  Args:
    texts: iterable of strings to scan.
    counts: dict mapping word -> occurrences; updated in place.

  Returns:
    The same ``counts`` dict, for convenience.
  """
  for text in texts:
    cleaned = sub(r"\s+", " ", text.translate(punctuation_table)).lower()
    # split() without a separator drops the empty tokens that a leading or
    # trailing space would otherwise produce, so '' is never counted as a word.
    for word in cleaned.split():
      counts[word] = counts.get(word, 0) + 1
  return counts


# Count inclusion texts first, then exclusion texts, into one shared dictionary
word_dict = {}
for column in ('eligibility_inclusion_criteria', 'eligibility_exclusion_criteria'):
  count_criteria_words(df[column], word_dict)

# Remove 'therapy' so the bare word does not become a criterion of its own below;
# the default avoids a KeyError when the word never occurs in the data.
word_dict.pop('therapy', None)

# Manually curated criteria, based on the most frequent / most discriminant words in the first 5000 trials
criteria_dict = {
    'pregnant': ['pregnant', 'pregnancy', 'childbearing'],
    'healthy': ['healthy', 'health'],
    'smoker': ['smoker', 'smoking', 'smoke'],
    'vaccinated': ['vaccinated', 'vaccine'],
    'fertile': ['fertile'],
    'sterile': ['sterile'],
    'surgery': ['surgery'],
    'contraception': ['contraception']
}

# Automatically add frequent words ending with 'therapy' or 'scopy',
# each as a criterion whose only trigger word is itself
for word, occurrences in word_dict.items():
  if occurrences >= MIN_OCCURRENCES and word.endswith(('therapy', 'scopy')):
    criteria_dict[word] = [word]

print(criteria_dict)

{'pregnant': ['pregnant', 'pregnancy', 'childbearing'], 'healthy': ['healthy', 'health'], 'smoker': ['smoker', 'smoking', 'smoke'], 'vaccinated': ['vaccinated', 'vaccine'], 'fertile': ['fertile'], 'sterile': ['sterile'], 'surgery': ['surgery'], 'contraception': ['contraception'], 'chemotherapy': ['chemotherapy'], 'immunotherapy': ['immunotherapy'], 'radiotherapy': ['radiotherapy'], 'mediastinoscopy': ['mediastinoscopy'], 'thoracoscopy': ['thoracoscopy'], 'bronchoscopy': ['bronchoscopy'], 'laryngoscopy': ['laryngoscopy'], 'colonoscopy': ['colonoscopy'], 'endoscopy': ['endoscopy'], 'cystoscopy': ['cystoscopy'], 'ophthalmoscopy': ['ophthalmoscopy'], 'brachytherapy': ['brachytherapy'], 'psychotherapy': ['psychotherapy'], 'laparoscopy': ['laparoscopy'], 'phototherapy': ['phototherapy'], 'ileoscopy': ['ileoscopy'], 'monotherapy': ['monotherapy'], 'microscopy': ['microscopy'], 'otoscopy': ['otoscopy'], 'cryotherapy': ['cryotherapy'], 'arthroscopy': ['arthroscopy'], 'chemoradiotherapy': ['chem

In [None]:
# == Graph building ==
graph = Graph()

ct = Namespace('http://example.org/ontologies/clinical_trials/')
graph.bind('ct', ct)

# Translation table that deletes ASCII punctuation; built once instead of per row
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Age unit as spelled in the data -> suffix of the ct:eligibilityMinimum* /
# ct:eligibilityMaximum* property. Units missing from this table add no triple.
AGE_UNIT_SUFFIX = {
    'Year': 'Years', 'Years': 'Years',
    'Month': 'Months', 'Months': 'Months',
    'Week': 'Weeks', 'Weeks': 'Weeks',
    'Day': 'Days', 'Days': 'Days',
    'Hour': 'Hours', 'Hours': 'Hours',
    'Minute': 'Minutes', 'Minutes': 'Minutes',
}

# Healthy-volunteers value -> boolean stored in the graph.
# Any other value (including the empty string) adds no triple.
HEALTHY_VOLUNTEERS = {
    'Accepts Healthy Volunteers': True,
    'No': False,
}

# Inverted index (trigger word -> criteria keys), so each word of a text is
# resolved with one dictionary lookup instead of a scan over every criterion.
criteria_by_word = {}
for key, values in criteria_dict.items():
  for value in values:
    criteria_by_word.setdefault(value, []).append(key)


def _squash(text: str) -> str:
  """Delete punctuation and collapse every whitespace run into one space."""
  return sub(r"\s+", " ", text.translate(PUNCTUATION_TABLE))


def _add_age_limit(trial, bound: str, raw_age: str) -> None:
  """Add the age limit of ``trial`` to the graph.

  Args:
    trial: node of the trial the limit belongs to.
    bound: 'Minimum' or 'Maximum'; selects the property family.
    raw_age: value such as '18 Years'; 'N/A' and '' mean "no limit" and are skipped.

  Raises:
    ValueError: if ``raw_age`` is not exactly '<integer> <unit>'.
  """
  if raw_age in ('N/A', ''):
    return

  number, period = raw_age.split(' ')
  number = int(number)

  suffix = AGE_UNIT_SUFFIX.get(period)
  if suffix is None:
    return

  graph.add((
      trial,
      ct[f'eligibility{bound}{suffix}'],
      Literal(number, datatype=XSD.integer)
  ))


def _add_criteria(trial, raw_text: str, plain_predicate, negated_predicate) -> None:
  """Link ``trial`` to every criterion whose trigger word occurs in ``raw_text``.

  A trigger word directly preceded by 'not' or 'non' is linked with
  ``negated_predicate``; every other occurrence uses ``plain_predicate``.
  Each linked criterion also receives its title-cased key as label.
  """
  previous_not = False
  # split() drops the empty tokens around leading/trailing spaces; they never
  # match a criterion, so the resulting triples are unaffected.
  for word in _squash(raw_text).lower().split():
    if word in ('not', 'non'):
      previous_not = True
      continue

    predicate = negated_predicate if previous_not else plain_predicate
    for key in criteria_by_word.get(word, ()):
      graph.add((
          trial,
          predicate,
          ct[key]
      ))
      graph.add((
          ct[key],
          RDFS.label,
          Literal(key.title(), datatype=XSD.string)
      ))

    # The negation only applies to the word immediately after it
    previous_not = False


# Fill in the dataframe data
for _, row in df.iterrows():
  trial = ct[row['nct_id']]

  # > Condition
  raw_condition = row['condition']
  if raw_condition != '':
    # Node name: punctuation removed, spaces turned into _, lowercased
    condition = ct[_squash(raw_condition).replace(' ', '_').lower()]

    graph.add((
        trial,
        ct.condition,
        condition
    ))
    graph.add((
        condition,
        RDFS.label,
        Literal(raw_condition, datatype=XSD.string)
    ))

  # > Gender ('All' yields both the male and the female triple)
  gender = row['eligibility_gender']
  if gender in ('Male', 'All'):
    graph.add((
        trial,
        ct.eligibilityGender,
        ct['male']
    ))
  if gender in ('Female', 'All'):
    graph.add((
        trial,
        ct.eligibilityGender,
        ct['female']
    ))

  # > Minimum / maximum age
  _add_age_limit(trial, 'Minimum', row['eligibility_minimum_age'])
  _add_age_limit(trial, 'Maximum', row['eligibility_maximum_age'])

  # > Inclusion criteria: a plain match includes, a negated match excludes
  _add_criteria(
      trial,
      row['eligibility_inclusion_criteria'],
      ct.eligibilityInclude,
      ct.eligibilityExclude
  )

  # > Exclusion criteria: a plain match excludes, a negated match includes
  _add_criteria(
      trial,
      row['eligibility_exclusion_criteria'],
      ct.eligibilityExclude,
      ct.eligibilityInclude
  )

  # > Healthy volunteers
  healthy = HEALTHY_VOLUNTEERS.get(row['eligibility_healthy_volunteers'])
  if healthy is not None:
    graph.add((
        trial,
        ct.eligibilityHealthy,
        Literal(healthy, datatype=XSD.boolean)
    ))

In [None]:
# Write the graph as Turtle, matching the .ttl extension of graph_path.
# The format is explicit because the default of serialize() differs across
# rdflib releases (RDF/XML before 6.0, Turtle from 6.0) and rdflib is not pinned.
graph.serialize(destination=str(graph_path), format='turtle')

<Graph identifier=N4ac12c808e7f45bfb9ec4614fe47b52c (<class 'rdflib.graph.Graph'>)>