# Integrate existing data sets on transmission experiments

In [1]:
from pathlib import Path
import pandas as pd
from mappings import Mappings
from ontology import Ontology
from dataset import Dataset

In [2]:
import pyvis.network

## Provide files ##

**Dataset**

In [3]:
data_dir = Path('../../../data/examples')

In [4]:
#trans_data = Path.cwd()/'data_maldi_UU_2_updated.xlsx'
trans_data_original = data_dir/'sample1_original.csv'
trans_data = data_dir/'sample1_modified.csv'
trans_data2 = data_dir/ 'sample2.csv'

**Mapping file**

In [5]:
config1 = data_dir/'sample1_mapping.yml'
config2 = data_dir/'sample2_mapping.yml'


**Ontology schema**

In [6]:
ont_file = Path.cwd()/'infection_trans.owl'

# Example 1


### Original dataset

In [7]:
dataset_original = pd.read_csv(trans_data_original)

dataset_original

Unnamed: 0,chicken_id,type,sex,pen,house,D3,D4 12:00,D4 16:00
0,12345,I,F,10,1,-,-,+


* The samples are taken on day 3 (no hour is specified), and 2 samples at day 4 (12:00 and 16:00). 

* We need to update the data in order to be able to create triples:

    1. The day of the sample in separate column -> create new column for each sample with day value

    2. If one of the samemples has specified an hour, others should specify as well -> we need to specify hour for sample taken on day3
    3. We should understand that sample result of one sample is related to the sample day and hour based on column names -> use sample's ordered number for column names. For example, sample1_date, sample2_date ...

## Corresponding preprocessed dataset

#### the step of renaming columns  and creating new ones should be manully preformed before creating the linked triples

In [8]:
dataset = pd.read_csv(trans_data)

dataset

Unnamed: 0,chicken_id,type,sex,pen,house,sample1_day,sample1_hour,sample1_result,sample2_day,sample2_hour,sample2_result,sample3_day,sample3_hour,sample3_result
0,12345,I,F,10,1,3,12:00,-,4,12:00,-,4,16:00,+


## Tidy dataset

### The script takes preprocessed data and mapping file to create tidy version (one date per row)

### Config example

In [9]:
mappings = Mappings(config1)

In [10]:
#dataset = pd.read_excel(trans_dta).fillna(method='ffill',axis=0).fillna('')
dataset = Dataset(str(trans_data), mappings)

In [11]:
dataset.tidy_dataset

Unnamed: 0,house,pen,sex,type,﻿chicken_id,experimentDay,experimentHour,sample.*_result
0,1,10,F,I,12345,3,12:00,-
1,1,10,F,I,12345,4,12:00,-
2,1,10,F,I,12345,4,16:00,+


## Ontology population

In [12]:
ontology = Ontology(ont_file)

**Create instances and populate ontology**
for each row of a dataset we create triples based on the mapping file

In [13]:
for _, row in dataset.tidy_dataset.iterrows():
        ontology.populate_ontology(mappings, row)

In [14]:
row

house                  1
pen                   10
sex                    F
type                   I
﻿chicken_id        12345
experimentDay          4
experimentHour     16:00
sample.*_result        +
Name: 2, dtype: object

In [15]:
mappings.ont_mappings['Experiment']

{'experimentID': 'ChickenData', 'experimentDay': 'experimentDay'}

In [16]:
pyvis_graph = pyvis.network.Network(notebook=True)

In [17]:
pyvis_graph.add_node(0, label="Experiment1", title="This is FOO", color="orange", size=15, mass=7)
pyvis_graph.add_node(1, label="Experiment", title="That is BAR", color="blue", size=10)
pyvis_graph.add_node(2, label="ChickenData", title="Here is BAZ", color="green", size=10)
pyvis_graph.add_node(3, label="4", title="Here is BAZ", color="green", size=10)



pyvis_graph.add_edge(0, 1, label="isA", color="red")
pyvis_graph.add_edge(0, 2, label="experimentID", color="red")
pyvis_graph.add_edge(0, 3, label="experimentDay", color="red")

pyvis_graph.show('example.html')

In [18]:
mappings.ont_mappings['Host']

{'id': 'chicken_id',
 'inoculationStatus': 'type',
 'sex': 'sex',
 'locatedIn': 'Environment'}

In [19]:
row

house                  1
pen                   10
sex                    F
type                   I
﻿chicken_id        12345
experimentDay          4
experimentHour     16:00
sample.*_result        +
Name: 2, dtype: object

In [20]:
pyvis_graph_host = pyvis.network.Network(notebook=True)

In [21]:
pyvis_graph_host.add_node(0, label="Host1234", title="This is FOO", color="orange", size=15, mass=7)
pyvis_graph_host.add_node(1, label="Host", title="That is BAR", color="blue", size=10)
pyvis_graph_host.add_node(2, label="12345", title="Here is BAZ", color="green", size=10)
pyvis_graph_host.add_node(3, label="F", title="Here is BAZ", color="green", size=10)
pyvis_graph_host.add_node(4, label="I", title="Here is BAZ", color="green", size=10)
pyvis_graph_host.add_node(5, label="Environment1", title="Here is BAZ", color="blue", size=10)



pyvis_graph_host.add_edge(0, 1, label="isA", color="red")
pyvis_graph_host.add_edge(0, 2, label="id", color="red")
pyvis_graph_host.add_edge(0, 3, label="sex", color="red")
pyvis_graph_host.add_edge(0, 4, label="type", color="red")
pyvis_graph_host.add_edge(0, 5, label="locatedIn", color="blue")

pyvis_graph_host.show('example.html')

### Quering the populated ontology

In [22]:
query1 = """
Prefix : <http://www.purl.org/infection_trans#>
SELECT ?ex_day ?ex_hour ?host_id ?inoculationStatus ?sample_type  ?sample_result
Where{
?experiment a :Experiment;
                          :experimentDay ?ex_day;
                          :hasMeasurement ?measurement.
              ?measurement a :Measurement;
                           :hasHost ?host.
                 ?measurement  :hasSample ?sample.
                optional {?measurement :experimentHour ?ex_hour.}
  
              ?host :id ?host_id;
                    :locatedIn ?env.
      optional{?host :treatment ?treatment.}
      optional{?host   :inoculationStatus ?inoculationStatus.}
      optional{?host   :type ?type.}
			optional{   ?sample :hasType ?sample_type.}
		  optional {?sample :result ?sample_result.}
      

}"""

qres1 = ontology.graph.query(query1)

## Query result

In [23]:


result = pd.DataFrame(
        data=([None if x is None else x.toPython() for x in row] for row in qres1),
        columns=[str(x) for x in qres1.vars],
    )

result

Unnamed: 0,ex_day,ex_hour,host_id,inoculationStatus,sample_type,sample_result
0,3,12:00,chicken_id,I,http://www.purl.org/infection_trans#Swab,-
1,4,12:00,chicken_id,I,http://www.purl.org/infection_trans#Swab,-
2,4,16:00,chicken_id,I,http://www.purl.org/infection_trans#Swab,+


# Example 2

### In this example we show already preprocessed file with multiple measurements per row

In [24]:
mappings2 = Mappings(config2)

In [25]:
dataset2 = Dataset(str(trans_data2),mappings2)
dataset2.dataset

Unnamed: 0,﻿house,pen,house_pen,treatment,animalnr_col,"I,S1, S2",weight_d0,weight_d21,BS0,BS0_date,...,swab1,swab1_date,swab2,swab2_date,swab2_value,swab3,swab3_date,swab3_value,value_weight_d0,value_weight_d21
0,1,S1,H1_S1,control,3_Ge,S2,3572,732,-*,-1,...,-,5,-*,6,,-*,7,,0,21
1,1,S1,H1_S1,control,10_Ge,S2,4351,934,-*,-1,...,-,5,-,6,,-,7,,0,21


### To capture mote than one measurement in the mapping file the class Measuremnet represents as list of values

In [26]:
measurements = mappings2.ont_mappings['Measurement']

measurements

[{'experimentDay': 'weight_d(.*)',
  'hasQuantity': 'BodyMass',
  'hasHost': 'Host',
  'BodyMass': {'hasPhenomenon': 'Host'},
  'Measure': {'hasNumericalValue': 'weight_d.*'}},
 {'experimentDay': 'BS.*_date',
  'hasHost': 'Environment',
  'Sample': {'result': 'BS.*', 'hasType': 'EnvironmentalSample'},
  'Pathogen': {'name': 'BS.*_value'}},
 {'experimentDay': 'swab.*_date',
  'hasHost': 'Host',
  'Sample': {'hasType': 'SwabSample', 'result': 'swab.*'},
  'Pathogen': {'name': 'swab.*_value'}}]

In [27]:
mappings2.ont_mappings


{'Experiment': {'experimentID': 'A', 'experimentDay': 'experimentDay'},
 'Environment': {'level2': 'house', 'level1': 'pen'},
 'Host': {'id': 'animalnr_col',
  'treatment': 'treatment',
  'inoculationStatus': 'I, S1, S2',
  'type': 'broiler',
  'locatedIn': 'Environment'},
 'Measurement': [{'experimentDay': 'weight_d(.*)',
   'hasQuantity': 'BodyMass',
   'hasHost': 'Host',
   'BodyMass': {'hasPhenomenon': 'Host'},
   'Measure': {'hasNumericalValue': 'weight_d.*'}},
  {'experimentDay': 'BS.*_date',
   'hasHost': 'Environment',
   'Sample': {'result': 'BS.*', 'hasType': 'EnvironmentalSample'},
   'Pathogen': {'name': 'BS.*_value'}},
  {'experimentDay': 'swab.*_date',
   'hasHost': 'Host',
   'Sample': {'hasType': 'SwabSample', 'result': 'swab.*'},
   'Pathogen': {'name': 'swab.*_value'}}]}

### Measurement of Weight of chicken is first element in the list

In [28]:
measurements[0]

{'experimentDay': 'weight_d(.*)',
 'hasQuantity': 'BodyMass',
 'hasHost': 'Host',
 'BodyMass': {'hasPhenomenon': 'Host'},
 'Measure': {'hasNumericalValue': 'weight_d.*'}}

### Measurement of Bootsock sample is second element in the list

In [29]:
measurements[1]

{'experimentDay': 'BS.*_date',
 'hasHost': 'Environment',
 'Sample': {'result': 'BS.*', 'hasType': 'EnvironmentalSample'},
 'Pathogen': {'name': 'BS.*_value'}}

### Measurement of Swab sample is third element in the list

In [30]:
measurements[2]

{'experimentDay': 'swab.*_date',
 'hasHost': 'Host',
 'Sample': {'hasType': 'SwabSample', 'result': 'swab.*'},
 'Pathogen': {'name': 'swab.*_value'}}

In [31]:
dataset2.tidy_dataset

Unnamed: 0,"I,S1, S2",animalnr_col,house_pen,pen,treatment,﻿house,experimentDay,weight_d.*,BS.*,BS.*_value,swab.*,swab.*_value
0,S2,3_Ge,H1_S1,S1,control,1,0,3572.0,,,,
1,S2,10_Ge,H1_S1,S1,control,1,0,4351.0,,,,
2,S2,3_Ge,H1_S1,S1,control,1,21,732.0,,,,
3,S2,10_Ge,H1_S1,S1,control,1,21,934.0,,,,
4,S2,3_Ge,H1_S1,S1,control,1,-1,,-*,,,
5,S2,10_Ge,H1_S1,S1,control,1,-1,,-*,,,
6,S2,3_Ge,H1_S1,S1,control,1,2,,-*,,,
7,S2,10_Ge,H1_S1,S1,control,1,2,,-*,,,
8,S2,3_Ge,H1_S1,S1,control,1,5,,,,-,
9,S2,10_Ge,H1_S1,S1,control,1,5,,,,-,


In [32]:
ontology2 = Ontology(ont_file)

In [24]:
mappings2.mappings


{'required': 'animalnr_col',
 'ontology_schema': {'Experiment': {'experimentID': 'A',
   'experimentDay': 'experimentDay'},
  'Environment': {'level2': 'house', 'level1': 'pen'},
  'Host': {'id': 'animalnr_col',
   'treatment': 'treatment',
   'inoculationStatus': 'I, S1, S2',
   'type': 'broiler',
   'locatedIn': 'Environment'},
  'Measurement': [{'experimentDay': 'weight_d(.*)',
    'hasQuantity': 'BodyMass',
    'hasHost': 'Host',
    'BodyMass': {'hasPhenomenon': 'Host'},
    'Measure': {'hasNumericalValue': 'weight_d.*'}},
   {'experimentDay': 'BS.*_date',
    'hasHost': 'Environment',
    'Sample': {'result': 'BS.*', 'hasType': 'EnvironmentalSample'},
    'Pathogen': {'name': 'BS.*_value'}},
   {'experimentDay': 'swab.*_date',
    'hasHost': 'Host',
    'Sample': {'hasType': 'SwabSample', 'result': 'swab.*'},
    'Pathogen': {'name': 'swab.*_value'}}]}}

In [25]:
for _, row in dataset2.tidy_dataset.iterrows():
    # check for the required field
    # if it doesn't exit then run it for each row
    if mappings2.required_field is None or row[mappings2.required_field]:
        ontology2.populate_ontology(mappings2, row)

<ontology.Individual object at 0x1200b3b20> <ontology.Individual object at 0x1200b3490>
<ontology.Individual object at 0x1200b37c0> <ontology.Individual object at 0x1200b3a30>
<ontology.Individual object at 0x11ff6ee20> <ontology.Individual object at 0x1200b3b20>
<ontology.Individual object at 0x1200b37c0> <ontology.Individual object at 0x11ff6ee20>
<ontology.Individual object at 0x1200b37c0> <ontology.Individual object at 0x1200c7370>
<ontology.Individual object at 0x11ff6ee20> <ontology.Individual object at 0x1200c7370>
<ontology.Individual object at 0x1200b37c0> BodyMass
<ontology.Individual object at 0x1200b37c0> <ontology.Individual object at 0x1200b3b20>
<ontology.Individual object at 0x1200b37c0> <ontology.Individual object at 0x11ff92d90>
<ontology.Individual object at 0x11ff4b070> <ontology.Individual object at 0x11ff92640>
<ontology.Individual object at 0x1200c7a90> <ontology.Individual object at 0x1200b37c0>
<ontology.Individual object at 0x11ff4b070> <ontology.Individual ob

AttributeError: 'str' object has no attribute 'is_a'

In [26]:
ontology2.save_ontology('sample2_test.ttl')

In [27]:
query2 = """
Prefix : <http://www.purl.org/infection_trans#>
SELECT ?ex_day ?ex_hour ?host_id ?inoculationStatus ?sample_type  ?sample_result
Where{
?experiment a :Experiment;
                          :experimentDay ?ex_day;
                          :hasMeasurement ?measurement.
              ?measurement a :Measurement;
                           :hasHost ?host.      
      

}"""

qres2 = ontology2.graph.query(query2)

In [28]:
result = pd.DataFrame(
        data=([None if x is None else x.toPython() for x in row] for row in qres2),
        columns=[str(x) for x in qres2.vars],
    )

result

Unnamed: 0,ex_day,ex_hour,host_id,inoculationStatus,sample_type,sample_result
0,0,,,,,
1,0,,,,,
2,21,,,,,
3,21,,,,,
