In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from rdflib import Graph
from rdflib.plugins.sparql.processor import SPARQLResult

def sparql_results_to_df(results: SPARQLResult) -> pd.DataFrame:
    """
    Convert the rows of an rdflib SPARQL query result into a `pandas.DataFrame`.

    Each bound term is turned into its native Python value via `toPython()`;
    unbound (`None`) entries are kept as `None`. Column names are the string
    form of the query's projected variables (`results.vars`).
    See https://github.com/RDFLib/rdflib/issues/1179.
    """
    column_names = list(map(str, results.vars))
    records = []
    for row in results:
        # Unbound variables arrive as None and have no toPython() to call.
        records.append(
            [term.toPython() if term is not None else None for term in row]
        )
    return pd.DataFrame(data=records, columns=column_names)

In [2]:
# Load the per-record outcome labels.
data_kg = pd.read_csv('../PreprocessData/data/target.csv')

# Drop records whose outcome is unknown. The explicit copy makes the filtered
# frame independent of the one read from disk, so the column assignment below
# is unambiguous.
data_kg = data_kg.loc[data_kg.Relapse!='entity:UnKnown'].copy()

# Collapse the outcome labels into two classes: progression counts as relapse.
replacement_mapping_dict = {'entity:No_Progression': 'No relapse',
                            'entity:Relapse': 'Relapse',
                            'entity:Progression': 'Relapse'}
# Assign the result back rather than calling `replace(..., inplace=True)` on the
# column selection: that chained in-place form is deprecated and, with pandas
# Copy-on-Write enabled, leaves `data_kg` unchanged.
data_kg['Relapse'] = data_kg['Relapse'].replace(replacement_mapping_dict)
# Strip the remaining 'entity:' prefix from every value in the frame.
data_kg = data_kg.replace('entity:', '', regex=True)

display(data_kg.head(), data_kg.shape)

Unnamed: 0,ClinicalRecord,Relapse
0,718710_ClinicalRecord,No relapse
1,2432630_ClinicalRecord,Relapse
2,583611_ClinicalRecord,No relapse
3,329355_ClinicalRecord,Relapse
4,760472_ClinicalRecord,Relapse


(962, 2)

In [3]:
# Directory holding the P4-LUCAT knowledge graph dumps.
path = '../../store_data/P4-LUCAT/'

# Load the N-Triples dump into an rdflib graph.
g = Graph()
g.parse(source=path + "KG0.nt", format="nt")

<Graph identifier=Na16a0669bf2b47e9b6b6a77b7dd3b982 (<class 'rdflib.graph.Graph'>)>

In [4]:
def get_triples(graph, query):
    """Run a SPARQL `query` on `graph` and return the result rows as a DataFrame.

    The result set is converted with `sparql_results_to_df`, then the P4-LUCAT
    entity namespace prefix is removed (as a regex replacement) wherever it
    occurs in the values, leaving only the local entity names.
    """
    entity_prefix = 'http://research.tib.eu/p4-lucat/entity/'
    frame = sparql_results_to_df(graph.query(query))
    return frame.replace(entity_prefix, '', regex=True)

In [5]:
# Clinical records that have at least one biomarker triple, together with
# their age category and gender.
q_biomarker = """    
    select distinct ?ClinicalRecord ?Age ?Gender
    where {
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasBiomarker> ?Biomarker .
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasAgeCategory> ?Age .
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasGender> ?Gender .
        }
        """
# Every row returned by this query has a biomarker, so flag them all as 'Yes'.
biomarker = get_triples(g, q_biomarker).assign(Biomarker='Yes')
biomarker

Unnamed: 0,ClinicalRecord,Age,Gender,Biomarker
0,2703745_ClinicalRecord,Old,Male,Yes
1,2717319_ClinicalRecord,Old,Female,Yes
2,589980_ClinicalRecord,Old,Male,Yes
3,2429612_ClinicalRecord,Old,Male,Yes
4,2789786_ClinicalRecord,Old,Male,Yes
...,...,...,...,...
542,1049197_ClinicalRecord,Young,Female,Yes
543,876727_ClinicalRecord,Young,Male,Yes
544,2682136_ClinicalRecord,Young,Male,Yes
545,2860649_ClinicalRecord,Young,Male,Yes


In [6]:
# All clinical records with an age category and gender, regardless of biomarkers.
q_2 = """    
    select distinct ?ClinicalRecord ?Age ?Gender
    where {
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasAgeCategory> ?Age .
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasGender> ?Gender .
        }
        """
df_1 = get_triples(g, q_2)

# Anti-join: keep the rows of `df_1` that have no match in `biomarker`.
record_keys = ['ClinicalRecord', 'Age', 'Gender']
all_vs_biomarker = df_1.merge(biomarker, how='outer', on=record_keys, indicator=True)
only_in_all = all_vs_biomarker['_merge'] == 'left_only'
non_biomarker = all_vs_biomarker.loc[only_in_all, record_keys].assign(Biomarker='No')
non_biomarker

Unnamed: 0,ClinicalRecord,Age,Gender,Biomarker
1,212473_ClinicalRecord,Old,Male,No
2,3528_ClinicalRecord,Old,Female,No
5,219778_ClinicalRecord,Old,Male,No
7,1169321_ClinicalRecord,Old,Male,No
9,816028_ClinicalRecord,Old,Female,No
...,...,...,...,...
1229,2492116_ClinicalRecord,Young,Male,No
1230,541088_ClinicalRecord,Young,Male,No
1235,2545879_ClinicalRecord,Young,Male,No
1238,839359_ClinicalRecord,Young,Male,No


In [7]:
# Stack the biomarker / non-biomarker cohorts and attach the relapse label.
# The merge uses the default inner join, so only records present in both the
# cohorts and `data_kg` are kept.
data_corr = (
    pd.concat([biomarker, non_biomarker])
    .merge(data_kg, on='ClinicalRecord')
)
data_corr

Unnamed: 0,ClinicalRecord,Age,Gender,Biomarker,Relapse
0,2717319_ClinicalRecord,Old,Female,Yes,Relapse
1,2429612_ClinicalRecord,Old,Male,Yes,No relapse
2,2782768_ClinicalRecord,Old,Female,Yes,Relapse
3,2627112_ClinicalRecord,Old,Male,Yes,Relapse
4,989268_ClinicalRecord,Old,Male,Yes,Relapse
...,...,...,...,...,...
956,611660_ClinicalRecord,Young,Male,No,Relapse
957,2492116_ClinicalRecord,Young,Male,No,Relapse
958,541088_ClinicalRecord,Young,Male,No,Relapse
959,839359_ClinicalRecord,Young,Male,No,Relapse


In [8]:
from ydata_profiling import ProfileReport

In [9]:
# Profile the merged cohort table and save the report as an HTML file.
profile = ProfileReport(data_corr, title="Profiling Report")
profile.to_file('output_v1.html')

In [10]:
# Render the profiling report inline as notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [9]:
# Number of distinct clinical records in the merged table
# (dropna=False so a missing value would be counted, as `unique()` does).
data_corr['ClinicalRecord'].nunique(dropna=False)

961

In [10]:
# Records that have a relapse label in `data_kg` but are absent from `data_corr`.
kg_vs_corr = data_kg.merge(data_corr, how='outer', on='ClinicalRecord', indicator=True)
a = kg_vs_corr[kg_vs_corr['_merge'] == 'left_only']
a

Unnamed: 0,ClinicalRecord,Relapse_x,Age,Gender,Biomarker,Relapse_y,_merge
80,1887213_ClinicalRecord,Relapse,,,,,left_only


In [11]:
# Look up the one record flagged as missing from `data_corr`.
data_corr[data_corr['ClinicalRecord'] == '1887213_ClinicalRecord']

Unnamed: 0,ClinicalRecord,Age,Gender,Biomarker,Relapse
