In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Relapse target of every clinical record, as exported by the preprocessing step.
data_kg = pd.read_csv('../PreprocessData/data/target.csv')
# Folder with one file per cluster; change the path segment to inspect another
# clustering method.
path = '../PatternDetection/clusteringMeasures/TransH/SemEP_50/clusters/'  #Kmeans, METIS, SemEP


list_donor = []
# os.listdir() yields entries in arbitrary order; sorting keeps the labelling
# reproducible should a record ever be listed in more than one cluster file.
entries = sorted(os.listdir(path))
for file in entries:
    cluster_file = os.path.join(path, file)
    if not os.path.isfile(cluster_file):
        # Skip sub-directories (e.g. notebook checkpoint folders) instead of
        # letting read_csv fail on them.
        continue
    cls = pd.read_csv(cluster_file, delimiter="\t", header=None)
    cls.columns = ['ClinicalRecord']
    # The cluster id is the token after the first '-' in the extension-less file name.
    cluster_id = os.path.splitext(file)[0].split('-')[1]
    data_kg.loc[data_kg.ClinicalRecord.isin(cls.ClinicalRecord), 'cluster'] = 'Cluster ' + cluster_id
    # extend() appends in place; rebuilding the list on every file was quadratic.
    list_donor.extend(cls.ClinicalRecord)

# Keep the clustered records whose outcome is known, then strip the 'entity:'
# prefix from every value. replace() returns a new frame, so nothing is
# modified in place on a filtered selection.
in_some_cluster = data_kg.ClinicalRecord.isin(list_donor)
outcome_known = data_kg.Relapse != 'entity:UnKnown'
data_kg = data_kg.loc[in_some_cluster & outcome_known].replace('entity:', '', regex=True)

display(data_kg.head(), data_kg.shape)

Unnamed: 0,ClinicalRecord,Relapse,cluster
0,718710_ClinicalRecord,No_Progression,Cluster 0
1,2432630_ClinicalRecord,Progression,Cluster 0
2,583611_ClinicalRecord,No_Progression,Cluster 0
3,329355_ClinicalRecord,Progression,Cluster 1
4,760472_ClinicalRecord,Progression,Cluster 1


(962, 3)

In [2]:
from rdflib import Graph
import pandas as pd
from rdflib.plugins.sparql.processor import SPARQLResult
import seaborn as sns

def sparql_results_to_df(results: SPARQLResult) -> pd.DataFrame:
    """
    Turn the rows of an rdflib SPARQL result into a `pandas.DataFrame`.

    Every bound term is converted through its ``toPython()`` method, unbound
    (``None``) values stay ``None``, and the columns are named after the
    query variables in ``results.vars``.
    See https://github.com/RDFLib/rdflib/issues/1179.
    """
    column_names = [str(variable) for variable in results.vars]
    records = []
    for row in results:
        records.append([term.toPython() if term is not None else None for term in row])
    return pd.DataFrame(data=records, columns=column_names)

In [3]:
# Folder of the P4-LUCAT data; note that this rebinds `path`, which the first
# cell used for the cluster folder.
path = '../../store_data/P4-LUCAT/'
g = Graph()
# Read the N-Triples file into the graph; as the cell's last expression the
# value returned by parse() is echoed below the cell.
g.parse(source=path + "KG0.nt", format="nt")

<Graph identifier=Ncb239b5a283040f9ab857ff073175bf4 (<class 'rdflib.graph.Graph'>)>

In [4]:
def get_triples(graph):
    """
    Query ``graph`` for the age category, stage and gender of clinical records.

    The SPARQL pattern requires all three properties (hasAgeCategory, hasStage
    and hasGender), so a record lacking any of them is not returned.

    Parameters
    ----------
    graph
        Object exposing ``query(sparql_string)`` whose result is accepted by
        ``sparql_results_to_df`` (an rdflib ``Graph`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per distinct (ClinicalRecord, Age, Stage, Gender) binding, with
        the ``http://research.tib.eu/p4-lucat/entity/`` prefix removed from
        the values.
    """
    query = """    
    select distinct ?ClinicalRecord ?Age ?Stage ?Gender
    where {
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasAgeCategory> ?Age .
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasStage> ?Stage .
        ?ClinicalRecord <http://research.tib.eu/p4-lucat/vocab/hasGender> ?Gender
        }
        """
    triples = sparql_results_to_df(graph.query(query))
    # regex=True is what makes replace() act on substrings, so the pattern is a
    # regular expression: the dots of the host name are escaped to match
    # literally instead of matching any character.
    entity_prefix = r'http://research\.tib\.eu/p4-lucat/entity/'
    return triples.replace(entity_prefix, '', regex=True)

def barplot(size, data, c, target):
    """
    Draw grouped bars of ``counts`` per ``Parameter`` (one bar per ``Category``)
    and save the figure as PNG and PDF under ``Plots/``.

    Parameters
    ----------
    size : tuple
        Figure size handed to matplotlib, e.g. (6, 4) or (5.5, 5).
    data : pandas.DataFrame
        Must provide the columns ``Parameter``, ``counts`` and ``Category``.
    c
        Cluster identifier; its ``str()`` form goes into the title and file names.
    target : str
        Patient group label used in the title and file names.

    Returns
    -------
    None
        The figure is closed after it has been written to disk.
    """
    # Own figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=size)
    # `data` is passed by keyword (as in barplotV1) so the call does not depend
    # on which argument seaborn accepts first positionally.
    sns.barplot(data=data, x="Parameter", y="counts", hue="Category", palette="muted",
                dodge=True, ax=ax)  # deep, muted
    # Two-column legend whose upper-left corner is anchored at the top centre of the axes
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=labels, loc='upper left', bbox_to_anchor=(.5, 1), ncol=2, title='Category')
    ax.set_ylabel('Normalized Clinical Records')
    ax.set_title('Description of '+ target+' patients for cluster '+str(c))
    # Adjust layout to make the elements closer
    fig.tight_layout()
    # savefig does not create missing folders.
    os.makedirs('Plots', exist_ok=True)
    base_name = 'Plots/cls'+str(c)+'_'+target
    fig.savefig(base_name+'.png', bbox_inches='tight')
    fig.savefig(base_name+'.pdf', format='pdf', bbox_inches='tight')
    plt.close(fig)

def barplotV1(data, c, target):
    """
    Draw one bar per ``Category`` (coloured by ``Parameter``) and save the
    figure as ``*_v2.png`` / ``*_v2.pdf`` under ``Plots/``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Category``, ``counts`` and ``Parameter``.
    c
        Cluster identifier; its ``str()`` form goes into the title and file names.
    target : str
        Patient group label used in the title and file names.

    Returns
    -------
    None
        The figure is closed after it has been written to disk.
    """
    # Always start from a fresh figure; drawing on the implicit current figure
    # would paint over whatever an earlier cell left open.
    fig, ax = plt.subplots()
    sns.barplot(data=data, x="Category", y="counts", hue="Parameter", palette="muted",
                alpha=.8, dodge=False, linewidth=0.0, edgecolor='black', ax=ax)  # palette="dark", alpha=.8, height=8
    # Rotate the category labels without re-setting the tick label texts.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_ylabel('Normalized Clinical Records')
    ax.set_title('Description of '+ target+' patients for cluster '+str(c))
    # Adjust layout to make the elements closer
    fig.tight_layout()
    # savefig does not create missing folders.
    os.makedirs('Plots', exist_ok=True)
    base_name = 'Plots/cls'+str(c)+'_'+target+'_v2'
    fig.savefig(base_name+'.png', bbox_inches='tight')
    fig.savefig(base_name+'.pdf', format='pdf', bbox_inches='tight')
    plt.close(fig)
    
def group_by_parameter(clinicalData, num_patient, parameters=('Stage', 'Age', 'Gender')):
    """
    Count the records per cluster and category of each parameter, in long format.

    Parameters
    ----------
    clinicalData : pandas.DataFrame
        Must contain a ``cluster`` column and one column per entry of ``parameters``.
    num_patient : number
        Denominator used to normalise the counts.
    parameters : iterable of str, optional
        Columns to summarise; defaults to ``('Stage', 'Age', 'Gender')``.
        Must not be empty (``pd.concat`` rejects an empty list).

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster``, ``Category`` (the value of the parameter),
        ``counts`` (group size divided by ``num_patient``) and ``Parameter``
        (the summarised column name), ordered by ``Parameter``. The input
        frame is not modified.
    """
    per_parameter = []
    for parameter in parameters:
        counted = (
            clinicalData.groupby(['cluster', parameter])
            .size()
            .reset_index(name='counts')
            .rename(columns={parameter: 'Category'})
        )
        counted['Parameter'] = parameter
        per_parameter.append(counted)

    grouped_plot = pd.concat(per_parameter)
    grouped_plot['counts'] = grouped_plot['counts'] / num_patient
    # A stable sort keeps the (cluster, Category) order produced by groupby
    # within each parameter, so the bar order of the plots is deterministic.
    return grouped_plot.sort_values(by='Parameter', kind='mergesort')

In [5]:
# Age category, stage and gender of the clinical records found in the knowledge graph
clinicalData = get_triples(graph=g)
clinicalData

Unnamed: 0,ClinicalRecord,Age,Stage,Gender
0,2703745_ClinicalRecord,Old,IIIB,Male
1,212473_ClinicalRecord,Old,IA,Male
2,3528_ClinicalRecord,Old,IA,Female
3,2717319_ClinicalRecord,Old,IB,Female
4,589980_ClinicalRecord,Old,IIIB,Male
...,...,...,...,...
1236,2682136_ClinicalRecord,Young,IIIA,Male
1237,2860649_ClinicalRecord,Young,IIIC,Male
1238,839359_ClinicalRecord,Young,IIIB,Male
1239,2428845_ClinicalRecord,Young,IV,Male


In [6]:
# Keep the clustered records that also have Age, Stage and Gender in the
# knowledge graph. An inner join is the direct form of "outer merge, then keep
# _merge == 'both'" and preserves the row order of data_kg.
# 1887213_ClinicalRecord does not have an age, so it is dropped here.
clinicalData_cls = data_kg.merge(clinicalData, how='inner', on='ClinicalRecord')

# Collapse the outcome labels into a binary relapse / no-relapse flag.
relapse_labels = {'No_Progression': 'No relapse',
                  'Relapse': 'Relapse',
                  'Progression': 'Relapse'}
# Collapse sub-stages into their main stage and translate the non-English labels.
stage_groups = {'IA': 'I', 'IB': 'I', 'IA1': 'I', 'IA2': 'I',
                'IIA': 'II', 'IIB': 'II',
                'IIIA': 'III', 'IIIB': 'III', 'IIIC': 'III',
                'IVA': 'IV', 'IVB': 'IV',
                'Limitado': 'Limited', 'Otros': 'Others', 'Extendido': 'Extended'}
# Assign the recoded columns back explicitly: calling replace(inplace=True) on
# a column selection is a chained in-place update, which does not write through
# to the DataFrame once pandas Copy-on-Write is active.
clinicalData_cls = clinicalData_cls.assign(
    Relapse=clinicalData_cls['Relapse'].replace(relapse_labels),
    Stage=clinicalData_cls['Stage'].replace(stage_groups),
)
clinicalData_cls

Unnamed: 0,ClinicalRecord,Relapse,cluster,Age,Stage,Gender
0,718710_ClinicalRecord,No relapse,Cluster 0,Old,II,Male
1,2432630_ClinicalRecord,Relapse,Cluster 0,Old,III,Male
2,583611_ClinicalRecord,No relapse,Cluster 0,Old,III,Female
3,329355_ClinicalRecord,Relapse,Cluster 1,Young,IV,Male
4,760472_ClinicalRecord,Relapse,Cluster 1,Old,II,Male
...,...,...,...,...,...,...
957,621354_ClinicalRecord,Relapse,Cluster 0,Old,III,Female
958,375428_ClinicalRecord,Relapse,Cluster 1,Young,II,Female
959,302278_ClinicalRecord,Relapse,Cluster 0,Old,II,Male
960,158421_ClinicalRecord,Relapse,Cluster 0,Young,II,Female


In [7]:
# Split the clustered records by outcome
no_relapse_cls = clinicalData_cls[clinicalData_cls['Relapse'].eq('No relapse')]
relapse_cls = clinicalData_cls[clinicalData_cls['Relapse'].eq('Relapse')]

In [8]:
# Category counts of the relapse patients, normalised by the number of all clustered records
df_relapse_cls = group_by_parameter(relapse_cls, len(clinicalData_cls))

f1 = df_relapse_cls[df_relapse_cls['cluster'] == 'Cluster 0']
f2 = df_relapse_cls[df_relapse_cls['cluster'] == 'Cluster 1']

# 'Cluster 0' / 'Cluster 1' are reported as cluster 1 / 2 in titles and file names
for number, frame in enumerate((f1, f2), start=1):
    barplot((6, 4), frame, number, 'relapse')

for number, frame in enumerate((f1, f2), start=1):
    barplotV1(frame, number, 'relapse')

In [9]:
# Category counts of the no-relapse patients, normalised by the number of all clustered records
df_norelapse_cls = group_by_parameter(no_relapse_cls, len(clinicalData_cls))

f1 = df_norelapse_cls[df_norelapse_cls['cluster'] == 'Cluster 0']
f2 = df_norelapse_cls[df_norelapse_cls['cluster'] == 'Cluster 1']

# 'Cluster 0' / 'Cluster 1' are reported as cluster 1 / 2 in titles and file names
for number, frame in enumerate((f1, f2), start=1):
    barplot((6, 4), frame, number, 'no relapse')

for number, frame in enumerate((f1, f2), start=1):
    barplotV1(frame, number, 'no relapse')