# Read directly from the JSON label

In [1]:
import json
from glob import glob
import pandas as pd
import os

# Each entry matched under the dataset directory is treated as one case and
# its base name is used as the case identifier. Without ``recursive=True``
# the ``**`` pattern matches like a plain ``*``.
dataset_dir = "../../previous_dataset/PraxisData"
case_paths = sorted(glob(f"{dataset_dir}/**"))
case_names = [os.path.basename(case_path) for case_path in case_paths]
print(f"We have {len(case_names)} cases in the dataset")

# Load the annotation export; it is indexed and iterated as a list of records below.
json_dir = "./pretty.json"
with open(json_dir, 'r') as data_file:
    datas = json.load(data_file)

print(f"There are {len(datas)} cases in JSON file")

We have 3794 cases in the dataset
There are 4510 cases in JSON file


In [3]:
# Peek at the first JSON record to see which top-level fields a case carries.
data = datas[0]
data.keys()

dict_keys(['studyInstanceUID', 'discrepancy', 'projectName', 'finalAnnotation', 'annotationsOriginal'])

In [2]:
def detect(sub_dictionary, pathology):
    """Drop the entries of ``sub_dictionary`` that are not wanted for ``pathology``.

    The dictionary is modified in place and the same object is returned.

    * ``pathology == 6`` (ACL): only the three ACL cruciate-ligament keys are kept.
    * ``pathology == 7`` (PCL): only the three PCL cruciate-ligament keys are kept.
    * ``pathology == 15`` or ``16``: every key containing ``'Degenerativ'`` is removed.
    * any other value: nothing is removed.
    """
    if pathology == 6:  # ACL
        wanted = ('ACL_Vord. Kreuzband-Faserunterbrechung',
                  'ACL_Vord. Kreuzband-Konturunterbrechung',
                  'ACL_Vord. Kreuzband-Resorption')
        doomed = [key for key in sub_dictionary if key not in wanted]
    elif pathology == 7:  # PCL
        wanted = ('PCL_Hint. Kreuzband-Faserunterbrechung',
                  'PCL_Hint. Kreuzband-Konturunterbrechung',
                  'PCL_Hint. Kreuzband-Resorption')
        doomed = [key for key in sub_dictionary if key not in wanted]
    elif pathology in (15, 16):
        doomed = [key for key in sub_dictionary if 'Degenerativ' in key]
    else:
        doomed = []

    # The keys were collected up front, so deleting here never mutates a
    # dictionary that is being iterated.
    for key in doomed:
        del sub_dictionary[key]

    return sub_dictionary
                

In [3]:
def _collect_3d_findings(element, pathology_name, sub_dictionary):
    """Copy every 3D marker of one annotation element into ``sub_dictionary``.

    Markers are read from ``element['student3d']`` (optional, a list of marker
    lists) and then from ``element['3d']`` (required, a list of markers). Each
    marker contributes ``"<pathology_name>_<Title-Cased marker name>"`` mapped
    to its ``(x, y, z)`` coordinate as floats. A later marker with the same
    name overwrites an earlier one.
    """
    student_groups = element['student3d'] if 'student3d' in element else []
    markers = [marker for group in student_groups for marker in group]
    markers.extend(element['3d'])

    for marker in markers:
        position = marker['coordinate']
        coordinate = tuple(float(position[axis]) for axis in ("x", "y", "z"))
        disease_name = f"{pathology_name}_{marker['name'].title()}"
        sub_dictionary[disease_name] = coordinate


def whatever(whole_case, pathology):
    """Collect the 3D finding coordinates of one pathology for one case.

    Args:
        whole_case: One record of the JSON export. It must provide
            ``'annotationsOriginal'`` and ``'finalAnnotation'``.
        pathology: Index into each annotation's ``'elements'`` list. Must be
            one of 6, 7, 15 or 16, otherwise the name lookup raises ``KeyError``.

    Returns:
        A dict mapping ``"<pathology name>_<finding name>"`` to an
        ``(x, y, z)`` tuple of floats, after filtering by ``detect``.
    """
    pathology_dictionary = {
        '6': 'ACL',
        '7': 'PCL',
        '15': 'Inner_Meniscus',
        '16': 'Outer_Meniscus'
    }
    pathology_name = pathology_dictionary[str(pathology)]

    sub_dictionary = {}

    # Original annotations first, so the final annotation processed afterwards
    # wins when both provide a marker with the same name.
    for disease in whole_case['annotationsOriginal']:
        # NOTE(review): compared against the *string* 'false'; if the export
        # stores a JSON boolean this never matches -- confirm.
        if disease['isAnnotationCorrect'] == 'false':
            continue
        # Fix: the student markers are read from the element itself. The
        # previous code tested `'student3d' in element` but then indexed
        # `disease['student3d']`, unlike the finalAnnotation branch.
        element = disease['annotation']['elements'][pathology]
        _collect_3d_findings(element, pathology_name, sub_dictionary)

    final_element = whole_case['finalAnnotation']['elements'][pathology]
    _collect_3d_findings(final_element, pathology_name, sub_dictionary)

    return detect(sub_dictionary, pathology)

In [4]:
# Positions of the pathologies inside each annotation's `elements` list.
ACL = 6               # ACL is Nr. 6
PCL = 7               # PCL is Nr. 7
Inner_Meniscus = 15   # Inner_Meniscus is Nr. 15
Outer_Meniscus = 16   # Outer_Meniscus is Nr. 16

# Records belonging to these projects are skipped.
bad_project = ['test_mpi_2','test_mpi3','Schulter ohne Text','Schulter mit Text']

# A set gives constant-time membership tests; scanning the `case_names` list
# for every JSON record was needlessly quadratic.
known_case_names = set(case_names)

whole_data = []
for whole_case in datas:
    name = whole_case['studyInstanceUID']
    projectName = whole_case['projectName']

    # Keep only records that exist in the dataset and come from a usable project.
    if name not in known_case_names or projectName in bad_project:
        continue

    sub_dictionary = {'StudyUID': name, 'projectName': projectName}
    for pathology in (ACL, PCL, Inner_Meniscus, Outer_Meniscus):
        sub_dictionary.update(whatever(whole_case, pathology))

    whole_data.append(sub_dictionary)

# One row per kept record; findings a record does not have become NaN.
df = pd.DataFrame(whole_data)

# Give an unnamed index an empty name. This does not affect the CSV below,
# which is written with index=False.
if df.index.name is None:
    df.index.name = ''

# Write the DataFrame to a CSV file
df.to_csv('ACL_Meniscus_ProjectName.csv', na_rep='', index=False)

*Sort the disease columns alphabetically, keeping `StudyUID` and `projectName` as the first two columns*

In [5]:
import pandas as pd
unsorted_csv = pd.read_csv("ACL_Meniscus_ProjectName.csv")

# The first two columns are the identifiers; everything after them is a
# disease column and gets sorted alphabetically.
disease_columns = list(unsorted_csv.columns[2:])
disease_columns.sort()
print(len(disease_columns))

new_order = ['StudyUID', 'projectName'] + disease_columns

# Rewrite the file in place with the new column order.
sorted_csv = unsorted_csv.reindex(columns=new_order)
sorted_csv.to_csv('ACL_Meniscus_ProjectName.csv', index=False)

16


*Remove duplicated studies: when a StudyUID appears under two projects, drop the row whose project comes earlier in `good_project`*

In [6]:
# Of two project names, return the one that appears earlier in `good_project`
# so that the caller can delete its row.

good_project = ['Knie_ohne_Textmarker_100','knie mit textmarker','Knie_Markov_Textmarker_JMH']
def check_projectNames(projectNames):
    """Return whichever of the first two project names comes earlier in ``good_project``.

    Args:
        projectNames: Sequence whose first two items are project names listed
            in ``good_project``.

    Returns:
        The name with the smaller position in ``good_project``, or ``None``
        (after printing ``"Error!"``) when both names share the same position.

    Raises:
        ValueError: If either name is not in ``good_project``.
    """
    name_1, name_2 = projectNames[0], projectNames[1]

    # Look both names up independently. The previous if/elif inside a single
    # loop never assigned the second position when both names were equal, so
    # the "Error!" branch was unreachable and the call failed with
    # UnboundLocalError instead.
    try:
        name_1_num = good_project.index(name_1)
        name_2_num = good_project.index(name_2)
    except ValueError:
        raise ValueError(
            f"Unknown project name in {list(projectNames[:2])!r}; "
            f"expected names from {good_project!r}"
        ) from None

    if name_1_num > name_2_num:
        return name_2
    elif name_1_num < name_2_num:
        return name_1
    else:
        print("Error!")

In [7]:
import pandas as pd
import numpy as np

# NOTE(review): the cells above write 'ACL_Meniscus_ProjectName.csv', but this
# cell reads 'whatever.csv' -- confirm how that file is produced.
location_csv = pd.read_csv('whatever.csv')

# StudyUIDs that occur exactly twice, in sorted order.
uid_counts = location_csv['StudyUID'].value_counts()
problem_names = sorted(uid_counts[uid_counts == 2].index.tolist())

# For each such study, drop the row of the project that check_projectNames
# selects for deletion.
for name in problem_names:
    same_study = location_csv['StudyUID'] == name
    projectNames = location_csv.loc[same_study, 'projectName'].tolist()
    wrong_projectName = check_projectNames(projectNames)
    # Fix: the mask must come from `location_csv` itself. The previous code
    # used `df['projectName']`, a different frame left over from an earlier
    # cell, whose index does not correspond to this file's rows.
    row_to_delete = location_csv[same_study & (location_csv['projectName'] == wrong_projectName)].index
    location_csv = location_csv.drop(index=row_to_delete)

location_csv.to_csv('whatever.csv',index=False)

*Make a new label only with **ACL**, **PCL**, **Inner_Meniscus** and **Outer_Meniscus***

In [39]:
import pandas as pd

diseases_group = ['ACL','PCL','Inner_Meniscus','Outer_Meniscus']

sorted_csv = pd.read_csv("./whatever.csv")
# Every column after the two identifier columns is a disease column.
disease_names = sorted_csv.columns.tolist()[2:]
case_names = sorted_csv['StudyUID'].tolist()
projectNames = sorted_csv['projectName'].tolist()

def grouped_list(group_name):
    """Return one 0/1 flag per entry of ``case_names`` for ``group_name``.

    A flag is 1 when, among the rows of ``sorted_csv`` with that StudyUID, at
    least one disease column whose name contains ``group_name`` holds a
    non-missing value; otherwise it is 0.
    """
    group_columns = [column for column in disease_names if group_name in column]
    flags = []
    for name in case_names:
        rows = sorted_csv.loc[sorted_csv['StudyUID'] == name, group_columns]
        flags.append(1 if rows.notna().any().any() else 0)
    return flags

# Identifier columns first, then one binary column per disease group.
dictionary = {'StudyUID': case_names, 'projectName': projectNames}
dictionary.update({group_name: grouped_list(group_name) for group_name in diseases_group})

df = pd.DataFrame(dictionary)
data_dir = "./new_grouped_whatever.csv"
df.to_csv(data_dir,index=False)

In [7]:
import pandas as pd

# List the disease columns, i.e. everything after StudyUID and projectName.
csv = pd.read_csv("./ACL_Meniscus_ProjectName.csv")
names = list(csv.columns[2:])
for name in names:
    print(name)

ACL_Vord. Kreuzband-Faserunterbrechung
ACL_Vord. Kreuzband-Konturunterbrechung
ACL_Vord. Kreuzband-Resorption
Inner_Meniscus_Horizontal R.
Inner_Meniscus_Komplex/Lappen/Longitudinal R.
Inner_Meniscus_Korbhenkelriss
Inner_Meniscus_Radiaer R.
Inner_Meniscus_Wurzel R.
Outer_Meniscus_Horizontal R.
Outer_Meniscus_Komplex/Lappen/Longitudinal R.
Outer_Meniscus_Korbhenkelriss
Outer_Meniscus_Radiaer R.
Outer_Meniscus_Wurzel R.
PCL_Hint. Kreuzband-Faserunterbrechung
PCL_Hint. Kreuzband-Konturunterbrechung
PCL_Hint. Kreuzband-Resorption


# Compare with the old label

In [7]:
# Check how many wrong cases in ACL
import pandas as pd

label_csv_old = pd.read_csv("../combination.csv")
label_csv_new = pd.read_csv("./new_grouped_whatever.csv")
old_names = label_csv_old["StudyUID"].tolist()
new_names = label_csv_new["StudyUID"].tolist()
print(len(old_names))
print(len(new_names))


def _labels_for(frame, study_uid, column):
    """Return the values of ``column`` for all rows of ``frame`` with this StudyUID."""
    return frame.loc[frame['StudyUID'] == study_uid, column].tolist()


# Only studies present in both label files can be compared.
new_name_set = set(new_names)
shared_names = [uid for uid in old_names if uid in new_name_set]

# --- ACL: collect disagreements and split them by direction ---------------
wrong_cases_ACL = []
one_zero_cases = []
zero_one = 0
for old_name in shared_names:
    label_old = _labels_for(label_csv_old, old_name, "ACL")
    label_new = _labels_for(label_csv_new, old_name, "ACL")
    if label_old == label_new:
        continue
    wrong_cases_ACL.append(old_name)
    if label_old == [0] and label_new == [1]:
        zero_one += 1
    else:
        # Every other disagreement is counted as old=1, new=0.
        one_zero_cases.append(old_name)
one_zero = len(one_zero_cases)

print(f"There are {len(wrong_cases_ACL)} cases wrong in ACL")
print(f"\nThere are {zero_one} cases, old=0, new=1,"
      f"\nThere are {one_zero} cases, old=1, new=0")

# --- Meniscus: the new label is positive if inner OR outer meniscus is ----
wrong_cases_Meniscus = []
meniscus_one_zero_cases = []
for old_name in shared_names:
    label_old = _labels_for(label_csv_old, old_name, "Meniscus")
    label_new_Inner = _labels_for(label_csv_new, old_name, "Inner_Meniscus")
    label_new_Outer = _labels_for(label_csv_new, old_name, "Outer_Meniscus")
    label_new = [1] if (label_new_Inner == [1] or label_new_Outer == [1]) else [0]
    if label_old != label_new:
        wrong_cases_Meniscus.append(old_name)

print(f"There are {len(wrong_cases_Meniscus)} cases wrong in Meniscus")

3794
3511
There are 87 cases wrong in ACL

There are 37 cases, old=0, new=1,
There are 50 cases, old=1, new=0
There are 112 cases wrong in Meniscus


# Build a label file containing only the cases on which the old and new labels agree

In [3]:
import pandas as pd

label_csv_old = pd.read_csv("../combination.csv")
label_csv_new = pd.read_csv("./new_grouped_whatever.csv")
old_names = label_csv_old["StudyUID"].tolist()
new_names = label_csv_new["StudyUID"].tolist()

def check_same(name):
    """Return ``name`` if the old and new labels agree for that study, else ``None``.

    Agreement requires equal ACL labels and an old ``Meniscus`` label equal to
    the new label derived as "Inner_Meniscus or Outer_Meniscus is 1".

    Fix: the lookups use the ``name`` argument. The previous body read the
    global loop variable ``old_name`` and ignored its parameter, so it only
    worked when called from a loop that happened to use that variable name.
    """
    old_rows = label_csv_old.loc[label_csv_old['StudyUID'] == name]
    new_rows = label_csv_new.loc[label_csv_new['StudyUID'] == name]

    # ACL
    label_old_ACL = old_rows["ACL"].tolist()
    label_new_ACL = new_rows["ACL"].tolist()

    # Meniscus
    label_old_Meniscus = old_rows["Meniscus"].tolist()
    label_new_Inner = new_rows["Inner_Meniscus"].tolist()
    label_new_Outer = new_rows["Outer_Meniscus"].tolist()
    if label_new_Inner == [1] or label_new_Outer == [1]:
        label_new_Meniscus = [1]
    else:
        label_new_Meniscus = [0]

    if label_old_ACL == label_new_ACL and label_old_Meniscus == label_new_Meniscus:
        return name
    return None


# Only studies present in both files can be checked; a set keeps the
# membership test cheap.
new_name_set = set(new_names)

names_list = []
for old_name in old_names:
    if old_name in new_name_set:
        name = check_same(old_name)
        if name is not None:
            names_list.append(name)

print(f"We still have {len(names_list)} cases can be trained")

We still have 3317 cases can be trained


In [10]:
# Build the CSV of agreed cases: for every name in names_list (in that order)
# take its row(s) from the new labels, then stack them into one frame.
new_file = [
    label_csv_new.loc[label_csv_new['StudyUID'] == name]
    for name in names_list
]

combined_df = pd.concat(new_file, ignore_index=True)
combined_df.to_csv('new_grouped_whatever_new.csv', index=False)

In [12]:
# Count the positive cases per disease group in the agreed-label file.
new_grouped_new_csv = pd.read_csv('new_grouped_whatever_new.csv')

diseases = ['ACL','PCL','Inner_Meniscus','Outer_Meniscus']
for disease in diseases:
    sick_list = new_grouped_new_csv[disease].tolist()
    # A case counts as positive when its label renders as the text '1'.
    sick_num = sum(1 for label in sick_list if str(label) == '1')
    print(f"There are {sick_num} cases {disease} among {len(sick_list)} patients.")


There are 180 cases ACL among 3317 patients.
There are 16 cases PCL among 3317 patients.
There are 943 cases Inner_Meniscus among 3317 patients.
There are 286 cases Outer_Meniscus among 3317 patients.


In [15]:
# Count the positive cases per disease group in the full new-label file.
label_csv_new = pd.read_csv("./new_grouped_whatever.csv")

diseases = ['ACL','PCL','Inner_Meniscus','Outer_Meniscus']
for disease in diseases:
    sick_list = label_csv_new[disease].tolist()
    # A case counts as positive when its label renders as the text '1'.
    sick_num = sum(1 for label in sick_list if str(label) == '1')
    print(f"There are {sick_num} cases {disease} among {len(sick_list)} patients.")

There are 218 cases ACL among 3511 patients.
There are 17 cases PCL among 3511 patients.
There are 1015 cases Inner_Meniscus among 3511 patients.
There are 309 cases Outer_Meniscus among 3511 patients.


In [2]:
import pandas as pd

diseases_group = ['ACL','PCL','Inner_Meniscus','Outer_Meniscus']

sorted_csv = pd.read_csv("./ACL_Meniscus_ProjectName.csv")
# Every column after the two identifier columns is a disease column.
disease_names = sorted_csv.columns.tolist()[2:]
case_names = sorted_csv['StudyUID'].tolist()
projectNames = sorted_csv['projectName'].tolist()

def grouped_list(group_name):
    """Return one 0/1 flag per (StudyUID, projectName) row for ``group_name``.

    A flag is 1 when, among the rows of ``sorted_csv`` matching both the
    StudyUID and the project name at that position, at least one disease
    column whose name contains ``group_name`` holds a non-missing value;
    otherwise it is 0.
    """
    group_columns = [column for column in disease_names if group_name in column]
    flags = []
    for name, project in zip(case_names, projectNames):
        same_row = (sorted_csv['StudyUID'] == name) & (sorted_csv['projectName'] == project)
        rows = sorted_csv.loc[same_row, group_columns]
        flags.append(1 if rows.notna().any().any() else 0)
    return flags

# Identifier columns first, then one binary column per disease group.
dictionary = {'StudyUID': case_names, 'projectName': projectNames}
dictionary.update({group_name: grouped_list(group_name) for group_name in diseases_group})

# NOTE(review): the following cell reads ./ACL_Meniscus_ProjectName_grouped.csv,
# which only these disabled lines would write -- on a fresh run that file must
# already exist. Confirm whether they should be re-enabled.
# df = pd.DataFrame(dictionary)
# data_dir = f"./ACL_Meniscus_ProjectName_grouped.csv"
# df.to_csv(data_dir,index=False)

In [20]:
import numpy as np
import json

json_dir = "./pretty.json"
with open(json_dir,'r') as data_file:
    datas = json.load(data_file)

# Relies on `case_names` from the preceding cell; reduce it to unique, sorted values.
case_names = np.unique(np.array(case_names)).tolist()
print(len(case_names))
grouped_csv = pd.read_csv("./ACL_Meniscus_ProjectName_grouped.csv")
diseases_group = ['ACL','PCL','Inner_Meniscus','Outer_Meniscus']

def disease_detect(disease_num_list):
    """Return the names from ``diseases_group`` whose flag renders as ``"1"``.

    ``disease_num_list`` is expected to be ordered like ``diseases_group``.
    """
    return [
        diseases_group[i]
        for i, disease_num in enumerate(disease_num_list)
        if str(disease_num) == "1"
    ]

def patientID(name):
    """Look up the ``PatientID`` stored with the 3D markers of study ``name``.

    For every JSON record whose ``studyInstanceUID`` equals ``name``, the first
    final-annotation element with a non-empty ``'3d'`` list is inspected and
    the first non-``None`` ``PatientID`` among its markers is taken. A later
    matching record overwrites the value found in an earlier one.

    Returns:
        The PatientID found, or ``None`` when no matching record has a 3D marker.
    """
    # Fix: start from None so that a study without any 3D marker yields None
    # instead of raising UnboundLocalError at the return statement.
    PatientID = None
    for case in datas:
        StudyUID = case['studyInstanceUID']
        if str(StudyUID) == str(name):
            for disease in case['finalAnnotation']['elements']:
                if disease['3d'] != []:
                    for location in disease['3d']:
                        PatientID = location["PatientID"]
                        if PatientID is not None:
                            break
                    # Only the first element that has markers is inspected.
                    break

    return PatientID

# Studies that appear under exactly two projects with different group labels
# are recorded as two consecutive rows, one per project.
wrong_cases = []
for name in case_names:
    row_list = grouped_csv.loc[grouped_csv['StudyUID'] == name].values.tolist()
    if len(row_list) == 2:
        first_row, second_row = row_list
        # Columns 0 and 1 are StudyUID and projectName; the rest are the group flags.
        if first_row[2:] != second_row[2:]:
            dictionary_1 = {
                "StudyUID": first_row[0],
                'AccesionNumber': patientID(first_row[0]),
                'projectName': first_row[1],
                'diseases': disease_detect(first_row[2:]),
            }
            wrong_cases.append(dictionary_1)

            # The second row deliberately carries no StudyUID/AccesionNumber,
            # matching the original layout of the report.
            dictionary_2 = {
                'projectName': second_row[1],
                'diseases': disease_detect(second_row[2:]),
            }
            wrong_cases.append(dictionary_2)

    elif len(row_list) > 2:
        print("Error!")

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(wrong_cases)

# Give an unnamed index an empty name. This does not affect the CSV below,
# which is written with index=False.
if df.index.name is None:
    df.index.name = ''

# Write the DataFrame to a CSV file
df.to_csv('ProjectName_conflicts.csv', index=False)

3511


In [19]:
import pandas as pd

cvsDataframe = pd.read_csv('ProjectName_conflicts.csv')

# ExcelWriter.save() is deprecated (see the warning in this cell's recorded
# output) and is not available in newer pandas releases. Using the writer as a
# context manager saves and closes the workbook on exit.
with pd.ExcelWriter('ProjectName_conflicts.xlsx') as resultExcelFile:
    cvsDataframe.to_excel(resultExcelFile, index=False)

  resultExcelFile.save()


## Create a new ACL coordinates label

In [17]:
import pandas as pd

# ACL finding columns, in the order in which they are tried for each study.
ACL_list = ["ACL_Vord. Kreuzband-Konturunterbrechung","ACL_Vord. Kreuzband-Resorption","ACL_Vord. Kreuzband-Faserunterbrechung"]
location_new_csv = pd.read_csv("./whatever.csv")
case_names = location_new_csv["StudyUID"].tolist()

acl = []

for name in case_names:
    study_rows = location_new_csv.loc[location_new_csv['StudyUID'] == name]
    for disease in ACL_list:
        first_value = study_rows[disease].tolist()[0]
        # Only a string cell is accepted as a coordinate; the first ACL column
        # that provides one is used and the remaining columns are not tried.
        if isinstance(first_value, str):
            acl.append({"StudyUID": name, "ACL": first_value})
            break

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(acl)

# Give an unnamed index an empty name. This does not affect the CSV below,
# which is written with index=False.
if df.index.name is None:
    df.index.name = ''

# Write the DataFrame to a CSV file
df.to_csv('ACL.csv', index=False)