# Generating Image Matrices and IS

In [1]:
"""
Generates a dictionary with key = Patient_ID and value = another dictionary whose keys are
        the names of the matrices and whose values are the numbers associated with each tile according to its coordinates.
Each matrix is saved to a text file (written with numpy.savetxt), accessible for possible vectorization and visualization.

 Needs requirements.txt
 pip install -r requirements.txt
 The paths are set in the script; if needed, change the corresponding lines in the script
 To access the matrices after running the script with: python IS_StageMDT_Generating_images_matrix.py
 import IS_StageMDT_Generating_images_matrix
 IS_StageMDT_Generating_images_matrix.load_patients("cache/cd3")
"""

import os
import glob
import math
from collections import defaultdict


import numpy
import pandas
import PIL.Image

# A tile is 78 pixels wide but the delimitation of the tile is 2 pixels thick,
# therefore 78.5 was chosen as the pixel size of a tile
TILESIZE = 78.5

# Currently the images are in Densitymap_new; change this path when needed
IMGPATH = os.path.join('..', 'HEGP', 'Densitymap_new')
IMGPATH_CD8 = os.path.join(IMGPATH, "cd8")
IMGPATH_CD3 = os.path.join(IMGPATH, "cd3")
DATA_CD3 = "../HEGP/CD3 Tile statistics final"
DATA_CD8 = "../HEGP/CD8 Tile statistics final"

# The patient files have all already been renamed;
# when needed, rename a file to OfficialID.anapathID

### DATA GENERATION ##########

# For tumor and invasive front, 0 means no and 1 means yes.
# If the invasive front is a "no", then the number of cells in the invasive front is NaN
# and not 0.
# If all values in the matrix are NaN, the tiles do not exist or were rejected.

def get_tile_dimensions(image):
    """Return the size of *image* expressed in tiles as ``(height, width)``.

    Each pixel extent is divided by ``TILESIZE`` and rounded up, so a tile
    that is only partially covered by the image still counts as one tile.
    *image* only needs ``height`` and ``width`` attributes (in pixels).
    """
    def tiles_along(pixels):
        # Round up: a fractional tile at the border is kept as a full tile
        return int(math.ceil(pixels / TILESIZE))

    return tiles_along(image.height), tiles_along(image.width)


def get_patient_name(filename):
    """Return the patient ID: the first six characters of the file's base name."""
    _directory, base_name = os.path.split(filename)
    return base_name[:6]


# Divisor applied to ratio * area when filling the 'Density_*_images' matrices.
# NOTE(review): presumably a reference area in um2 -- confirm where this value comes from.
_DENSITY_DIVISOR = 512251.79746

# One entry per tissue region, in the order the regions are processed for a tile:
# (presence column, label used in the rejection message, cell-count column,
#  ratio column, area column, cell-area column, density matrix name).
# The 'Ivasive' spelling is the one used by the column names read from the CSV.
_REGIONS = (
    ('Tumor_present', 'T',
     'Number_Cells_Tumor', 'Ratio_Cells_per_Tumor',
     'Area_Tumor_um2', 'Area_Cells_Tumor_um2',
     'Density_Tumor_images'),
    ('IM_present', 'IM',
     'Number_Cells_InvasiveFront', 'Ratio_Cells_per_InvasiveFront',
     'Area_IvasiveFront_um2', 'Area_Cells_IvasiveFront_um2',
     'Density_InvasiveFront_images'),
)


def _is_number(value):
    """Return True when float(value) succeeds, False on TypeError/ValueError."""
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True


def get_patients_matrices(img_path, data_path):
    """Build, for every ``*.jpg`` image in *img_path*, the per-tile matrices of one patient.

    For each image the patient ID is taken from the file name, the first file
    matching ``<patient_id>.*`` in *data_path* is read as a ';'-separated CSV,
    and every row of that CSV is written into matrices indexed by tile
    position (tile number // width, tile number % width).

    Parameters:
        img_path: directory containing the ``*.jpg`` images.
        data_path: directory containing the per-patient tile statistics files.

    Returns:
        dict mapping patient ID -> defaultdict mapping matrix name -> float
        array of shape (tiles high, tiles wide). Cells that were never
        assigned hold NaN; a missing matrix name yields an all-NaN matrix of
        that patient's shape.

    Raises:
        FileNotFoundError: when no statistics file matches a patient ID.
    """
    patients = {}

    for filename in glob.iglob(os.path.join(img_path, '*.jpg')):

        # Only the image size is needed; the context manager closes the file
        # instead of leaving one open handle per patient.
        with PIL.Image.open(filename) as img:
            img_height, img_width = get_tile_dimensions(img)

        patient_id = get_patient_name(filename)
        print(patient_id, "({}x{} tiles)".format(img_height, img_width))

        # Locate the statistics file; fail with an explicit error rather than
        # the bare StopIteration a plain next() would raise.
        data_pattern = os.path.join(data_path, "{}.*".format(patient_id))
        data_filename = next(glob.iglob(data_pattern), None)
        if data_filename is None:
            raise FileNotFoundError(
                "No tile statistics file matching {!r}".format(data_pattern)
            )
        data = pandas.read_csv(data_filename, sep=';')

        # The shape is bound as a default argument so that each patient's
        # factory keeps its own dimensions; a plain closure would read the
        # loop variables late and use the dimensions of a later image.
        patients[patient_id] = patient_dict = defaultdict(
            lambda shape=(img_height, img_width): numpy.full(shape, numpy.nan)
        )

        for _, row in data.iterrows():

            # 'Tile Name' carries the tile number after a 4-character prefix
            tile_id = int(row['Tile Name'][4:])
            pos = divmod(tile_id, img_width)

            # A non-numeric total count means both regions were rejected
            if not _is_number(row['Number_Cells_All']):
                print('Both Tile {} rejected\n'.format(tile_id))
                continue

            # A tile can be both tumor and invasive front; each region is
            # accepted or rejected independently of the other.
            for (present_col, label, cells_col, ratio_col,
                 area_col, cell_area_col, density_key) in _REGIONS:

                if row[present_col] != 'yes':
                    patient_dict[present_col][pos] = 0
                    continue

                patient_dict[present_col][pos] = 1

                # Region rejected: leave its values at NaN
                if not (_is_number(row[cells_col]) and _is_number(row[ratio_col])):
                    print(False, '{} Tile {} rejected\n'.format(label, tile_id))
                    continue

                patient_dict[cells_col][pos] = row[cells_col]
                patient_dict['Ratio_Cells_per_All_Area'][pos] = row['Ratio_Cells_per_All_Area']
                patient_dict[ratio_col][pos] = row[ratio_col]
                patient_dict[area_col][pos] = row[area_col]
                patient_dict[cell_area_col][pos] = row[cell_area_col]
                patient_dict[density_key][pos] = (
                    patient_dict[ratio_col][pos] * patient_dict[area_col][pos] / _DENSITY_DIVISOR
                )

            # NaN is truthy, so "missing" must be tested with isnan: a plain
            # truth test would treat a genuine count of 0 as missing and a
            # rejected (NaN) count as present.
            tumor_missing = numpy.isnan(patient_dict['Number_Cells_Tumor'][pos])
            front_missing = numpy.isnan(patient_dict['Number_Cells_InvasiveFront'][pos])

            # Region identified but rejected: the presence flag becomes NaN
            if patient_dict['Tumor_present'][pos] == 1 and tumor_missing:
                patient_dict['Tumor_present'][pos] = numpy.nan
            if patient_dict['IM_present'][pos] == 1 and front_missing:
                patient_dict['IM_present'][pos] = numpy.nan

            # Whole-tile values are stored only when at least one region has data
            if not (tumor_missing and front_missing):
                patient_dict['Area_All_um2'][pos] = row['Area_All_um2']
                patient_dict['Area_Cells_All_um2'][pos] = row['Area_Cells_All_um2']
                patient_dict['Surface_of_stained_cells_um2'][pos] = row['Surface_of_stained_cells_um2']

    return patients

#### DATA SERIALIZATION

def save_patients(patients, cachedir):
    """Write every matrix of every patient to ``<cachedir>/<patient_id>/<matrix name>``.

    Parameters:
        patients: dict mapping patient ID -> dict mapping matrix name -> array.
        cachedir: root output directory; created (with parents) when missing,
            even if *patients* is empty.

    Files are written with ``numpy.savetxt`` and existing files are overwritten.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(cachedir, exist_ok=True)
    for patient_id, patient in patients.items():
        patient_dir = os.path.join(cachedir, patient_id)
        os.makedirs(patient_dir, exist_ok=True)
        for arg, matrix in patient.items():
            numpy.savetxt(os.path.join(patient_dir, arg), matrix)



def load_patients(cachedir):
    """Read back a cache directory into ``{patient_id: {matrix name: array}}``.

    Every entry of *cachedir* is treated as a patient directory and every
    entry inside it as a matrix file parsed with ``numpy.genfromtxt``.
    """
    def read_patient(patient_dir):
        # One array per file, keyed by the file name
        return {
            matrix_name: numpy.genfromtxt(os.path.join(patient_dir, matrix_name))
            for matrix_name in os.listdir(patient_dir)
        }

    return {
        patient_id: read_patient(os.path.join(cachedir, patient_id))
        for patient_id in os.listdir(cachedir)
    }



if __name__ == "__main__":
    # Everything stays under the guard: importing this module (for example to
    # call load_patients) must neither rebuild the matrices nor fail on the
    # PATIENTS_* names, which only exist when the script itself is run.
    PATIENTS_CD3 = get_patients_matrices(IMGPATH_CD3, DATA_CD3)
    PATIENTS_CD8 = get_patients_matrices(IMGPATH_CD8, DATA_CD8)

    # Drop the patients for which either the CD3 or the CD8 image is missing.
    # The dictionaries are generated only for images found in the folders, so
    # no separate check for the CSV data is made here.
    for patient_id in list(PATIENTS_CD3):
        if patient_id not in PATIENTS_CD8:
            del PATIENTS_CD3[patient_id]
    for patient_id in list(PATIENTS_CD8):
        if patient_id not in PATIENTS_CD3:
            del PATIENTS_CD8[patient_id]

    save_patients(PATIENTS_CD3, os.path.join('../Data_output',"cache_HEGP", "cd3"))
    save_patients(PATIENTS_CD8, os.path.join('../Data_output',"cache_HEGP", "cd8"))



PAR170 (21x26 tiles)
PAR171 (23x24 tiles)
PAR172 (16x35 tiles)
PAR173 (14x25 tiles)
PAR174 (10x25 tiles)
PAR175 (19x23 tiles)
PAR176 (24x20 tiles)
PAR177 (20x25 tiles)
PAR178 (10x8 tiles)
PAR179 (19x29 tiles)
PAR180 (20x35 tiles)
PAR181 (18x9 tiles)
PAR182 (18x24 tiles)
PAR183 (17x11 tiles)
PAR184 (12x28 tiles)
PAR185 (17x30 tiles)
PAR186 (22x35 tiles)
PAR187 (22x38 tiles)
PAR188 (16x29 tiles)
PAR189 (14x25 tiles)
PAR190 (15x15 tiles)
PAR191 (28x34 tiles)
Both Tile 323 rejected

Both Tile 324 rejected

Both Tile 358 rejected

Both Tile 359 rejected

PAR192 (9x16 tiles)
PAR193 (31x39 tiles)
PAR194 (20x32 tiles)
PAR195 (30x31 tiles)
PAR196 (20x30 tiles)
PAR197 (17x31 tiles)
PAR198 (29x19 tiles)
PAR199 (22x41 tiles)
PAR200 (11x18 tiles)
PAR201 (24x23 tiles)
PAR202 (17x35 tiles)
PAR203 (28x14 tiles)
PAR204 (16x32 tiles)
PAR205 (27x34 tiles)
PAR206 (18x32 tiles)
PAR207 (23x17 tiles)
PAR208 (26x29 tiles)
PAR209 (17x24 tiles)
PAR210 (18x16 tiles)
PAR211 (25x25 tiles)
PAR212 (17x31 tiles)
PAR2

PAR225 (20x29 tiles)
PAR226 (17x21 tiles)
PAR227 (17x22 tiles)
PAR228 (24x20 tiles)
PAR229 (12x33 tiles)
PAR231 (31x19 tiles)
False IM Tile 203 rejected

PAR232 (17x16 tiles)
PAR233 (19x34 tiles)
PAR234 (20x16 tiles)
PAR235 (6x5 tiles)
PAR236 (20x22 tiles)
PAR238 (16x22 tiles)
PAR239 (23x22 tiles)
PAR240 (20x31 tiles)
PAR241 (19x34 tiles)
PAR242 (13x18 tiles)
PAR243 (20x14 tiles)
False T Tile 162 rejected

PAR244 (13x21 tiles)
False T Tile 97 rejected

PAR245 (24x28 tiles)
PAR246 (18x23 tiles)
PAR247 (28x28 tiles)
PAR248 (19x15 tiles)
PAR249 (18x28 tiles)
False T Tile 392 rejected

PAR250 (20x29 tiles)
PAR251 (15x27 tiles)
PAR252 (10x15 tiles)
PAR253 (21x21 tiles)
PAR255 (13x28 tiles)
PAR256 (16x26 tiles)
PAR257 (11x30 tiles)
PAR258 (20x28 tiles)
PAR259 (14x18 tiles)
PAR260 (17x22 tiles)
PAR261 (15x30 tiles)
PAR262 (21x30 tiles)
PAR263 (21x32 tiles)
PAR264 (16x28 tiles)
PAR265 (21x26 tiles)
PAR266 (19x19 tiles)
PAR267 (23x38 tiles)
PAR268 (19x34 tiles)
PAR269 (21x34 tiles)
PAR270 (12x3

## Generating the Immunoscore from the matrices
Also gets the original Immunoscore.
Uses the cutoff file.

In [2]:
# File with the cutoffs: tab-separated, one 'score' column and one
# '<marker><region>-mean' cutoff column per marker/region pair.
Data_IS_original=pandas.read_csv(os.path.join('..','Immunoscore_CutOFF.csv'),sep='\t')
Cutoff_Mean_CD3CT=list(Data_IS_original['CD3CT-mean'])
Cutoff_Mean_CD3IM=list(Data_IS_original['CD3IM-mean'])
Cutoff_Mean_CD8CT=list(Data_IS_original['CD8CT-mean'])
Cutoff_Mean_CD8IM=list(Data_IS_original['CD8IM-mean'])
score_cutoff=list(Data_IS_original['score'])
print("{}\n".format(score_cutoff))


def _pooled_density(matrices, cells_key, area_key):
    """Return sum of cell counts / sum of areas over all tiles, times 1e6.

    NaN tiles are ignored by both sums. When the summed area is 0 the result
    is NaN (0/0) or inf, as produced by the numpy division.
    """
    return ((numpy.nansum(matrices[cells_key]))/(numpy.nansum(matrices[area_key])))*(1e6)


def _score_from_cutoffs(density, cutoffs, scores):
    """Map *density* to a score using the cutoff table.

    The cutoffs are scanned in file order; at the first cutoff that is
    >= density, the score of the previous row is returned.

    * density at or below the first cutoff -> first score (indexing the
      "previous" row with -1 would wrap around to the highest score);
    * density above every cutoff -> last score;
    * NaN density -> NaN, so that an undefined density is not silently given
      the highest score.
    """
    if numpy.isnan(density):
        return numpy.nan
    for i, cutoff in enumerate(cutoffs):
        if density <= cutoff:
            return scores[max(i - 1, 0)]
    # 0 mirrors the "no score assigned" value when the table is empty
    return scores[-1] if scores else 0


# For each patient_id, write the pooled tumor and invasive-front densities per
# marker, the four scores, their mean and the resulting class to a CSV with header
with open('../Data_output/HEGP_Immunoscore_mean.csv', 'w') as f:

    f.write('OfficialID,meanCD3_Tumor_A,meanCD3_IM_A,meanCD8_Tumor_A,meanCD8_IM_A,score_cd3ct,score_cd3im,score_cd8ct,score_cd8im,score_mean,IS_MDT\n')

    for patient_id in PATIENTS_CD3:

        # Patients without both markers are skipped
        try:
            cd3_data = PATIENTS_CD3[patient_id]
            cd8_data = PATIENTS_CD8[patient_id]
        except KeyError:
            continue

        # Pooled density: sum of cell counts / sum of surface.
        # (The per-tile 'Density_*_images' matrices, i.e. the weighted ratio,
        # are not used for the score.)
        mean_tumor_cd3_A = _pooled_density(cd3_data, 'Number_Cells_Tumor', 'Area_Tumor_um2')
        mean_tumor_cd8_A = _pooled_density(cd8_data, 'Number_Cells_Tumor', 'Area_Tumor_um2')
        mean_IM_cd3_A = _pooled_density(cd3_data, 'Number_Cells_InvasiveFront', 'Area_IvasiveFront_um2')
        mean_IM_cd8_A = _pooled_density(cd8_data, 'Number_Cells_InvasiveFront', 'Area_IvasiveFront_um2')

        score_cd3ct = _score_from_cutoffs(mean_tumor_cd3_A, Cutoff_Mean_CD3CT, score_cutoff)
        score_cd8ct = _score_from_cutoffs(mean_tumor_cd8_A, Cutoff_Mean_CD8CT, score_cutoff)
        score_cd3im = _score_from_cutoffs(mean_IM_cd3_A, Cutoff_Mean_CD3IM, score_cutoff)
        score_cd8im = _score_from_cutoffs(mean_IM_cd8_A, Cutoff_Mean_CD8IM, score_cutoff)

        score_mean=(score_cd8ct+score_cd8im+score_cd3im+score_cd3ct)/4

        # Assign the Immunoscore class; a NaN mean fails every comparison
        # and falls through to 'NaN'
        if score_mean<=25:
            IS_MDT=0
        elif score_mean<=70:
            IS_MDT=1
        elif score_mean>70:
            IS_MDT=2
        else:
            IS_MDT='NaN'

        f.write('{},{},{},{},{},{},{},{},{},{},{}\n'.format(patient_id, mean_tumor_cd3_A,mean_IM_cd3_A, mean_tumor_cd8_A,mean_IM_cd8_A,score_cd3ct,score_cd3im,score_cd8ct,score_cd8im,score_mean,IS_MDT))


[2.5, 7.5, 12.5, 17.5, 22.5, 27.5, 32.5, 37.5, 42.5, 47.5, 52.5, 57.5, 62.5, 67.5, 72.5, 77.5, 82.5, 87.5, 92.5, 97.5]

