# Prediction of tumor immune cell infiltration based on extracellular matrix organization

In [89]:
# Import modules

#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Local modules
import auxiliary
import plots
import ExtractMap

## Lecture du jeu de données

On s'intéresse ici à des descripteurs qui sont extraits à partir de différentes positions au sein d'images de coupe histologique de tumeurs.

In [2]:
filepath_wt = "../data/WTconcatenate.csv.gz"
filepath_ki = "../data/KIconcatenate.csv.gz"

df_wt = auxiliary.read_dataframe(filepath_wt, low_memory=False)
df_ki = auxiliary.read_dataframe(filepath_ki, low_memory=False)

### Conversion du type des colonnes

On attribut à chaque colonne un type spécifique, pour les entier nous les convertissons en type int32 ou uint32 pour les variables strictements positives (ici, X et Y).

Les variables à valeurs flottantes sont convertis en float64. Et les variables informant sur la condition (WT/CD64-hDTR), le fichier associé ainsi que le type cellulaire étudié (Ly6/CD3) sont convertis en np.object.

Cela est pratique afin de clarifier les types associés aux variables, et pour la gestion de l'espace mémoire.

In [3]:
# Assigned columns to types
str_columns = ["Condition", "FileName", "Type"]
integer_columns = ["Mask"]
nonsigned_columns = ["X", "Y"]
float_columns = [
    "Angle100", "Coherency100", "Energy100", "MeanInt100", "VarInt100", "Density100", "VarDensity100", "OrientationRef100",
    "Angle140", "Coherency140", "Energy140", "MeanInt140", "VarInt140", "Density140", "VarDensity140", "OrientationRef140",
    "Angle20", "Coherency20", "Energy20", "MeanInt20", "VarInt20", "Density20", "VarDensity20", "OrientationRef20",
    "Angle60", "Coherency60", "Energy60", "MeanInt60", "VarInt60", "Density60", "VarDensity60", "OrientationRef60",
    "Dist", "MinDist", "MedDist", "CellArea", "CellEcc",
    "Cells100um", "MinDist100um", "MedDist100um", "CellArea100um", "CellEcc100um",
    "Frac", "Cells"
]

# Associate a type to each columns
data_type = {
    **dict.fromkeys(str_columns, object),
    **dict.fromkeys(nonsigned_columns, np.uint32),
    **dict.fromkeys(float_columns, np.float64),
    **dict.fromkeys(integer_columns, np.int32)
}

df_wt = df_wt.astype(data_type)
df_ki = df_ki.astype(data_type)

# Fusion des deux jeu de données
df_all = pd.concat([df_wt, df_ki])
print(f"Nombre de lignes au sein du jeu de données total: {df_all.shape[0]}")
print(f"Nombre de colonnes au sein du jeu de données total: {df_all.shape[1]}")

Nombre de lignes au sein du jeu de données total: 6697691
Nombre de colonnes au sein du jeu de données total: 50


## Lymphocytes T

Dans un premier temps, nous nous intéressons seulement au lymphocytes T dans l'étude de la corrélation des descripteurs (forme de la matrice, distribution) avec leur présence.

In [80]:
# Keep only T cells
df_all_cd3 = df_all[df_all["Type"] == "CD3"]
dt_byfile = df_all_cd3[["Condition", "FileName"]]\
    .groupby(["FileName"]).value_counts().to_frame()\
    .reset_index(level=1).sort_values(by="Condition", ascending=False)
dt_byfile["Name"] = [filename[2:filename.find(".tif")] for filename in dt_byfile.index]
dt_byfile

In [139]:
# View cell with density
m = ExtractMap.ExtractMap(df_all_cd3, "Density20", choose="./MAX_12b_187_CD3eFITC.tif_SHG.tif")
plt.imshow(m)
plt.show()

Unnamed: 0_level_0,Condition,count,Name
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
./MAX_12b_187_CD3eFITC.tif_SHG.tif,WT,162252,MAX_12b_187_CD3eFITC
./FWT_507_down_CD3FITC.tif_max.tif_SHG.tif,WT,76503,FWT_507_down_CD3FITC
./MAX_12b_189_CD3eFITC.tif_SHG.tif,WT,153919,MAX_12b_189_CD3eFITC
./MAX_12b_184_CD3eFITC.tif_SHG.tif,WT,130062,MAX_12b_184_CD3eFITC
./MAX_12b_FWT331_CD3_SHG.tif,WT,152904,MAX_12b_FWT331_CD3_SHG
./MAX_12b_FWT_333_CD3_SHG.tif,WT,105724,MAX_12b_FWT_333_CD3_SHG
./FWT_928_right_CD3FITC.tif_max.tif_SHG.tif,WT,105086,FWT_928_right_CD3FITC
./FWT_861_big_CD3FITC.tif_max.tif_SHG.tif,WT,102595,FWT_861_big_CD3FITC
./FWT_511_CD3FITC_F480_AF647_Phal546.tif_max.tif_SHG.tif,WT,62151,FWT_511_CD3FITC_F480_AF647_Phal546
./MAX_MWT738_CD3_SHG.tif,WT,124915,MAX_MWT738_CD3_SHG


In [105]:
table_cond_mask = pd.crosstab(df_all_cd3["Condition"], df_all_cd3["Mask"]).sort_index(ascending=False)
table_byfile_condition = dt_byfile["Condition"].value_counts()
table_mask = df_all_cd3['Mask'].value_counts()

# Print
print("Lymphocytes T")
print(f"Nombre de lignes au sein du jeu de données : {df_all_cd3.shape[0]}")
print(f"Nombre de colonnes au sein du jeu de données : {df_all_cd3.shape[1]}")
#
print()
print(f"Nombre d'images uniques : {dt_byfile.shape[0]}")
print(f"Nombre de tissus WT : {table_byfile_condition['WT']}\n"
      f"Nombre de tissus CD64-hDTR : {table_byfile_condition['CD64-hDTR']}")
# 
print()
print(f"Nombre d'observations en dehors de la tumeur (Mask=0): {table_mask[0]}")
print(f"Nombre d'observations au sein de la tumeur (Mask=1): {table_mask[1]}")
#
print(f"\nColonnes du jeu de données :\n{df_all_cd3.columns.values}")

Lymphocytes T
Nombre de lignes au sein du jeu de données : 3896365
Nombre de colonnes au sein du jeu de données : 50

Nombre d'images uniques : 34
Nombre de tissus WT : 14
Nombre de tissus CD64-hDTR : 20

Nombre d'observations en dehors de la tumeur (Mask=0): 2028501
Nombre d'observations au sein de la tumeur (Mask=1): 1867864
Mask             0        1
Condition                  
WT          851811   770766
CD64-hDTR  1176690  1097098

Colonnes du jeu de données :
['Condition' 'FileName' 'X' 'Y' 'Coherency100' 'Energy100' 'MeanInt100'
 'VarInt100' 'Density100' 'VarDensity100' 'Coherency140' 'Energy140'
 'MeanInt140' 'VarInt140' 'Density140' 'VarDensity140' 'Coherency20'
 'Energy20' 'MeanInt20' 'VarInt20' 'Density20' 'VarDensity20'
 'Coherency60' 'Energy60' 'MeanInt60' 'VarInt60' 'Density60'
 'VarDensity60' 'OrientationRef20' 'OrientationRef60' 'OrientationRef100'
 'OrientationRef140' 'Dist' 'Angle20' 'Angle60' 'Angle100' 'Angle140'
 'Mask' 'Type' 'Cells' 'MinDist' 'MedDist' 'CellArea

In [106]:
# 
table_cond_mask.plot(kind="bar")
plt.title("Jeu de données CD3\n"
          "Nombre d'observation en fonction de Condition et Mask")
plt.show()
print(table_cond_mask)

In [81]:
# Nombre d'observations par image
sns.barplot(data=dt_byfile, x="Name", y="count", hue="Condition")
plt.title("Jeu de données CD3\n"
          "Nombre d'observations au sein des tissus WT/CD64-hDTR"
          )
plt.xticks(rotation=90)
plt.ylabel("Nombre d'observations")
plt.show()

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


In [79]:
# Distribution du nombre d'observations par image
sns.boxplot(data=dt_byfile, x="Condition", y="count", hue="Condition")
plt.title("Jeu de données CD3\n"
          "Distribution du nombre d'observations\n"
          "au sein des tissus WT/CD64-hDTR")
plt.xticks(rotation=90)
plt.ylabel("Nombre d'observations")
plt.show()

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


In [91]:
# Statistics
stats.ttest_ind(
    dt_byfile[dt_byfile["Condition"] == "WT"]["count"],
    dt_byfile[dt_byfile["Condition"] == "CD64-hDTR"]["count"],
    equal_var = False
)  # No differences between number of value distribution in the two condition

TtestResult(statistic=0.19013370919743838, pvalue=0.8504487125901021, df=30.880958038289354)

In [155]:
plots.single_plot(df_all_cd3["Density20"], title="Distribution", bins=50, mode="hist")
plt.show()

## Comparaison des distribution entre WT et CD64-hDTR

On s'intéresse ici à la distribution des différents descripteurs selon la Condition étudiée, ici si le tissu est de phénotype WT (avec macrophage) et lorsque le tissu est de phénotype CD64-hDTR (déplété en macrophage).

In [161]:
pixel_size = 20
descriptors = ["Angle", "Coherency", "Energy", "MeanInt", "VarInt", "Density", "VarDensity", "OrientationRef"]
descriptors = [i+str(pixel_size) for i in descriptors]
fig, ax = plt.subplots(2, 4, figsize=(15, 10))
fig.suptitle("Jeu de données CD3\n"
             "Distributions des différents descripteurs")

df_tmp = df_all_cd3[df_all_cd3["Density20"] > 0]
for i, des in enumerate(descriptors):
    idx0, idx1 = i//4, i%4
    actual_axis = ax[idx0][idx1]
    actual_axis.hist(df_tmp[df_tmp["Condition"] == "WT"][des], label="WT", bins=50, alpha=0.5)
    actual_axis.hist(df_tmp[df_tmp["Condition"] == "CD64-hDTR"][des], label="CD64-hDTR", bins=50, alpha=0.4)
    actual_axis.set_title(des)
    actual_axis.legend()
plt.show()

In [175]:
fig, ax = plt.subplots(2, 4, figsize=(15, 10))
fig.suptitle("Jeu de données CD3\n"
             "Distributions des différents descripteurs")

for i, des in enumerate(descriptors):
    idx0, idx1 = i//4, i%4
    actual_axis = ax[idx0][idx1]
    actual_axis.boxplot([df_tmp[df_tmp["Condition"] == "WT"][des], df_tmp[df_tmp["Condition"] == "CD64-hDTR"][des]])
    actual_axis.set_title(des)
    actual_axis.legend()
plt.show()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that 

In [183]:
plots.dual_plot(
    df_all_cd3.index,
    df_all_cd3[df_all_cd3["Condition"] == "WT"]["OrientationRef20"],
    df_all_cd3[df_all_cd3["Condition"] == "CD64-hDTR"]["OrientationRef20"]
)
plt.show()

ValueError: operands could not be broadcast together with shapes (1622577,) (2273788,) 

In [None]:
Cells100um