In [1]:
# Replicate a previous study that visualized the cell-type information for MIBI-TOF data
import os
import csv
import numpy as np
import pandas as pd


In [2]:
# Load the per-cell summary table (one row per segmented cell) from disk.
path_cell_summary=r"D:\MIBI-TOFF\Data_For_Amos\cleaned_expression_with_both_classification_prob_spatial_27_09_23.csv"
cell_sum = pd.read_csv(path_cell_summary)

# Positional indices of the columns to discard: column 0 (the "Unnamed: 0"
# running row number) and columns 2-27 (channels that are not among the
# proteins of interest). Column 1 (cell_size) is deliberately kept.
columns_drop = [0, *range(2, 28)]
unwanted_labels = cell_sum.columns[columns_drop]
filt_cell_sum = cell_sum.drop(columns=unwanted_labels)

# Preview of the *unfiltered* table, not of filt_cell_sum.
print(cell_sum.head())

   Unnamed: 0  cell_size     128Te     129Xe  12C     130Xe     131Xe  \
0           0       22.0  4.772296  5.298342  0.0  4.846394  5.040530   
1           1       64.0  5.875641  5.910126  0.0  4.636036  5.830791   
2           2       75.0  5.399690  6.469769  0.0  5.041314  6.284138   
3           3       59.0  5.930317  5.874748  0.0  4.265500  5.511256   
4           4      135.0  5.957570  5.918503  0.0  4.325043  5.804698   

      132Xe     137Ba     138Ba  ...  centroid_dif  num_concavities   fov  \
0  5.465389  2.219161  2.903433  ...      0.000000              0.0  FOV2   
1  6.095266  3.337921  3.337921  ...      0.010626              0.0  FOV2   
2  6.423250  3.179785  3.814529  ...      0.005771              0.0  FOV2   
3  5.999910  3.014948  2.612470  ...      0.015455              0.0  FOV2   
4  6.006177  2.479528  3.389912  ...      0.009059              0.0  FOV2   

            pred  pred_prob         class  score        spatial  Group  \
0    DC sign Mac   0.559

In [3]:
# Normalise the predicted cell-type labels to plain strings while keeping
# missing labels missing. A bare .astype(str) would turn NaN into the literal
# text 'nan', which a later dropna() could no longer remove.
pred_raw = filt_cell_sum['pred']
filt_cell_sum['pred'] = pred_raw.astype(str).where(pred_raw.notna())

# Only the 'pred' column is inspected here; the variable name is kept as-is
# in case later cells refer to it.
all_column_dtypes = filt_cell_sum['pred'].dtypes
print("Data type of the 'pred' column:")
print(all_column_dtypes)

# Every distinct cell-type label present in the data.
unique_values = filt_cell_sum['pred'].unique()
print(unique_values)

Data types of all columns:
object
['DC sign Mac' 'blood vessels' 'Unidentified' 'Collagen_sma' 'B cell'
 'CD4 APC' 'CD4 T cell' 'CD20_neg_B_cells' 'SMA' 'CD8 T cell' 'Mac'
 'Collagen' 'Memory_CD4_T_Cells' 'CD206_Mac' 'Neutrophil' 'NK cell'
 'Mono_CD14_DR' 'CD11_CD11c_DCsign_DCs' 'CD68_Mac' 'Hevs' 'CD4 Treg'
 'CD14_CD11c_DCs' 'DCs' 'Follicular_Germinal_B_Cell' 'Tfh' 'Immune'
 'tumor' 'CD3 only']


In [4]:
# Exclude cells predicted as 'Unidentified', 'blood vessels' or the generic
# 'Immune' label, then drop every row that still contains a missing value.
# A single membership mask and a single dropna() select exactly the same rows
# as three successive filters with a dropna() after each, without the extra
# full-frame copies.
excluded_labels = ['Unidentified', 'blood vessels', 'Immune']
keep_mask = ~filt_cell_sum['pred'].isin(excluded_labels)
filt_cell_sum = filt_cell_sum[keep_mask].dropna()
print(filt_cell_sum.shape)



(1562914, 64)


In [5]:
# Count how many cells were assigned to each predicted cell type ('pred').
from collections import Counter

value_array = filt_cell_sum['pred'].to_numpy()
print(value_array)

# Tally the labels, then order the tally from most to least frequent.
count_dict = Counter(value_array)
count_dict = dict(sorted(count_dict.items(), key=lambda item: item[1], reverse=True))

for string, count in count_dict.items():
    print(f'{string}: {count}')

total_count = sum(count_dict.values())
print(f'Total Count: {total_count}')

# Fraction of all remaining cells labelled 'tumor'.
# The value is interpolated into the f-string; previously a set literal was
# passed as a second print() argument, so the ratio was shown wrapped in
# braces. A missing 'tumor' key or an empty table no longer raises.
tumor_ratio = count_dict.get('tumor', 0) / total_count if total_count else float('nan')
print(f'Tumor Cell ratio: {tumor_ratio}')





['DC sign Mac' 'Collagen_sma' 'B cell' ... 'tumor' 'tumor' 'tumor']
CD4 T cell: 491864
B cell: 319888
Memory_CD4_T_Cells: 163469
CD8 T cell: 156633
Follicular_Germinal_B_Cell: 94592
tumor: 40051
Hevs: 39939
CD20_neg_B_cells: 36075
DC sign Mac: 28117
CD4 APC: 25002
Mono_CD14_DR: 24441
Collagen_sma: 21914
CD206_Mac: 21727
CD4 Treg: 15799
SMA: 14757
Collagen: 13349
Neutrophil: 12308
Tfh: 12198
DCs: 9086
CD14_CD11c_DCs: 5995
NK cell: 5619
CD68_Mac: 4150
Mac: 3538
CD11_CD11c_DCsign_DCs: 1737
CD3 only: 666
Total Count: 1562914
Tumor Cell ratio: {0.025625850174737702}


In [6]:
# Verify the tumor count against the 'class' column, which should agree with
# the number of cells whose 'pred' label is 'tumor'.
# Counter comes from the import in the preceding counting cell.
value_array = filt_cell_sum['class'].to_numpy()
print(value_array)

# Tally the class labels, then order the tally from most to least frequent.
count_dict = Counter(value_array)
count_dict = dict(sorted(count_dict.items(), key=lambda item: item[1], reverse=True))

for string, count in count_dict.items():
    print(f'{string}: {count}')

total_count = sum(count_dict.values())
print(f'Total Count: {total_count}')

# Fraction of all remaining cells whose class is 'tumor'.
# The value is interpolated into the f-string; previously a set literal was
# passed as a second print() argument, so the ratio was shown wrapped in
# braces. A missing 'tumor' key or an empty table no longer raises.
tumor_ratio = count_dict.get('tumor', 0) / total_count if total_count else float('nan')
print(f'Tumor Cell ratio: {tumor_ratio}')


['immune' 'immune' 'immune' ... 'tumor' 'tumor' 'tumor']
immune: 1522863
tumor: 40051
Total Count: 1562914
Tumor Cell ratio: {0.025625850174737702}


In [7]:
# Separate the predicted cell-type labels from the candidate feature columns.
y = filt_cell_sum['pred']  # Labels
X = filt_cell_sum.drop(columns='pred')  # Features
print(X.head())

# Keep only the first 40 columns of X; everything from position 40 onwards is
# discarded. The cut-off of 40 was read off the spreadsheet after the unused
# columns had been removed, so it depends on the column order of this export.
X_columns_drop = list(range(40, X.shape[1]))
X = X.drop(columns=X.columns[X_columns_drop])
print(X.head())


   cell_size  Alexa Fluor 488  Bax  CCR7  CD11c  CD14  CD163      CD20  CD206  \
0       22.0              0.0  0.0   0.0    0.0   0.0    0.0  0.000000    0.0   
3       59.0              0.0  0.0   0.0    0.0   0.0    0.0  0.000000    0.0   
4      135.0              0.0  0.0   0.0    0.0   0.0    0.0  3.689504    0.0   
5      139.0              0.0  0.0   0.0    0.0   0.0    0.0  0.000000    0.0   
6      122.0              0.0  0.0   0.0    0.0   0.0    0.0  0.000000    0.0   

   CD21  ...  convex_hull_resid  centroid_dif  num_concavities   fov  \
0   0.0  ...           0.000000      0.000000              0.0  FOV2   
3   0.0  ...           0.032787      0.015455              0.0  FOV2   
4   0.0  ...           0.028777      0.009059              0.0  FOV2   
5   0.0  ...           0.034722      0.007883              0.0  FOV2   
6   0.0  ...           0.008130      0.004532              0.0  FOV2   

   pred_prob   class  score        spatial  Group  patient number  
0   0.559885

In [9]:
#Simple PCA analysis clearly to many variables and cell types to get a meaningful results. 
%matplotlib widget
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)


df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
df_pca['pred'] = y

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

cmap = plt.get_cmap('viridis') 

unique_categories = df_pca['pred'].unique()
colors = [cmap(i / len(unique_categories)) for i in range(len(unique_categories))]

for category, color in zip(unique_categories, colors):
    indices = df_pca['pred'] == category
    ax.scatter(
        df_pca.loc[indices, 'PC1'],
        df_pca.loc[indices, 'PC2'],
        df_pca.loc[indices, 'PC3'],
        c=[color],
        label=category
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.legend()

plt.title('3D Scatter Plot with PCA')
plt.show()

<IPython.core.display.Javascript object>