In [2]:
import torch
import pandas as pd
from pathlib import Path

feature_dir = Path('/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k')
dmtr = pd.read_csv('/mnt/c/Users/user/data/tables/dmtr.csv').set_index('id')

primary_slides = [file.stem for c in Path('/mnt/hpc/pathology/hipt_preprocessed/4096/primary/').iterdir() for file in (c / 'patches').iterdir()]
metastasis_slides = [file.stem for c in Path('/mnt/hpc/pathology/hipt_preprocessed/4096/metastasis/').iterdir() for file in (c / 'patches').iterdir()]

labels = pd.DataFrame(columns = ['patient','center', 'primary', 'metastasis'])

for path in feature_dir.iterdir():
    stem = path.stem

    if 'VU' in stem:
        patient = stem[:11]
        center = 'vumc'
    elif 'MAX' in stem:
        patient = stem[:7].replace('-','_')
        center = 'maxima'
    elif 'LU' in stem:
        patient = 'PREM' + stem.split('PREM')[1][:8].replace('-','_')
        center = 'lumc'
    elif 'MS' in stem:
        patient = stem[:11].replace('-','_')
        center = 'mst'
    elif 'AM' in stem:
        patient = stem[:11].replace('-','_')
        center = 'amphia'
    elif 'IS' in stem:
        patient = stem[:11].replace('-','_')
        center = 'isala'
    elif 'IM' in stem:
        patient = stem[:6]
        center = 'umcu'
    elif 'UNI' in stem:
        patient = stem[:7].replace('-','_')
        center = 'umcu'
    elif 'RA' in stem:
        patient = stem[:11].replace('-','_')
        center = 'radboud'
    elif 'M-' in stem[:2]:
        patient = stem[:5].replace('-','_')
        center = 'umcu'
    elif 'ZU' in stem:
        patient = stem[:11]
        center = 'zuyderland'
    else:
        patient = float('nan')
        center = float('nan')
        print(stem)

    primary = stem in primary_slides
    metastasis = stem in metastasis_slides
    
    labels.loc[path] = [patient, center, primary, metastasis]

labels = labels.join(on='patient', other=dmtr[['dcb','response','typbraf0n']])

labels.to_csv('/home/rens/repos/premium_pathology/hipt_feature_extractor/data/labels.csv')

  dmtr = pd.read_csv('/mnt/c/Users/user/data/tables/dmtr.csv').set_index('id')


PREMIUM-71c772-0177-I1_HE  1-001


In [56]:
import h5py
import numpy as np
from tqdm.notebook import tqdm

features = pd.DataFrame(columns=[f'feature{x}' for x in range(192)]+['patient','y'])

rows = []
for ix, row in tqdm(list(labels.iterrows())):
    f = torch.load(ix)
    f = pd.DataFrame(f, columns=[f'feature{x}' for x in range(192)])
    f['patient'] = row['patient']
    f['slide'] = ix.stem

    try:
        coords_file = Path('/mnt/hpc/pathology/hipt_preprocessed/4096') / ('primary' if row['primary'] else 'metastasis') / row['center'] / 'patches' / (ix.stem + '.h5')
        coords = h5py.File(coords_file, 'r')
   
        f['x'] = np.array(coords['coords'])[:,0]
        f['y'] = np.array(coords['coords'])[:,1]
    except:
        print(ix)

    rows.append(f)

features = pd.concat(rows)
features.to_csv('/home/rens/repos/premium_pathology/hipt_feature_extractor/data/features.csv')


  0%|          | 0/1603 [00:00<?, ?it/s]

/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-006_644aa81749 HE1.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-009 187b65d58c - 2022-08-24 14.53.24.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREMIUM-71c772-0177-I1_HE  1-001.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-008_1275902244.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-008_5545bf94cb HE1 - 2022-04-19 23.38.34.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-003_ea3d6ffcac HE1 - 2022-04-20 13.01.53.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-009 9368f7dfa0 - 2022-08-24 14.49.47.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-010_dcbe9c67a3 HE1 - 2022-04-19 22.29.52.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-002_1d5f7930a0 HE1 - 2022-04-19 21.09.42.pt
/mnt/hpc/pathology/hipt_features/primary_vs_metastasis_4k/PREM-RA-004_19727acf68 H

In [53]:
features = pd.read_csv('/home/rens/repos/premium_pathology/hipt_feature_extractor/data/features.csv')

In [57]:
features

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature186,feature187,feature188,feature189,feature190,feature191,patient,slide,x,y
0,-1.782632,-0.158250,-0.369399,-0.116985,-1.222638,-1.776469,0.068258,-1.643695,1.352754,0.386998,...,-0.978767,0.512478,1.122406,-1.256142,-0.522718,0.957125,PREM_ZU_038,PREM_ZU_038 - T20-020312-I-S-1 - 2021-08-12 15...,3520.0,78848.0
1,-1.966238,-0.053246,-0.267426,0.009680,-1.276277,-1.934873,-0.118801,-1.654184,1.173315,0.421328,...,-1.140093,0.401437,1.135581,-1.192348,-0.661202,1.081556,PREM_ZU_038,PREM_ZU_038 - T20-020312-I-S-1 - 2021-08-12 15...,11712.0,70656.0
2,-1.780648,-0.154268,-0.362752,-0.104681,-1.245291,-1.790625,0.058912,-1.645428,1.328295,0.384640,...,-0.961332,0.508690,1.115088,-1.244897,-0.537261,0.965399,PREM_ZU_038,PREM_ZU_038 - T20-020312-I-S-1 - 2021-08-12 15...,11712.0,78848.0
3,-1.736614,-0.183088,-0.407474,-0.137808,-1.245996,-1.794523,0.105284,-1.650582,1.367105,0.327138,...,-0.894833,0.554776,1.080624,-1.215041,-0.514905,0.948860,PREM_ZU_038,PREM_ZU_038 - T20-020312-I-S-1 - 2021-08-12 15...,7616.0,52736.0
4,-1.963327,-0.080931,-0.304451,-0.033353,-1.202755,-1.876474,-0.062850,-1.654203,1.256200,0.420029,...,-1.193285,0.428146,1.164850,-1.236299,-0.608697,1.048663,PREM_ZU_038,PREM_ZU_038 - T20-020312-I-S-1 - 2021-08-12 15...,7616.0,60928.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,0.590546,0.255692,-0.558055,0.643945,-0.211310,-0.189649,-0.861529,0.488620,-0.582933,-0.192452,...,-0.409748,-1.072126,-0.578867,1.148652,0.909383,-1.026821,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912.0,65536.0
170,0.590260,0.319732,-0.580952,0.700570,-0.214356,-0.221413,-0.876757,0.514377,-0.670679,-0.169213,...,-0.442924,-1.091711,-0.525240,1.177617,0.902549,-1.003896,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912.0,73728.0
171,0.640466,0.415242,-0.716460,1.208194,-0.696122,-0.255905,-0.653241,-0.186722,-0.438851,0.184572,...,-0.650333,-0.944467,-0.443142,0.343557,0.454298,-1.365006,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912.0,81920.0
172,-0.168883,0.590321,-0.970883,1.460829,-0.552437,-0.396976,-0.297263,-0.949092,-0.734780,1.041504,...,-0.808907,-0.682403,0.079630,-1.457723,-1.163502,-0.685839,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912.0,90112.0


In [58]:
labels['slide'] = [x.stem for x in labels.index]
features = features.join(labels.set_index('slide').primary, on='slide')

In [59]:
from sklearn.model_selection import GridSearchCV, cross_val_predict, GroupKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# model = GridSearchCV(
#     SVC(),
#     param_grid={
#         'C':[1e-3, 1e-2, 1e-1,1,1e2],
#         'gamma':[1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,10]
#     })

# model = SVC()
model = LogisticRegression()

subset = features.join(labels.set_index('patient')['center'], on=['patient']).dropna()
target = 'primary'

test_centers = ['maxima','lumc']
train = subset[~subset.center.isin(test_centers)]
test = subset[subset.center.isin(test_centers)]

X_train, y_train, groups_train = train[[f'feature{x}' for x in range(192)]], train[target].astype(int), train.patient
X_test, y_test, groups_test = test[[f'feature{x}' for x in range(192)]], test[target].astype(int), test[['patient']]

model.fit(X_train, y_train)

In [61]:
preds = model.predict_proba(X_test)[:,1]


In [71]:
test['preds'] = preds
results = test.groupby('slide').preds.mean().to_frame().join(test.groupby('slide').primary.first().apply(int))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['preds'] = preds


In [73]:
from sklearn.metrics import roc_auc_score

roc_auc_score(results.primary, results.preds)

0.5648351648351648

In [52]:
features

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature187,feature188,feature189,feature190,feature191,patient,slide,x,y,primary
0,0.575444,0.482876,-0.610929,0.853825,-0.224593,-0.294878,-0.945162,0.585770,-0.889877,-0.107681,...,-1.119141,-0.396098,1.241398,0.936726,-0.945530,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,83584,90752,False
1,0.580852,0.473584,-0.604184,0.840333,-0.211235,-0.279791,-0.930501,0.590164,-0.872756,-0.102793,...,-1.118067,-0.416145,1.224177,0.932839,-0.943325,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,91776,90752,False
2,0.585006,0.438790,-0.582861,0.790394,-0.190037,-0.258314,-0.895029,0.585276,-0.807099,-0.107308,...,-1.105240,-0.459669,1.179941,0.917000,-0.944862,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,99968,90752,False
3,0.565251,0.448850,-0.605347,0.819678,-0.230452,-0.290395,-0.938149,0.570257,-0.843239,-0.127804,...,-1.108811,-0.416589,1.228593,0.936024,-0.956154,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,62976,7680,False
4,0.566564,0.456792,-0.608476,0.830566,-0.230941,-0.291865,-0.943892,0.574238,-0.858807,-0.124366,...,-1.112293,-0.410615,1.234812,0.938465,-0.953637,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,71168,7680,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,0.590546,0.255692,-0.558055,0.643945,-0.211310,-0.189649,-0.861529,0.488620,-0.582933,-0.192452,...,-1.072126,-0.578867,1.148652,0.909383,-1.026821,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912,65536,False
170,0.590260,0.319732,-0.580952,0.700570,-0.214356,-0.221413,-0.876757,0.514377,-0.670679,-0.169213,...,-1.091711,-0.525240,1.177617,0.902549,-1.003896,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912,73728,False
171,0.640466,0.415242,-0.716460,1.208194,-0.696122,-0.255905,-0.653241,-0.186722,-0.438851,0.184572,...,-0.944467,-0.443142,0.343557,0.454298,-1.365006,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912,81920,False
172,-0.168883,0.590321,-0.970883,1.460829,-0.552437,-0.396976,-0.297263,-0.949092,-0.734780,1.041504,...,-0.682403,0.079630,-1.457723,-1.163502,-0.685839,PREM_AM_013,PREM-AM-013__ - 2021-11-04 11.16.34,206912,90112,False
