# ANOVA
Process Image.csv files, which have one measurement per WSI patch.

Sources of variance (ignoring the WSI level):
* Total = between patches = sum over patches (patch deviation from global mean, sq)
* Between class = weighted sum over classes (class mean deviation from global mean, sq)
* Within class = sum over classes (sum patch deviation from class mean, sq)
* Between patient = weighted sum over patients (patient mean deviation from class mean, sq)
* Within patient = sum over patients (sum patch deviation from patient mean, sq)
* Residual = Total - explained

In [1]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
# Print the wall-clock time at which this cell was executed.
print(datetime.now())

2022-07-26 06:55:57.897447


In [2]:
# Root directory holding one sub-directory per patient.
# NOTE: the first assignment is immediately overridden by the second, so only
# the second path is in effect. Both are machine-specific absolute paths.
BASE_PATH_IN='/home/jrm/Adjeroh/Naved/CP_80K/'
BASE_PATH_IN='/Users/jasonmiller/Downloads/CellProfilerFiltered/'
# Directory containing the fold0_train.txt / fold0_test.txt class files.
CLASS_PATH_IN='/Users/jasonmiller/Downloads/TrainTestSplit/'
# Name of the CSV file read from each patient directory.
CSVFILE='Process100_Image.csv'
# Header text expected in column MCOLUMN; validate_column() checks the two agree.
MEASUREMENT='Mean_Nucleus_AreaShape_MeanRadius'
MCOLUMN = 888  # zero based column number within csv
# Patient ids are the entries of BASE_PATH_IN whose names start with 'TCGA-'.
# os.listdir returns entries in arbitrary order, so this order is not stable.
ALL_PATIENTS = [p for p in os.listdir(BASE_PATH_IN) if p.startswith('TCGA-')]
print('Num patients:',len(ALL_PATIENTS))
# Number of class labels; per-class lists below have this length and are
# indexed by the integer class label.
CANCER_CLASSES = 6

Num patients: 108


In [3]:
# load classes
def load_patient_to_class():
    """Build the patient -> cancer-class lookup from the fold-0 split files.

    Reads ``fold0_train.txt`` and ``fold0_test.txt`` under ``CLASS_PATH_IN``.
    Every non-blank row is parsed as CSV: the first field is a patch file
    name whose first 19 characters are taken as the patient identifier, and
    the second field is the integer class label.

    Returns:
        dict mapping patient identifier (str) to class label (int). The files
        repeat a patient once per patch; the last row seen for a patient wins.

    Raises:
        OSError: if either split file cannot be opened.
        ValueError: if a class label is not an integer.
        IndexError: if a non-blank row has fewer than two fields.
    """
    patient_to_class = {}
    split_files = [CLASS_PATH_IN + 'fold0_train.txt',
                   CLASS_PATH_IN + 'fold0_test.txt']
    for split_file in split_files:
        # newline='' lets the csv module do its own line-ending handling.
        with open(split_file, newline='') as infile:
            for row in csv.reader(infile):
                if not row:
                    # csv.reader yields [] for blank lines; skip them
                    # instead of failing on row[0].
                    continue
                patchfile = row[0]
                patient = patchfile[:19]
                # The file gives this information redundantly
                patient_to_class[patient] = int(row[1])
    return patient_to_class
# Notebook-level patient -> class lookup used by the later cells.
MAPPER = load_patient_to_class()

In [4]:
def validate_column():
    """Check that column MCOLUMN of every patient's CSV is the MEASUREMENT column.

    For each patient in ALL_PATIENTS the file BASE_PATH_IN/<patient>/CSVFILE
    is read. The header cell at MCOLUMN must equal MEASUREMENT, every later
    cell in that column must parse as a float, and there must be at least
    one data row. Prints one dot per patient as progress.

    Raises:
        Exception: on a header mismatch or a file with no data rows.
        ValueError: if a data cell is not numeric (from float()).
    """
    print('Validating column',MCOLUMN,'named',MEASUREMENT,':')
    for patient in ALL_PATIENTS:
        print('.', end='')
        csv_path = BASE_PATH_IN + patient + '/' + CSVFILE
        with open(csv_path) as infile:
            rows_seen = 0
            for rows_seen, row in enumerate(csv.reader(infile), start=1):
                cell = row[MCOLUMN]
                if rows_seen > 1:
                    # Data row: conversion alone is the check; it raises
                    # if the cell is not numeric.
                    float(cell)
                    continue
                # First row is the header.
                if cell != MEASUREMENT:
                    raise Exception('For '+patient+', column name is '+cell)
            if rows_seen < 2:
                raise Exception('no data rows for '+patient)
    print('\nColumn validated!')
    
# The call is deliberately disabled with `if False`; change the condition to
# re-run the check (for example after pointing BASE_PATH_IN at other data).
if False:  # no need to run this again on same dataset
    validate_column()

In [5]:
def load_basics():
    """Accumulate count, sum and mean of the measurement at three levels.

    Reads column MCOLUMN of BASE_PATH_IN/<patient>/CSVFILE for every patient
    in ALL_PATIENTS (the first row is treated as a header and skipped) and
    rolls the values up per patient, per class (via MAPPER) and overall.

    Returns:
        (patient_basics, class_basics, overall_basics) where
        * patient_basics maps patient -> (count, total, mean) tuple,
        * class_basics is a list of CANCER_CLASSES independent
          [count, total, mean] lists indexed by class label; a class with no
          patients keeps [0, 0, 0],
        * overall_basics is a single [count, total, mean] list.

    Raises:
        KeyError: if a patient is missing from MAPPER.
        ValueError: if a patient's file has no data rows, or a cell is not
            numeric.
    """
    patient_basics = {}
    # Build one fresh list per class. `[[0,0,0]] * n` would place the SAME
    # list object in every slot, so untouched classes would alias each other.
    class_basics = [[0, 0, 0] for _ in range(CANCER_CLASSES)]
    overall_basics = [0, 0, 0]
    for patient in ALL_PATIENTS:
        file = BASE_PATH_IN + patient + '/' + CSVFILE
        cancer_class = MAPPER[patient]
        patient_total = 0
        patient_count = 0
        with open(file, newline='') as infile:
            reader = csv.reader(infile)
            next(reader, None)  # skip the header row
            for row in reader:
                patient_total += float(row[MCOLUMN])
                patient_count += 1
        if patient_count == 0:
            # Without this the mean below fails with an anonymous
            # ZeroDivisionError; name the offending patient instead.
            raise ValueError('no data rows for '+patient)
        patient_mean = patient_total/patient_count
        patient_basics[patient] = (patient_count, patient_total, patient_mean)
        # running totals
        class_stats = class_basics[cancer_class]
        class_stats[0] += patient_count
        class_stats[1] += patient_total
        class_stats[2] = class_stats[1]/class_stats[0]
        overall_basics[0] += patient_count
        overall_basics[1] += patient_total
        overall_basics[2] = overall_basics[1]/overall_basics[0]
    return patient_basics,class_basics,overall_basics

# First pass over the CSV files: counts, totals and means at each level.
# These notebook-level names are read by the later cells.
patient_basics,class_basics,overall_basics=load_basics()
print('Class basics',class_basics)
print('Overall basics',overall_basics)

Class basics [[41289, 126105.65857789041, 3.0542192491436078], [12124, 44940.446064271106, 3.7067342514245385], [12209, 46142.85273802422, 3.7794129525779527], [5775, 21443.821306558217, 3.7132158106594315], [2850, 10773.871238824517, 3.780305697833164], [1518, 5775.828955408042, 3.804893910018473]]
Overall basics [75765, 255182.4788809765, 3.368078649521237]


In [6]:
# Index 2 of each [count, total, mean] entry is the mean.
print('class mean')
for cls in range(CANCER_CLASSES):
    print(class_basics[cls][2])
print('overall mean')    
print(overall_basics[2])

class mean
3.0542192491436078
3.7067342514245385
3.7794129525779527
3.7132158106594315
3.780305697833164
3.804893910018473
overall mean
3.368078649521237


In [9]:
def get_within_sse():
    """Second pass over the CSVs: sums of squared patch deviations.

    Each data value in column MCOLUMN contributes its squared deviation to
    three accumulators: from the overall mean, from the mean of the
    patient's class, and from the patient's own mean. The means come from
    overall_basics, class_basics and patient_basics.

    Returns:
        (overall_sse, class_sse, patient_sse): a number, a list with one
        entry per class label, and a dict keyed by patient.
    """
    grand_mean = overall_basics[2]
    # Every patient starts at 0 so the dict has an entry even for empty files.
    sse_by_patient = dict.fromkeys(ALL_PATIENTS, 0)
    sse_by_class = [0 for _ in range(CANCER_CLASSES)]
    sse_total = 0
    for patient in ALL_PATIENTS:
        csv_path = BASE_PATH_IN + patient + '/' + CSVFILE
        _, _, patient_mean = patient_basics[patient]
        label = MAPPER[patient]
        _, _, class_mean = class_basics[label]
        with open(csv_path) as infile:
            for line_no, row in enumerate(csv.reader(infile)):
                cell = row[MCOLUMN]
                if line_no == 0:
                    # header row carries the column name, not a value
                    continue
                x = float(cell)
                sse_total += (grand_mean - x)**2
                sse_by_class[label] += (class_mean - x)**2
                sse_by_patient[patient] += (patient_mean - x)**2
    return sse_total,sse_by_class,sse_by_patient

# Second pass over the CSV files: squared deviations at each level.
# Note the last print emits the entire per-patient dict.
overall_sse,class_sse,patient_sse = get_within_sse()
print('Overall SSE',overall_sse)
print('Class SSE',class_sse)
print('Patient SSE',patient_sse)

Overall SSE 21681.121924016566
Class SSE [6939.138869654018, 1904.765959639705, 2246.6322440864324, 884.796427698649, 572.1598256689408, 148.29201989130306]
Patient SSE {'TCGA-02-0010-01Z-00': 232.47944407257305, 'TCGA-HT-7616-01Z-00': 45.59234470635053, 'TCGA-14-1452-01Z-00': 284.37105336981494, 'TCGA-28-1746-01Z-00': 11.802895789675933, 'TCGA-DU-6404-01Z-00': 38.30176726739088, 'TCGA-S9-A6WE-01Z-00': 17.153189830600297, 'TCGA-S9-A6WI-01Z-00': 44.729748606819285, 'TCGA-QH-A6CZ-01Z-00': 41.48198943908842, 'TCGA-DU-6408-01Z-00': 23.861689153725436, 'TCGA-HT-7881-01Z-00': 151.1156013332581, 'TCGA-DU-7009-01Z-00': 22.634537383511624, 'TCGA-26-5139-01Z-00': 50.16706088188967, 'TCGA-DU-8167-01Z-00': 73.42382604971937, 'TCGA-HW-7491-01Z-00': 53.294186843216735, 'TCGA-14-1829-01Z-00': 233.9820800071377, 'TCGA-06-0129-01Z-00': 85.68093339366233, 'TCGA-DU-8165-01Z-00': 42.60760976876505, 'TCGA-DU-7015-01Z-00': 12.307376386088478, 'TCGA-06-0125-01Z-00': 36.49788422607574, 'TCGA-HT-7854-01Z-00': 

In [10]:
# stdev here is sqrt(SSE / count): it divides by the count N, not N-1.
# Index 0 of each basics entry is the count, index 2 the mean.
print('class mean & stdev')
for cls in range(CANCER_CLASSES):
    print(class_basics[cls][2],np.sqrt(class_sse[cls]/class_basics[cls][0]))
print('overall mean & stdev')    
print(overall_basics[2],np.sqrt(overall_sse/overall_basics[0]))

class mean & stdev
3.0542192491436078 0.4099544520508094
3.7067342514245385 0.39636732590761664
3.7794129525779527 0.42896903802884073
3.7132158106594315 0.3914224094655296
3.780305697833164 0.44806007807887704
3.804893910018473 0.31255251984627314
overall mean & stdev
3.368078649521237 0.5349418337034457


In [12]:
def get_between_sse():
    """Count-weighted squared deviations of group means.

    Uses the means already stored in overall_basics, class_basics and
    patient_basics; no files are read.

    Returns:
        (class_between, patient_between) where class_between sums, over class
        labels, count * (overall mean - class mean)**2, and patient_between
        sums, over patients, count * (class mean - patient mean)**2 using the
        mean of the class that MAPPER assigns to the patient.
    """
    grand_mean = overall_basics[2]

    between_classes = 0
    for label in range(CANCER_CLASSES):
        n_patches, _, class_mean = class_basics[label]
        between_classes += n_patches * (grand_mean - class_mean)**2

    between_patients = 0
    for patient in ALL_PATIENTS:
        n_patches, _, patient_mean = patient_basics[patient]
        # Patient means are compared with their own class mean,
        # not with the overall mean.
        _, _, class_mean = class_basics[MAPPER[patient]]
        between_patients += n_patches * (class_mean - patient_mean)**2

    return between_classes,between_patients

# Between-group sums of squares, computed from the stored means only.
class_between,patient_between = get_between_sse()
print('SSE between classes',class_between)
print('SSE between patients',patient_between)

SSE between classes 8985.336577377375
SSE between patients 3037.7352735549966
