# ANOVA
Process Image.csv files, which has one measurement per WSI patch.

Sources of variance (ignoring the WSI level):
* Total = between patches = sum over patches (patch deviation from global mean, sq)
* Between class = weight sum over class (sum class mean deviation from global mean, sq)
* Within class = sum over class (sum class mean deviation from global mean, sq)
* Between patient = weight sum over patients (sum patient mean deviation from global mean, sq) 
* Within patient = sum over patients (sum patch deviation from patient mean, sq)
* Residual = Total - explained

In [29]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
print(datetime.now())

2022-07-25 13:06:32.985685


In [30]:
BASE_PATH_IN='/home/jrm/Adjeroh/Naved/CP_80K/'
BASE_PATH_IN='/Users/jasonmiller/Downloads/CellProfilerFiltered/'
CLASS_PATH_IN='/Users/jasonmiller/Downloads/TrainTestSplit/'
CSVFILE='Process100_Image.csv'
MEASUREMENT='Mean_Nucleus_AreaShape_MeanRadius'
MCOLUMN = 888  # zero based column number within csv
ALL_PATIENTS = [p for p in os.listdir(BASE_PATH_IN) if p.startswith('TCGA-')]
print('Num patients:',len(ALL_PATIENTS))
CANCER_CLASSES = 6

Num patients: 108


In [31]:
# load classes
def load_patient_to_class():
    PATIENT_TO_CLASS={}
    file1 = CLASS_PATH_IN + 'fold0_train.txt'
    file2 = CLASS_PATH_IN + 'fold0_test.txt'
    for file in [file1,file2]:
        with open(file) as infile:
            reader = csv.reader(infile)
            for row in reader:
                patchfile = row[0]
                patient = patchfile[:19]
                cancer = int(row[1])
                # The file gives this information redundantly
                PATIENT_TO_CLASS[patient] = cancer
    return PATIENT_TO_CLASS
MAPPER = load_patient_to_class()

In [33]:
def validate_column():
    print('Validating column',MCOLUMN,'named',MEASUREMENT,':')
    for patient in ALL_PATIENTS:
        print('.', end='')
        file = BASE_PATH_IN + patient + '/' + CSVFILE
        # print(file)
        with open(file) as infile:
            reader = csv.reader(infile)
            rows = 0
            for row in reader:
                value = row[MCOLUMN]
                if rows == 0:
                    if value!=MEASUREMENT:
                        raise Exception('For '+patient+', column name is '+value)
                else:
                    numeric = float(value)  # throws exception if not numeric
                rows += 1
            if rows < 2:
                raise Exception('no data rows for '+patient)
    print('\nColumn validated!')
    
if False:  # no need to run this again on same dataset
    validate_column()

In [36]:
# patient totals and means
def load_patient_basics():
    basics={}
    for patient in ALL_PATIENTS:
        file = BASE_PATH_IN + patient + '/' + CSVFILE
        with open(file) as infile:
            total = 0
            count = 0
            reader = csv.reader(infile)
            header = True
            for row in reader:
                value = row[MCOLUMN]
                if header:
                    header = False
                else:
                    total += float(value)
                    count += 1
        mean = total/count
        count_total_mean = (count,total,mean)
        basics[patient] = count_total_mean
    return basics
BASICS = load_patient_basics()

In [38]:
def get_means():
    overall_count = 0
    overall_total = 0
    class_counts=np.zeros(6,dtype=int)
    class_totals=np.zeros(6,dtype=float)
    for patient in ALL_PATIENTS:
        count,total,mean = BASICS[patient]
        patient_class = MAPPER[patient]
        overall_count += count
        overall_total += total
        class_counts[patient_class] += count
        class_totals[patient_class] += total
    class_means=class_totals/class_counts
    overall_mean=overall_total/overall_count
    return class_means,overall_mean
CLASS_MEANS, OVERALL_MEAN = get_means()
print(CLASS_MEANS)
print(OVERALL_MEAN)

[3.05421925 3.70673425 3.77941295 3.71321581 3.7803057  3.80489391]
3.368078649521237


In [34]:
# Parameter could be patient mean, class mean, or overall mean
def sse_one_patient(directory, the_mean):
    file = BASE_PATH_IN + directory + '/' + CSVFILE
    SSE = 0
    with open(file) as infile:
        reader = csv.reader(infile)
        header = True
        for row in reader:
            value = row[MCOLUMN]
            if header:
                header = False
            else:
                value = float(value)
                error = the_mean - value
                square = error**2
                SSE += square
    return SSE

In [35]:
# within patient SSE
patient_SSE_within = 0
for patient in ALL_PATIENTS:
    pcount,ptotal,pmean = BASICS[patient]
    SSE_within = within_variance_one_patient(patient, pmean)
    patient_SSE_within += SSE_within
    print('Patient',patient,'mean',patient_mean,'SSE_w',SSE_within)
print('Patient-level SSE within:',patient_SSE_within)

Patient TCGA-02-0010-01Z-00 mean 2.99919405777337 SSE_w 232.47944407257305
Patient TCGA-HT-7616-01Z-00 mean 2.99919405777337 SSE_w 45.59234470635053
Patient TCGA-14-1452-01Z-00 mean 2.99919405777337 SSE_w 284.37105336981494
Patient TCGA-28-1746-01Z-00 mean 2.99919405777337 SSE_w 11.802895789675933
Patient TCGA-DU-6404-01Z-00 mean 2.99919405777337 SSE_w 38.30176726739088
Patient TCGA-S9-A6WE-01Z-00 mean 2.99919405777337 SSE_w 17.153189830600297
Patient TCGA-S9-A6WI-01Z-00 mean 2.99919405777337 SSE_w 44.729748606819285
Patient TCGA-QH-A6CZ-01Z-00 mean 2.99919405777337 SSE_w 41.48198943908842
Patient TCGA-DU-6408-01Z-00 mean 2.99919405777337 SSE_w 23.861689153725436
Patient TCGA-HT-7881-01Z-00 mean 2.99919405777337 SSE_w 151.1156013332581
Patient TCGA-DU-7009-01Z-00 mean 2.99919405777337 SSE_w 22.634537383511624
Patient TCGA-26-5139-01Z-00 mean 2.99919405777337 SSE_w 50.16706088188967
Patient TCGA-DU-8167-01Z-00 mean 2.99919405777337 SSE_w 73.42382604971937
Patient TCGA-HW-7491-01Z-00 mea

In [None]:
# within class SSE
counts=np.array(CANCER_CLASSES,dtype=int)
totals=np.array(CANCER_CLASSES,dtype=float)
for patient in ALL_PATIENTS:
        cancer_class = MAPPER[patient]
        

In [10]:
def get_total_variance(global_mean):
    total_SSE = 0
    for patient in os.listdir(BASE_PATH_IN):
        if patient.startswith('TCGA-'):
            file = BASE_PATH_IN + patient + '/' + CSVFILE
            with open(file) as infile:
                reader = csv.reader(infile)
                header = True
                for row in reader:
                    value = row[MCOLUMN]
                    if header:
                        header = False
                    else:
                        value = float(value)
                        error = global_mean - value
                        square = error**2
                        total_SSE += square
    return total_SSE

In [7]:
def get_global_count():
    grand_count = 0
    grand_total = 0
    for patient in os.listdir(BASE_PATH_IN):
        if patient.startswith('TCGA-'):
            patient_count, patient_total = count_one_patient(patient)
            grand_count += patient_count
            grand_total += patient_total
            patient_mean = patient_total / patient_count
            # print(patient,patient_mean)
    return grand_count, grand_total


In [8]:
global_count, global_total = get_global_count()
global_mean = global_total / global_count
print('Global mean',global_mean)

TCGA-02-0010-01Z-00 2.9064318134506393
TCGA-HT-7616-01Z-00 3.704587956514372
TCGA-14-1452-01Z-00 2.9461776766095933
TCGA-28-1746-01Z-00 3.2849103796962877
TCGA-DU-6404-01Z-00 3.6864243564491557
TCGA-S9-A6WE-01Z-00 3.3019203550651817
TCGA-S9-A6WI-01Z-00 3.376396004883294
TCGA-QH-A6CZ-01Z-00 3.5959338396654132
TCGA-DU-6408-01Z-00 3.514234180896044
TCGA-HT-7881-01Z-00 3.6548921233716807
TCGA-DU-7009-01Z-00 3.38643809327903
TCGA-26-5139-01Z-00 3.287579389830524
TCGA-DU-8167-01Z-00 3.746975492448168
TCGA-HW-7491-01Z-00 3.558147832054447
TCGA-14-1829-01Z-00 3.1441575435397016
TCGA-06-0129-01Z-00 3.0907424041228335
TCGA-DU-8165-01Z-00 3.878493938639796
TCGA-DU-7015-01Z-00 3.340709092089833
TCGA-06-0125-01Z-00 2.9174883245532013
TCGA-HT-7854-01Z-00 3.894404010019071
TCGA-DU-5874-01Z-00 3.6498957610084566
TCGA-QH-A6XA-01Z-00 3.4080852779569106
TCGA-HT-7606-01Z-00 3.8012705232240425
TCGA-02-0439-01Z-00 3.15328227036864
TCGA-26-5133-01Z-00 3.8956279552755624
TCGA-HT-A618-01Z-00 3.5840964414621093

In [11]:
total_SSE = get_total_variance(global_mean)
print('Total SSE',total_SSE)

Total SSE 21681.121924016566
