# ANOVA
Process Image.csv files, which has one measurement per WSI patch.

Here, whether there is a class effect on one measurement.  
Ignore the variance within patient; just use each patient's mean.  
Compute the F-test rather than full anova.   

[Link](https://saylordotorg.github.io/text_introductory-statistics/s15-04-f-tests-in-one-way-anova.html#:~:text=Test%20Statistic%20for%20Testing%20the%20Null%20Hypothesis%20that%20K%20Population%20Means%20Are%20Equal&text=If%20the%20K%20populations%20are,f2%3Dn%E2%88%92K.) to academic page on F-test with formulas.
[Link](http://www.socr.ucla.edu/Applets.dir/F_Table.html) to table of F significance values.

Given 1-N observactions x for 1-k classes.   
Each class has ni observations.    
Class mean $\bar{x_i} = [ \Sigma_i x_i ] / (n_i)$   
Overall mean $\bar{x} = [ \Sigma_1^N x_i ] / (N)$    
Or, overall mean $\bar{x} = [ \Sigma_1^k n_i \bar{x_i} ] / (N)$    
Sample variance $s^2 = [ \Sigma (\bar{x}-\bar{x_i}) ] /(N-1)$
This is corrected for extrapolation to the population; 
population variance $\sigma^2$ is not used.   
Between and within sum of square error:   
SSE_b = MST = $[ \Sigma_1^k n_i (\bar{x}-\bar{x_i}) ] / (k-1)$   
SSE_w MSE = $[ \Sigma_1^k (n_i-1)s_i^2 ] / (N-k)$    
also called mean square for treatment/class and for error.


In [2]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
print(datetime.now())

2022-07-26 16:49:07.803970


In [3]:
BASE_PATH_IN='/home/jrm/Adjeroh/Naved/CP_80K/'
BASE_PATH_IN='/Users/jasonmiller/Downloads/CellProfilerFiltered/'
CLASS_PATH_IN='/Users/jasonmiller/Downloads/TrainTestSplit/'
CSVFILE='Process100_Image.csv'
MEASUREMENT='Mean_Nucleus_AreaShape_MeanRadius'
MCOLUMN = 888  # zero based column number within csv
ALL_PATIENTS = [p for p in os.listdir(BASE_PATH_IN) if p.startswith('TCGA-')]
print('Num patients:',len(ALL_PATIENTS))
CANCER_CLASSES = 6

Num patients: 108


In [4]:
# load classes
def load_patient_to_class():
    PATIENT_TO_CLASS={}
    file1 = CLASS_PATH_IN + 'fold0_train.txt'
    file2 = CLASS_PATH_IN + 'fold0_test.txt'
    for file in [file1,file2]:
        with open(file) as infile:
            reader = csv.reader(infile)
            for row in reader:
                patchfile = row[0]
                patient = patchfile[:19]
                cancer = int(row[1])
                # The file gives this information redundantly
                PATIENT_TO_CLASS[patient] = cancer
    return PATIENT_TO_CLASS
MAPPER = load_patient_to_class()

In [5]:
def load_basics():
    patient_basics={}
    class_basics=[[0,0,0]] * CANCER_CLASSES
    overall_basics=[0,0,0]
    for patient in ALL_PATIENTS:
        file = BASE_PATH_IN + patient + '/' + CSVFILE
        cancer_class = MAPPER[patient]
        patient_total = 0
        patient_count = 0
        with open(file) as infile:
            reader = csv.reader(infile)
            header = True
            for row in reader:
                value = row[MCOLUMN]
                if header:
                    header = False
                else:
                    value = float(value)
                    patient_total += value
                    patient_count += 1
        patient_mean = patient_total/patient_count
        count_total_mean = (patient_count,patient_total,patient_mean)
        patient_basics[patient] = count_total_mean
        # running totals per class
        count,total,mean = class_basics[cancer_class]
        count+=patient_count
        total+=patient_total
        mean=total/count
        class_basics[cancer_class] = [count,total,mean] 
        # running totals overall
        count,total,mean = overall_basics
        count+=patient_count
        total+=patient_total
        mean=total/count
        overall_basics = [count,total,mean] 
        print('.',end='')  # progress bar
    print()
    return patient_basics,class_basics,overall_basics

patient_basics,class_basics,overall_basics=load_basics()
print('Class basics',class_basics)
print('Overall basics',overall_basics)

Class basics [[41289, 126105.65857789041, 3.0542192491436078], [12124, 44940.446064271106, 3.7067342514245385], [12209, 46142.85273802422, 3.7794129525779527], [5775, 21443.821306558217, 3.7132158106594315], [2850, 10773.871238824517, 3.780305697833164], [1518, 5775.828955408042, 3.804893910018473]]
Overall basics [75765, 255182.4788809765, 3.368078649521237]


In [6]:
print('class mean')
for cls in range(CANCER_CLASSES):
    count,total,mean = class_basics[cls]
    print(mean)
print('overall mean')   
count,total,mean = overall_basics
print(mean)

class mean
3.0542192491436078
3.7067342514245385
3.7794129525779527
3.7132158106594315
3.780305697833164
3.804893910018473
overall mean
3.368078649521237


In [18]:
def get_between_sse():
    ocount,ototal,omean = overall_basics
    numerator = 0
    for cls in range(CANCER_CLASSES):
        ccount,ctotal,cmean = class_basics[cls]
        deviation = omean - cmean 
        term = (ccount-1) * deviation**2
        numerator += term
    denominator = CANCER_CLASSES - 1
    SSE = numerator / denominator
    return SSE

MST = get_between_sse()
MST

1796.8948655514876

In [19]:
def get_variances():
    variances = [0] * CANCER_CLASSES
    for patient,pbasics in patient_basics.items():
        # print(patient)
        pcount,ptotal,pmean = pbasics
        cancer_class = MAPPER[patient]
        ccount,ctotal,cmean = class_basics[cancer_class]
        # consider the patient mean as scalar, no variance
        deviation = cmean - pmean
        ds = deviation**2
        variances[cancer_class] = variances[cancer_class] + ds
    for cls in range(CANCER_CLASSES):
        ccount,ctotal,cmean = class_basics[cls]
        numerator = variances[cls]
        denominator = ccount
        variances[cls] = numerator / denominator
    return variances

s2 = get_variances()
s2

[8.546539085607812e-05,
 0.00010241537954172574,
 9.056057382927914e-05,
 7.857465983257299e-05,
 0.00012526445027391536,
 5.081550759273388e-05]

In [20]:
def get_within_sse(s2):
    ocount,ototal,omean = overall_basics
    numerator = 0
    for cls in range(CANCER_CLASSES):
        ccount,ctotal,cmean = class_basics[cls]
        var = s2[cls]
        term = (ccount-1) * var
        numerator += term
    denominator = ocount - CANCER_CLASSES
    SSE = numerator / denominator
    return SSE

MSE = get_within_sse(s2)
MSE

8.927646641164446e-05

In [22]:
Fstat = MST/MSE
print(MST,MSE,Fstat)

1796.8948655514876 8.927646641164446e-05 20127307.203963395


In [None]:
within SSE:     
    
    patient_sse={}
    class_sse=[0] * CANCER_CLASSES
    overall_sse=0
    overall_mean = overall_basics[2]
    for patient in ALL_PATIENTS:
        patient_sse[patient] = 0
    for patient in ALL_PATIENTS:
        file = BASE_PATH_IN + patient + '/' + CSVFILE
        pcount,ptotal,pmean = patient_basics[patient]
        cancer_class = MAPPER[patient]
        ccount,ctotal,cmean = class_basics[cancer_class]
        with open(file) as infile:
            reader = csv.reader(infile)
            header = True
            for row in reader:
                value = row[MCOLUMN]
                if header:
                    header = False
                else:
                    value = float(value)
                    deviation = overall_mean - value
                    overall_sse += deviation**2
                    deviation = cmean - value
                    class_sse[cancer_class] = class_sse[cancer_class] + deviation**2
                    deviation =  pmean - value
                    patient_sse[patient] = patient_sse[patient] + deviation**2
    return overall_sse,class_sse,patient_sse

overall_sse,class_sse,patient_sse = get_within_sse()
print('Overall SSE',overall_sse)
print('Class SSE',class_sse)
print('Patient SSE',patient_sse)

In [14]:
print('class count, mean, stdev')
for cls in range(CANCER_CLASSES):
    ccount,ctotal,cmean = class_basics[cls]
    csse = class_sse[cls]
    print(ccount,cmean,np.sqrt(csse/(ccount-1)))
print('overall count, mean, stdev')    
ocount,ototal,omean = overall_basics
print(ocount,omean,np.sqrt(overall_sse/(ocount-1)))

class count, mean, stdev
41289 3.0542192491436078 0.40995941659219903
12124 3.7067342514245385 0.39638367331141006
12209 3.7794129525779527 0.42898660684657497
5775 3.7132158106594315 0.3914563032534847
2850 3.780305697833164 0.44813870580223025
1518 3.804893910018473 0.31265551952606574
overall count, mean, stdev
75765 3.368078649521237 0.5349453640085283


In [12]:
def get_between_sse():
    patient_between=0
    class_between=0
    overall_mean = overall_basics[2]
    for cls in range(CANCER_CLASSES):
        ccount,ctotal,cmean = class_basics[cls]
        deviation = overall_mean - cmean
        wsse = ccount * deviation**2
        class_between += wsse
    for patient in ALL_PATIENTS:
        pcount,ptotal,pmean = patient_basics[patient]
        cancer_class = MAPPER[patient]
        ccount,ctotal,cmean = class_basics[cancer_class]
        deviation = cmean - pmean
        wsse = pcount * deviation**2
        patient_between += wsse
    return class_between,patient_between

class_between,patient_between = get_between_sse()
print('SSE between classes',class_between)
print('SSE between patients',patient_between)

SSE between classes 8985.336577377375
SSE between patients 3037.7352735549966
