# ANOVA study on Shapley explanation maps

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import xlsxwriter
from matplotlib import pyplot as plt


import scipy.stats as stats
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
# Action-unit (AU) columns split into upper-face and lower-face groups;
# together they are the 17 AUs that run_anova can analyse.
upper_face_aus = ['AU01','AU02','AU04','AU05','AU06','AU07','AU09','AU45']
lower_face_aus = ['AU10','AU12','AU14','AU15','AU17','AU20','AU23','AU25','AU26']

# Attribution table read by run_anova as a module-level global.
# Only one export is loaded: the '_om' file used to be read first and then
# immediately overwritten by the next read, so that load was pure overhead.
# The alternative exports are kept commented out for quick switching.
# df = pd.read_csv('../dataset/pickled_datasets/output_attribution/shapley_sampling_all_dataset_om.csv')
df = pd.read_csv('../dataset/pickled_datasets/output_attribution/shapley_sampling_all_dataset.csv')
# df = pd.read_csv('../dataset/pickled_datasets/output_attribution/shapley_sampling_all_dataset_om_V2.csv')


def query_df(df,subject=None,study=None,paradigm=None,trial=None,timestep=None,label=None):
    """
    Filter a dataframe by any combination of its identifying columns.

    Each keyword names a column of ``df``. When a value is given it must be
    a list, and only rows whose column value is in that list are kept.
    All supplied filters are combined with a logical AND.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is never modified.
    subject, study, paradigm, trial, timestep, label : list, optional
        Allowed values for the column of the same name. ``None`` (the
        default) means "do not filter on this column".

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows. When no filter is supplied a
        copy of the whole frame is returned (an empty query string would
        otherwise make ``DataFrame.query`` raise).
    """
    # Column name -> allowed values, in the order the clauses are emitted.
    filters = (
        ("subject", subject),
        ("study", study),
        ("paradigm", paradigm),
        ("trial", trial),
        ("timestep", timestep),
        ("label", label),
    )
    # `is not None` (rather than `!= None`) keeps the check a plain identity
    # test even for array-like arguments that overload comparison.
    clauses = [
        column + " in " + str(allowed)
        for column, allowed in filters
        if allowed is not None
    ]
    if not clauses:
        return df.copy()
    return df.query(" & ".join(clauses))

def run_anova(subjects, columns, aus='both', timestep_range=0, showgraph=False):
    """
    Run a two-factor ANOVA (label and AU) on grouped AU values.

    The module-level ``df`` is filtered to the requested subjects, to
    labels 0/1 and to the chosen timestep window. AU columns are averaged
    per unique combination of ``columns``, melted to long form and fitted
    with the OLS model ``value ~ C(label) + C(AU)``.

    Parameters
    ----------
    subjects : list
        Subject identifiers, passed to ``query_df`` as the subject filter.
    columns : list of str
        Columns to group by before averaging. Must contain ``'label'``,
        which is the only identifier kept when melting.
    aus : str
        ``'both'``, ``'upper'``, ``'lower'``, or the name of a single AU
        from ``upper_face_aus`` / ``lower_face_aus``.
    timestep_range : int
        Timestep window: 0 -> 0..86, 1 -> 0..33, 2 -> 34..57, 3 -> 58..86.
    showgraph : bool
        When true, draw a box plot of value per label, coloured by AU.

    Returns
    -------
    pandas.DataFrame
        The type-2 ANOVA table produced by ``sm.stats.anova_lm``.

    Raises
    ------
    ValueError
        If ``timestep_range`` or ``aus`` is not one of the supported
        choices.

    Notes
    -----
    Side effect: writes (and re-reads) ``meltdf_predictions.csv`` in the
    current working directory.
    """
    # Choose a timestep window. An unknown selector used to leave the
    # variable unbound and fail later with an UnboundLocalError.
    timestep_windows = {
        0: range(0, 87),
        1: range(0, 34),
        2: range(34, 58),
        3: range(58, 87),
    }
    if timestep_range not in timestep_windows:
        raise ValueError("timestep_range must be one of 0, 1, 2 or 3.")
    timestep = list(timestep_windows[timestep_range])

    # Select the AUs to study (validated before any expensive work).
    all_aus = upper_face_aus + lower_face_aus
    if aus=='both':
        aus_to_study = all_aus
    elif aus=='upper':
        aus_to_study = upper_face_aus
    elif aus=='lower':
        aus_to_study = lower_face_aus
    elif aus in all_aus:
        aus_to_study = [aus]
    else:
        raise ValueError("AU choices are 'both', 'upper', 'lower', or one in 17 AUs considered for the study. Please choose one among these.")

    dfq = query_df(df, label=[0,1], subject=subjects, timestep=timestep)

    # Average only the AU columns per group. Selecting them before the
    # aggregation gives the same numbers as aggregating everything and
    # slicing afterwards, but does not try to average non-numeric columns
    # (which newer pandas versions refuse to do).
    cols_consider = list(columns)
    dfin_anova = dfq.groupby(cols_consider)[all_aus].mean().reset_index()

    # Collapse the numbered variants onto their base name. As before, the
    # replacement is applied to the whole frame and rows are not
    # re-aggregated afterwards.
    dfin_anova = dfin_anova.replace({
        'cw1': 'cw', 'cw2': 'cw',
        'caw1': 'caw', 'caw2': 'caw',
        'wg1': 'wg', 'wg2': 'wg',
        'wag1': 'wag', 'wag2': 'wag',
    })

    # Define the columns to extract
    columns_to_extract = cols_consider + aus_to_study

    # Melt the columns for ANOVA: one row per (label, AU) observation.
    d_melt = pd.melt(dfin_anova[columns_to_extract], id_vars=['label'], value_vars=aus_to_study)
    d_melt.columns = ['label', 'AU', 'value']

    # Saving hack to fix issues with pandas: round-trip through CSV before
    # handing the frame to the model. Kept unchanged on purpose.
    d_melt.to_csv('meltdf_predictions.csv')
    d_melt = pd.read_csv('meltdf_predictions.csv')

    if showgraph:
        plt.figure(figsize=(10,10))
        sns.boxplot(x="label", y="value", hue="AU", data=d_melt, palette="Set3")

    # Ordinary Least Squares (OLS) model
    model = ols('value ~ C(label) + C(AU) ', data=d_melt).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    return (anova_table)


# Worksheet cursor shared by the helper functions below, which access it
# through `global row` / `global col`. Both start at the top-left cell.
row, col = 0, 0

def init_workbook():
    """
    Create ``anova_study.xlsx`` with one worksheet and write its header row.

    Resets the module-level ``row`` and ``col`` cursor to 0 and writes the
    twelve column titles into row 0, starting at column 0.

    Returns
    -------
    tuple
        ``(workbook, worksheet, row, col)``. The bold red format that is
        registered on the workbook is not part of the return value.
    """
    global row, col

    book = xlsxwriter.Workbook('anova_study.xlsx')
    sheet = book.add_worksheet()

    # Register a bold, red cell format on the workbook.
    highlight = book.add_format()
    highlight.set_bold()
    highlight.set_font_color('red')

    # Rewind the shared cursor to the top-left cell.
    row, col = 0, 0

    # Column titles, left to right.
    titles = (
        "Subject ID",
        "Paradigm",
        "Study",
        "Trial",
        "AUs",
        "Timestep Range",
        "F_label",
        "p_label",
        "p_label_significance",
        "F_AU",
        "p_AU",
        "p_AU_significance",
    )
    for offset, title in enumerate(titles):
        sheet.write(row, col + offset, title)

    return (book, sheet, row, col)

def write_to_worksheet(F_label, p_label, F_AU, p_AU, cols_to_consider, subject):
    """
    Write one result line to the worksheet, one row below the cursor.

    The line is written at ``row + 1`` so that row 0 stays free for the
    header. This function does not advance ``row``; the caller does.

    Besides its arguments, the function reads these module-level names:
    ``worksheet`` and ``cell_format`` (output target and highlight style),
    ``row`` and ``col`` (cursor), and ``aus`` and ``timestep_range`` (the
    values recorded in the "AUs" and "Timestep Range" columns).
    NOTE(review): ``aus`` and ``timestep_range`` are therefore whatever the
    module-level loop currently holds, not necessarily what the ANOVA was
    run with if this is called from elsewhere.

    Parameters
    ----------
    F_label, p_label : float
        F statistic and p-value for the label factor.
    F_AU, p_AU : float
        F statistic and p-value for the AU factor.
    cols_to_consider : list of str
        Grouping columns used for the run; the presence of 'paradigm',
        'study' and 'trial' is recorded as "True"/"False".
    subject : list or scalar
        A one-element list is written as that element; lists of length 2
        and 7 are written as 'high rate' and 'all'; any other list as
        'low rate'. A non-list value is written as is.
    """
    target_row = row + 1

    # Column 0: subject, or a group name derived from the list length.
    if isinstance(subject, list):
        if len(subject)==1:
            subject_label = subject[0]
        elif len(subject)==2:
            subject_label = 'high rate'
        elif len(subject)==7:
            subject_label = 'all'
        else:
            subject_label = 'low rate'
    else:
        subject_label = subject
    worksheet.write(target_row, col, subject_label)

    # Columns 1-3: which grouping factors were part of this run.
    for offset, factor in enumerate(('paradigm', 'study', 'trial'), start=1):
        if factor in cols_to_consider:
            worksheet.write(target_row, col+offset, "True", cell_format)
        else:
            worksheet.write(target_row, col+offset, "False")

    worksheet.write(target_row, col+4, aus)
    worksheet.write(target_row, col+5, timestep_range)

    # Columns 6-11: statistic, p-value and significance flag per factor.
    # NOTE(review): the cut-off is 0.005, not the more common 0.05 - confirm
    # that this is intended.
    significance_level = 0.005
    for offset, f_value, p_value in ((6, F_label, p_label), (9, F_AU, p_AU)):
        worksheet.write(target_row, col+offset, f_value)
        worksheet.write(target_row, col+offset+1, p_value)
        if p_value < significance_level:
            worksheet.write(target_row, col+offset+2, "Yes", cell_format)
        else:
            worksheet.write(target_row, col+offset+2, "No")
    

def return_anova_results(anova_results):
    """
    Extract the F statistics and p-values of the first two ANOVA terms.

    Parameters
    ----------
    anova_results : pandas.DataFrame
        ANOVA table with ``'F'`` and ``'PR(>F)'`` columns and at least two
        rows. For the model fitted in ``run_anova``
        (``value ~ C(label) + C(AU)``) row 0 is presumably the label term
        and row 1 the AU term.

    Returns
    -------
    tuple
        ``(F_label, p_label, F_AU, p_AU)``.
    """
    f_values = anova_results['F']
    p_values = anova_results['PR(>F)']

    # The table index holds term names (strings), so rows are addressed by
    # position explicitly. Plain ``series[0]`` relied on pandas falling back
    # to positional lookup, which is deprecated.
    F_label = f_values.iloc[0]
    p_label = p_values.iloc[0]
    F_AU = f_values.iloc[1]
    p_AU = p_values.iloc[1]

    return(F_label, p_label, F_AU, p_AU)

def run_experiments(subject, aus, timestep_range):
    """
    Run the five grouping variants of the ANOVA and log each result.

    For every variant the ANOVA is computed with ``run_anova``, unpacked
    with ``return_anova_results`` and written with ``write_to_worksheet``.
    The module-level ``row`` cursor is advanced by one after each variant,
    i.e. by five in total.

    Parameters
    ----------
    subject : list or scalar
        Subject(s) to analyse; a non-list value is wrapped in a list.
    aus : str
        AU selection forwarded to ``run_anova``.
    timestep_range : int
        Timestep window selector forwarded to ``run_anova``.

    Notes
    -----
    ``write_to_worksheet`` takes the AU selection and timestep range it
    records from module-level variables, not from these parameters, so the
    caller has to keep the two in sync.
    """
    global row

    if not isinstance(subject, list):
        subject = [subject]

    # Grouping columns per variant, in the order they are written out.
    column_sets = (
        # single trial analysis
        ['paradigm', 'trial', 'study', 'label'],
        # if we don't consider impact of study days
        ['paradigm', 'trial', 'label'],
        # if we don't consider impact of different trials
        ['paradigm', 'study', 'label'],
        # if we just consider the impact of study days
        ['study', 'label'],
        # if we just consider the impact of paradigms
        ['paradigm', 'label'],
    )

    for cols_to_consider in column_sets:
        anova_results = run_anova(subjects=subject, columns=cols_to_consider, aus=aus, timestep_range=timestep_range)
        F_label, p_label, F_AU, p_AU = return_anova_results(anova_results)
        write_to_worksheet(F_label, p_label, F_AU, p_AU, cols_to_consider, subject)
        row+=1


# Driver: run every experiment and collect the results in one workbook.
#
# This deliberately stays at module level. write_to_worksheet reads
# `worksheet`, `cell_format`, `aus` and `timestep_range` as module globals,
# so the loop variables below must keep exactly these names.
workbook = xlsxwriter.Workbook('anova_study_all.xlsx')
try:
    worksheet = workbook.add_worksheet()

    # Highlight style used for "True" flags and significant results.
    cell_format = workbook.add_format({'bold': True, 'font_color': 'red'})

    # Header row; result lines are written from row 1 onwards.
    header_titles = [
        "Subject",
        "Paradigm",
        "Study",
        "Trial",
        "AUs",
        "Timestep Range",
        "F_label",
        "p_label",
        "p_label_significance",
        "F_AU",
        "p_AU",
        "p_AU_significance",
    ]
    for header_col, title in enumerate(header_titles):
        worksheet.write(0, header_col, title)

    subjects = ['942', '970', '971', '982', '1131', '1196', '1214']
    # Sub-groups; write_to_worksheet labels them 'high rate' / 'low rate'.
    high_rate_subjects = ['971', '982']
    low_rate_subjects = ['942', '970', '1131', '1196', '1214']

    # Each subject on its own first, then the two sub-groups, then everyone.
    subject_selections = subjects + [high_rate_subjects, low_rate_subjects, subjects]

    for subject in subject_selections:
        for timestep_range in range(4):
            for aus in ['both', 'upper', 'lower']:
                run_experiments(subject, aus, timestep_range)
                # Leave one blank row after each batch of five results.
                row+=1
finally:
    # Always close the workbook so the file handle is released and the rows
    # written so far are flushed, even if an experiment raised.
    workbook.close()

print("Ran experiments and wrote to Excel file")