# Validate Accuracy Scores

In [None]:
import os
import pandas as pd
from sklearn.metrics import root_mean_squared_error, r2_score
from scipy import stats
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def _load_timing_features(feature_dir):
    """Load the inter- and intra-timing feature tables found in *feature_dir*.

    Reads ``inter_timing.tsv`` and ``intra_timing.tsv`` (tab-separated, first
    column used as the index), sorts each by index and joins them column-wise.

    Returns:
        A single DataFrame holding the columns of both tables.
    """
    tables = [
        pd.read_csv(os.path.join(feature_dir, fname), sep='\t', index_col=0).sort_index()
        for fname in ('inter_timing.tsv', 'intra_timing.tsv')
    ]
    # pd.concat(axis=1) aligns on the index (outer join by default), so index
    # entries present in only one table are kept and padded with NaN.
    return pd.concat(tables, axis=1)


# Corpus label -> base directory holding that corpus' fluency feature files.
inputDict = {
    'JASMIN-NL':'/vol/tensusers2/wharmsen/JASMIN-fluency-features/comp-q-read_nl_age7-11_nat',
    'JASMIN-VL':'/vol/tensusers2/wharmsen/JASMIN-fluency-features/comp-q-read_vl_age7-11_nat',
    'SERDA-NL':'/vol/tensusers2/wharmsen/SERDA-fluency-features/comp1'
}

# Sub-directory of the automatic features; also used as column prefix.
asrSystem = 'whispert_vad_dis'

corrDF_list = []  # not filled in this cell; the correlation cell re-initialises it
combiDF_list = []
for corpus, basepath in inputDict.items():
    print(corpus)

    # Manually derived features.
    manDF = _load_timing_features(
        os.path.join(basepath, '06_manual_fluency_features/json-fluency-features')
    )

    # Variables to validate: every manual column except (the first) 'other'.
    # NOTE: accVars is overwritten for each corpus, so later cells see the
    # list derived from the last corpus in inputDict.
    accVars = list(manDF.columns)
    if 'other' in accVars:
        accVars.remove('other')
    print('accVars: ', accVars)

    # Automatically derived features; prefix the columns so they do not
    # collide with the manual columns of the same name.
    autoDF = _load_timing_features(
        os.path.join(basepath, '05_automatic_fluency_features', asrSystem)
    )
    autoDF.columns = [asrSystem + '_' + col for col in autoDF.columns]

    # Side-by-side automatic and manual features, aligned on the index.
    # Rows with missing values are deliberately kept at this stage.
    combiDF = pd.concat([autoDF, manDF], axis=1)
    combiDF['corpus'] = corpus
    print('Length combiDF:', len(combiDF))
    combiDF_list.append(combiDF)

# Stack the per-corpus tables into one long table.
combiDF_final = pd.concat(combiDF_list)


In [None]:
# Pearson correlation between every automatic feature and its manual
# counterpart, computed on the pooled data ('total') and per corpus.
asrSystem = 'whispert_vad_dis'
corrDF_list = []

for corpus in ['total', 'JASMIN-NL', 'JASMIN-VL', 'SERDA-NL']:
    outputList = []

    # 'total' means all corpora pooled; otherwise restrict to one corpus.
    if corpus == 'total':
        selDF = combiDF_final
    else:
        selDF = combiDF_final[combiDF_final['corpus'] == corpus]
    print(corpus, len(selDF))

    for accVar in accVars:
        # Select only the automatic and manual variable
        autoVar = asrSystem + '_' + accVar
        manVar = accVar
        # Zeros are treated as missing; a row is dropped when either the
        # automatic or the manual value is missing.
        varDF = selDF.loc[:, [autoVar, manVar]].replace(0, np.nan).dropna()
        length = len(varDF)
        if length < 2:
            # pearsonr raises ValueError for fewer than two samples; record
            # NaN instead so one sparse variable does not abort the table.
            corr = np.nan
        else:
            corr = stats.pearsonr(varDF[autoVar], varDF[manVar])[0]
        outputList.append([accVar, length, corr])

    # One frame per corpus: sample size in 'N', correlation in a column
    # named after the corpus.
    corrDF = pd.DataFrame(outputList, columns=['var', 'N', corpus]).set_index('var')
    print(len(corrDF))
    corrDF_list.append(corrDF)

# NOTE: every per-corpus frame contributes its own 'N' column, so the
# combined table contains several columns that are all named 'N'.
outputCorrDF = pd.concat(corrDF_list, axis=1)
outputCorrDF

In [None]:
# Display the combined per-corpus correlation table.
outputCorrDF