In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

## Setup path and load data

In [4]:
# --- Analysis options -------------------------------------------------
# Every option lists its allowed values; the trailing index selects one.
author = ["Chang_et_al", "Tse_et_al"][0]
task = ["Naming", "LD"][0]

# Filename suffixes identifying which variant of the results to load.
zscored = ["", " (z-scored)"][1]
normed = ["", " (normed)"][0]

# CSR-change run to analyse; None selects the plain regression results.
iter_ver = ["inward", "uniform", "uniform_3sd", None][2]

# The two predictors whose CSR-change file is loaded.
F1 = "LogCF"
F2 = ["CON", "IMG"][0]

# --- Resolve the input CSV --------------------------------------------
# All results live under Output_Linguistic/<author>/<task>; CSR-change runs
# add one more folder level and use a predictor-pair file name.
data_folder = os.path.join("Output_Linguistic", author, task)

if iter_ver is not None:
    data_folder = os.path.join(data_folder, f"CSR_change_{iter_ver}")
    data_name = f"{F1} & {F2}{zscored}{normed}.csv"
else:
    data_name = f"regression_results{zscored}{normed}.csv"

data_path = os.path.join(data_folder, data_name)

# --- Sanity check: show what was resolved and whether the file exists --
print(data_folder, data_name, os.path.exists(data_path), sep="\n")

Output_Linguistic\Chang_et_al\Naming\CSR_change_uniform_3sd
LogCF & CON (z-scored).csv
True


In [5]:
# Load the results table selected by the configuration cell.
data = pd.read_csv(data_path)

# Show the column layout: some later cells select columns by position
# (e.g. `iloc[:, 5:10]`), so the order printed here matters.
print(data.columns)

Index(['SID', 'Iter', 'nT_fit', 'nT_test', 'X0', 'F1', 'F2', 'F1^2', 'F2^2',
       'F1F2', 'R^2', 'NRMSE', 'Test_R^2', 'Test_NRMSE'],
      dtype='object')


## Descriptive statistics

In [6]:
# Bookkeeping columns (subject ID, iteration index, trial counts) are not summarised.
exclude_cols = ['SID', 'Iter', 'nT_fit', 'nT_test']
desc_cols = [col for col in data.columns if col not in exclude_cols]

# Build a (variable x statistic) table whose cells are 3-decimal strings,
# ready for printing and for the Excel export in the next cell.
# `DataFrame.applymap` is deprecated since pandas 2.1; a column-wise
# `Series.map` performs the same element-wise formatting on old and new pandas.
data_desc = (
    data[desc_cols]
    .describe()
    .loc[['min', 'max', 'mean', 'std'], :]  # keep only the statistics of interest
    .T                                      # one row per variable
    .astype('float')
    .apply(lambda col: col.map(lambda x: f"{x:.3f}"))
)

print(data_desc)

                       min         max        mean          std
X0                -112.504    1160.080      -0.017       13.784
F1                -347.705    1930.048       0.053       22.150
F2                -208.592    5098.078       0.676       54.954
F1^2             -1416.045     174.547      -0.136       16.063
F2^2             -4467.947     456.029      -0.452       48.272
F1F2             -1486.800     258.089      -0.330       18.654
R^2                  0.043       1.000       0.853        0.185
NRMSE                0.000    4802.646       2.477       54.782
Test_R^2    -170489388.905       0.003  -22406.278  1820032.740
Test_NRMSE           0.274  115775.486      55.533     1455.187


In [7]:
# Name the summary workbook after the source CSV (extension stripped) and
# store it next to the data.
dn = data_name.replace(".csv", "")
fp = os.path.join(data_folder, f"[Desc] {dn}.xlsx")

# An existing workbook is never overwritten; delete the file to regenerate it.
if not os.path.exists(fp):
    data_desc.to_excel(fp)

## Plot CSR change (per subject)

In [8]:
# Per-subject view of the results; the plotting cells fetch one subject at a time.
grouped_data = data.groupby("SID")

# Sorted unique IDs give a reproducible processing order across runs
# (the iteration order of a plain `set` is an implementation detail).
subj_list = sorted(data["SID"].unique().tolist())
print(len(subj_list))

320


In [9]:
## coefficients:

# One PNG per subject: a line per coefficient column across that subject's rows.
# Only runs for the "uniform" CSR-change variants.
if iter_ver in ("uniform", "uniform_3sd"):

    coef_dir = os.path.join(
        data_folder, f"{F1}_x_{F2}{zscored}{normed} coefficients")
    if not os.path.exists(coef_dir):
        os.mkdir(coef_dir)

    for sid in subj_list:
        png_path = os.path.join(coef_dir, f"sub-{sid}.png")

        # Figures already on disk are never regenerated.
        if os.path.exists(png_path):
            continue

        # Column positions 5-9; in the schema printed at load time these are
        # F1, F2, F1^2, F2^2 and F1F2 (the intercept X0 at position 4 is left out).
        coef_rows = grouped_data.get_group(sid).iloc[:, 5:10]
        x_positions = range(1, len(coef_rows) + 1)

        fig, ax = plt.subplots(figsize=(8, 4), dpi=200)

        for coef_name, coef_vals in coef_rows.items():
            ax.plot(x_positions, coef_vals, marker='.', label=coef_name, alpha=.5)

        ax.grid(True)
        ax.set_xticks(x_positions)
        ax.set_ylabel('values', size=14)
        ax.legend(loc='best')
        ax.set_title(f"F1 = {F1}, F2 = {F2}")
        fig.suptitle(f"Subj {sid}", position=(0.05, 0.9), ha='left', size=16)

        fig.tight_layout()
        fig.savefig(png_path, format='png')

        # Close explicitly so hundreds of figures do not accumulate in memory.
        plt.close(fig)

In [10]:
## fitting performance:

def _plot_metric_panel(ax, x, metrics, title, xlabel=None):
    """Draw one line per column of `metrics` on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    x : sequence of x positions, one per row of `metrics`.
    metrics : DataFrame whose columns are plotted as separate labelled lines.
    title : title for the panel.
    xlabel : optional x-axis label; omitted when None.
    """
    for name, values in metrics.items():
        ax.plot(x, values, marker='.', label=name, alpha=.5)
    ax.grid(True)
    ax.set_xticks(x)
    if xlabel is not None:
        ax.set_xlabel(xlabel, size=10)
    ax.legend(loc='best')
    ax.set_title(title)


# One two-panel PNG per subject: fit metrics on top, held-out metrics below.
# Only runs for the "uniform" CSR-change variants.
if iter_ver in ["uniform", "uniform_3sd"]:

    fig_folder = f"{F1}_x_{F2}{zscored}{normed} fitting performance"
    # Named `fig_dir` (not `fp`) so it cannot be confused with the Excel path
    # that an earlier cell stores in `fp`.
    fig_dir = os.path.join(data_folder, fig_folder)
    os.makedirs(fig_dir, exist_ok=True)

    for sid in subj_list:
        out_path = os.path.join(fig_dir, f"sub-{sid}.png")

        # Figures already on disk are never regenerated.
        if os.path.exists(out_path):
            continue

        sub_data = grouped_data.get_group(sid)
        x_labels = range(1, len(sub_data) + 1)

        # Column positions 2 and 3 are nT_fit and nT_test in the schema
        # printed at load time: trials used for fitting / for testing.
        avg_nT_1 = round(sub_data.iloc[:, 2].mean(), 2)
        avg_nT_2 = round(sub_data.iloc[:, 3].mean(), 2)

        fig = plt.figure(figsize=(8, 8), dpi=200)

        _plot_metric_panel(
            fig.add_subplot(211), x_labels,
            sub_data.loc[:, ['R^2', 'NRMSE']],
            f"Fit and evaluate on the same part of data (on average, {avg_nT_1} trials)",
            xlabel='number of subset, sampling without replacement',
        )
        _plot_metric_panel(
            fig.add_subplot(212), x_labels,
            sub_data.loc[:, ['Test_R^2', 'Test_NRMSE']],
            # "trials" (plural), consistent with the upper panel's title.
            f"Evaluate on all the other parts of data (on average, {avg_nT_2} trials)",
        )

        fig.suptitle(f"Subj {sid}", position=(0.05, 0.95), ha='left', size=16)
        fig.tight_layout()

        fig.savefig(out_path, format='png')
        # Close explicitly so hundreds of figures do not accumulate in memory.
        plt.close(fig)

In [11]:
## together:

# Set to True to regenerate figures that already exist on disk.
overwrite = False # True

# One three-panel PNG per subject (coefficients | fit metrics | test metrics).
# Only runs for the "inward" CSR-change variant.
if iter_ver == "inward":

    # Plot only the first `limit` rows per subject; None plots every row.
    limit = [6, None][1]

    # Include `zscored` in the folder name, as the other plotting cells do.
    # Without it, z-scored and raw runs share one folder and the exists-check
    # below silently skips the second run's figures.
    folder_name = f"{F1}_x_{F2}{zscored}{normed}"
    if limit is not None:
        folder_name += f" # up to {limit}"
    out_folder = os.path.join(data_folder, folder_name)
    os.makedirs(out_folder, exist_ok=True)

    # ------

    for sid in subj_list:
        out_path = os.path.join(out_folder, f"sub-{sid}.png")

        if os.path.exists(out_path) and not overwrite:
            continue

        # `.iloc[:None]` is the whole frame, so no separate unlimited branch is needed.
        sub_data = grouped_data.get_group(sid).iloc[:limit]
        x_labels = range(1, len(sub_data) + 1)

        # (columns to plot, panel title, y-label keyword arguments), left to right.
        # Column positions 5-9 are F1, F2, F1^2, F2^2, F1F2 in the schema
        # printed at load time.
        panels = [
            (sub_data.iloc[:, 5:10],
             f"F1 = {F1}, F2 = {F2}",
             dict(ylabel='values', size=14)),
            (sub_data.loc[:, ['R^2', 'NRMSE']],
             "Fit and evaluate on selected trials",
             dict(ylabel='')),
            (sub_data.loc[:, ['Test_R^2', 'Test_NRMSE']],
             "Evaluate on the remaining trials",
             dict(ylabel='')),
        ]

        fig = plt.figure(figsize=(12, 4), dpi=200)

        for idx, (frame, title, ylabel_kwargs) in enumerate(panels, start=1):
            ax = fig.add_subplot(1, len(panels), idx)
            for col, vals in frame.items():
                ax.plot(x_labels, vals, marker='o', label=col)
            ax.grid(True)
            ax.set_xticks(x_labels)
            ax.set_ylabel(**ylabel_kwargs)
            ax.legend(loc='best')
            ax.set_title(title)

        # Footer listing the per-row trial counts: column positions 2 and 3 are
        # nT_fit and nT_test in the schema printed at load time.
        n_fit = ", ".join(f"{v}" for v in sub_data.iloc[:, 2])
        n_test = ", ".join(f"{v}" for v in sub_data.iloc[:, 3])
        txt = (
            f"For each iteration, {n_fit} trials are selected to fit the CSR function, "
            f"and {n_test} trials are used for testing."
        )
        fig.text(0.55, 0.01, txt, ha='center', size=11)

        fig.suptitle(f"Subj {sid}", position=(0.05, 0.95), ha='left', size=16)
        fig.tight_layout()
        # Leave room at the bottom for the footer text.
        fig.subplots_adjust(bottom=0.12)
        fig.savefig(out_path, format='png')
        # Close explicitly so hundreds of figures do not accumulate in memory.
        plt.close(fig)

## Distributions of each coefficient (strip plot)

In [12]:
# Output directory and marker styling depend on whether a CSR-change run is
# selected. Both branches must define `fd`; the regression-results branch
# used to set `out_folder` instead, so the code below raised NameError.
if iter_ver is None:
    fd = os.path.join("Figs_Linguistic", author, task)
    fig_size = (10, 6)
    ms = 8
    a = .05
else:
    fd = os.path.join("Figs_Linguistic", author, task, f"CSR_change_{iter_ver}")
    fig_size = (4, 4)
    ms = 20
    a = .5

# `makedirs` also creates missing parent folders (Figs_Linguistic/<author>/<task>),
# which a plain `mkdir` would fail on during a first run.
os.makedirs(fd, exist_ok=True)

# Column groups. The lists hold every name used across the different result
# files; names absent from the loaded CSV are simply not matched.
meta_cols = ['SID', 'Iter', 'nT_fit', 'nT_test']
eval_cols = ['R_squared', 'R^2', 'NRMSE', 'Test_minRE', 'Test_R^2', 'Test_NRMSE']

column_sets = {
    "coef": [col for col in data.columns if col not in meta_cols + eval_cols],
    "eval": [col for col in data.columns if col in eval_cols],
}

# Base name for the figures, derived once from the source CSV name.
dn = data_name.replace(".csv", "")

# One figure per column group: every row as a point, with the median (green)
# and mean (red) of each column overlaid as horizontal tick markers.
for x_type, x_labels in column_sets.items():

    fn = f"[{x_type}] {dn}.png"

    fig, ax = plt.subplots(figsize=fig_size, dpi=200)

    sns.stripplot(
        data=data.loc[:, ['SID'] + x_labels].melt(id_vars="SID"),
        x="variable", y="value", alpha=a, dodge=True, zorder=1,
        ax=ax,  # draw on this figure explicitly rather than via pyplot's current axes
    )
    ax.plot(
        x_labels, data.loc[:, x_labels].median(),
        color="lightgreen", linestyle="none",
        marker="_", markersize=ms, markeredgewidth=3, zorder=2
    )
    ax.plot(
        x_labels, data.loc[:, x_labels].mean(),
        color="red", linestyle="none",
        marker="_", markersize=ms, markeredgewidth=2, zorder=3
    )
    ax.set_xticks(range(len(x_labels)))
    ax.set_xticklabels(x_labels, rotation=90, size=14)
    ax.set_xlabel('')
    ax.set_ylabel('')
    fig.tight_layout()
    fig.savefig(os.path.join(fd, fn), format='png')
    plt.close(fig)