In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as grid_spec
%matplotlib inline

In [4]:
# --- Dataset selection: change the trailing index to pick another option ---
author = ("Chang_et_al", "Tse_et_al")[0]
task = ("Naming", "LD")[0]

# --- Regression variant: model name plus optional file-name suffixes ---
mdl = ("CSR", "GLM")[1]
zscored = ("", " (z-scored)")[1]
normed = ("", " (normed)")[0]

# Results file location: Output_Linguistic/<author>/<task>/<file name>
data_folder = os.path.join("Output_Linguistic", author, task)
data_name = f"{mdl}_regression_results{zscored}{normed}.csv"
data_path = os.path.join(data_folder, data_name)

# Load the regression results table and list its columns as a sanity check.
data = pd.read_csv(data_path)

print(data.columns)

Index(['SID', 'X0', 'LogCF', 'NS', 'CON', 'PC', 'SC', 'SAR', 'IMG', 'AoA',
       'R_squared', 'Adjusted_R2', 'LogLik', 'AIC', 'AICc', 'BIC', 'NRMSE'],
      dtype='object')


In [5]:
# Columns to summarise: the regression coefficients followed by the fit metrics.
desc_cols = [
'X0', 'LogCF', 'NS', 'CON', 'PC', 'SC', 'SAR', 'IMG', 'AoA',
'R_squared', 'Adjusted_R2', 'LogLik', 'AIC', 'AICc', 'BIC', 'NRMSE'
]

# Build a (column x statistic) table of min / max / mean / std, with every
# value rendered as a string with three decimals.
# The formatting goes through a column-wise Series.map instead of
# DataFrame.applymap, which is deprecated since pandas 2.1 (FutureWarning);
# Series.map behaves the same on old and new pandas versions.
data_desc = (data[desc_cols].describe()
             .astype('float')
             .loc[['min', 'max', 'mean', 'std'], :]  # keep only the reported statistics
             .apply(lambda col: col.map(lambda x: f"{x:.3f}"))
             .T)

print(data_desc)

                  min      max      mean     std
X0             -0.352    1.075     0.275   0.281
LogCF          -0.241    0.064    -0.090   0.069
NS             -0.114    0.173     0.019   0.049
CON            -0.206    0.148    -0.042   0.057
PC             -0.116    0.224     0.002   0.053
SC             -0.126    0.141     0.007   0.047
SAR            -0.204    0.141    -0.038   0.056
IMG            -0.178    0.088    -0.059   0.054
AoA            -0.143    0.173     0.036   0.056
R_squared       0.032    0.271     0.127   0.052
Adjusted_R2    -0.022    0.230     0.075   0.055
LogLik       -191.663  -64.529  -124.850  20.338
AIC           147.057  401.326   267.700  40.676
AICc          148.211  402.331   268.902  40.628
BIC           175.065  430.501   295.399  41.019
NRMSE           0.335  870.785    32.239  83.492


In [6]:
# dn = data_name.replace(".csv", "")
# fp = os.path.join(data_folder, f"[Desc] {dn}.xlsx")

# if not os.path.exists(fp):
#     data_desc.to_excel(fp)

In [7]:
# Output directory for the figures: Figs_Linguistic/<author>/<task>.
fig_folder = os.path.join("Figs_Linguistic", author, task)
# exist_ok=True creates the folder only when it is missing, without the
# separate existence check (and the gap between checking and creating).
os.makedirs(fig_folder, exist_ok=True)

In [8]:
# Histogram bin range used for each fit metric, keyed by column name.
metric_bin_ranges = {
    'R_squared': (0, .52),
    'Adjusted_R2': (-.1, .32),
    'NRMSE': (.25, 871),
    'AIC': (147, 565),
    'AICc': (147, 565),
    'BIC': (147, 565),
}

# One histogram (with KDE overlay) per metric, saved as a PNG in fig_folder.
for targ_col, bin_range in metric_bin_ranges.items():
    fn = f"[histplot] {mdl} fitting {targ_col}.png"
    sns.histplot(data=data, x=targ_col, bins=30, binrange=bin_range, kde=True)

    # Vertical markers: mean in red, median in light green.
    markers = (
        (data[targ_col].mean(), "red"),
        (data[targ_col].median(), "lightgreen"),
    )
    for position, line_color in markers:
        plt.axvline(position, color=line_color, linestyle="-")

    plt.tight_layout()
    plt.savefig(os.path.join(fig_folder, fn), format='png', dpi=200)
    # Close the current figure so the next metric starts on a fresh one.
    plt.close()

In [9]:
# Ridge plot of the regression coefficients: one KDE curve per predictor,
# stacked vertically and ordered from the largest to the smallest mean.
dn = data_name.replace(".csv", "")
fn = f"[coef] {dn}.png"

x_labels = ['X0', 'LogCF', 'NS', 'CON', 'PC', 'SC', 'SAR', 'IMG', 'AoA']

# Long format: one row per (SID, variable) pair with the coefficient in "value".
long_data = (data
             .loc[:, ['SID'] + x_labels]
             .melt(id_vars="SID"))

# Rank the predictors by mean coefficient once (rank 1 = largest mean) and
# reuse the ranks as the sort key, instead of recomputing the means and ranks
# for every label inside the key function.
mean_ranks = data.loc[:, x_labels].mean().rank(ascending=False)
sorted_x_labs = sorted(x_labels, key=lambda x: mean_ranks[x])

# One colour per ridge; tied to the number of predictors so the palette
# cannot run out if x_labels is edited.
colors = sns.cubehelix_palette(len(x_labels), rot=-.25, light=.7)
sns.set_theme(style="white")

# Axis limits shared by every ridge so the densities are comparable.
# These are hand-picked values; the data-driven alternatives would be
# min(long_data["value"]) / max(long_data["value"]) for x and max(y) for y.
x_limits = (-0.8, 1.1)
y_limits = (0, 8.5)

## Create ridgeplots in Matplotlib
## see: https://matplotlib.org/matplotblog/posts/create-ridgeplots-in-matplotlib/

gs = grid_spec.GridSpec(len(x_labels), 1)
fig = plt.figure(figsize=(10, 6), dpi=200)

ax_objs = []
for i, xlab in enumerate(sorted_x_labs):
    ax = fig.add_subplot(gs[i:i+1, :])
    ax_objs.append(ax)

    # draw the density curve of this predictor's coefficients
    (long_data
     .query("variable == @xlab")["value"]
     .plot.kde(ax=ax, lw=0.5))

    # grab the curve through the public Line2D API rather than the private
    # _x / _y attributes; the KDE line is the only line on this fresh axes
    x, y = ax.lines[0].get_data()

    # filling the space beneath the distribution
    ax.fill_between(x, y, color=colors[i])

    # setting uniform x and y lims
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)

    # make background transparent so overlapping ridges stay visible
    ax.patch.set_alpha(0)

    # remove axis ticks and labels (x tick labels are kept on the last ridge only)
    ax.set_ylabel('')
    ax.set_yticklabels([])
    if xlab != sorted_x_labs[-1]:
        ax.set_xticklabels([])

    # remove borders
    for s in ["top", "right", "left", "bottom"]:
        ax.spines[s].set_visible(False)

    # add the predictor name, right-aligned at the left x limit
    ax.text(x_limits[0], 0.5, xlab,
            fontweight="bold", fontsize=14, ha="right")

    # mark median (light green), mean (red, dashed), and zero (thin black)
    ax.axvline(
        data[xlab].median(), color="lightgreen", linestyle="-"
    )
    ax.axvline(
        data[xlab].mean(), color="red", linestyle="--"
    )
    ax.axvline(
        0, color="black", linestyle="-", lw=.5
    )

# overlapping axes objects
gs.update(hspace= -0.5)

# save through the explicit figure handle, then release it
fig.savefig(os.path.join(fig_folder, fn), format='png', bbox_inches='tight')
plt.close(fig)

print(f"saved: {fn}")

saved: [coef] GLM_regression_results (z-scored).png
