In [1]:
import os
import re
import glob
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as grid_spec
import matplotlib.colors as mcolors
%matplotlib inline

In [4]:
import umap

In [None]:
# import hypertools as hyp

In [5]:
import warnings
# Silence every FutureWarning for the rest of the session (applies to all libraries).
warnings.filterwarnings("ignore", category=FutureWarning)

In [82]:
# Folder that holds the per-participant regression result tables.
data_folder = os.path.join(
    "..", "output", "psycholinguistic", "Chang_Naming", "individual"
)

# Each file-name fragment is picked by position from its tuple of alternatives;
# change the trailing index to switch variant ("" leaves the fragment out).
mdl, zscored, normed, rmout = (
    ("CSR", "GLM")[1],
    ("", " (z-scored)")[1],
    ("", " (normed)")[0],
    ("", " (out-rm)")[1],
)

# `subj` opens a parenthesis that `seed` closes, so the two are switched together.
subj, seed = ("", " (sub-5102")[0], ("", "_6180)")[0]

# Assemble the CSV file name from the selected fragments and echo it.
data_name = "".join(
    [mdl, "_regression_results", zscored, normed, rmout, subj, seed, ".csv"]
)
print(data_name)

GLM_regression_results (z-scored) (out-rm).csv


In [83]:
# Read the regression results selected by `data_name` from `data_folder`.
data_path = os.path.join(data_folder, data_name)
data = pd.read_csv(data_path)
# Show which columns the table provides.
print(data.columns)

Index(['SID', 'X0', 'LogCF', 'NS', 'CON', 'PC', 'SC', 'SAR', 'IMG', 'AoA',
       'R_squared', 'Adjusted_R2', 'LogLik', 'AIC', 'AICc', 'BIC', 'NRMSE'],
      dtype='object')


## New

In [84]:
# Summary statistics kept for the report, in display order.
summary_rows = ['min', 'max', 'mean', 'std']

# Describe every column except the first one, then render each statistic as
# text with three decimals.
numeric_summary = data.iloc[:, 1:].describe().astype('float')
summary_as_text = numeric_summary.map(lambda x: f"{x:.3f}")

# One row per column of `data`, one column per statistic.
data_desc = summary_as_text.loc[summary_rows, :].T

# Export once; an already existing workbook is left untouched.
fp = os.path.join(data_folder, "[summ] " + data_name.replace('.csv', '.xlsx'))
if not os.path.exists(fp):
    data_desc.to_excel(fp)

In [None]:
# Scratch check; evaluates to 45.0.
# NOTE(review): presumably the term count of a full second-order model with 8 predictors
# (1 constant + 8 linear + 8 quadratic + 28 interactions) — confirm.
(8**2 + 8*3 + 2) / 2

In [9]:
def classify(term):
    """Assign a regression term name to one of four categories.

    Rules are checked in order:
      * exactly ``"X0"``                               -> ``"constant"``
      * contains ``"^2"``                              -> ``"quadratic"``
      * starts with ``"F"`` and has another ``"F"``    -> ``"interaction"``
      * anything else                                  -> ``"linear"``
    """
    if term == "X0":
        return "constant"
    if "^2" in term:
        return "quadratic"
    has_two_factors = term.startswith("F") and "F" in term[1:]
    return "interaction" if has_two_factors else "linear"

In [85]:
# Rows of `data_desc` that hold fit statistics rather than regression coefficients.
# Selecting "everything else" avoids a hard-coded row count: the previous `.iloc[:8]`
# stopped one row short of the nine coefficient rows (X0 + 8 predictors) and so left
# the last predictor out of the chart.
# NOTE(review): assumes the CSR result file uses the same fit-statistic column names — confirm.
fit_stat_rows = [
    "R_squared", "Adjusted_R2", "LogLik", "AIC", "AICc", "BIC", "NRMSE", "nT",
]
coef_means = data_desc["mean"].drop(index=fit_stat_rows, errors="ignore")

# One row per term: its mean coefficient (still the formatted text from `data_desc`),
# the magnitude used as wedge size, and the term category from `classify`.
DF = pd.DataFrame(list(coef_means.items()), columns=["term", "coef"])
DF["abs_coef"] = DF["coef"].astype(float).abs()
DF["category"] = DF["term"].apply(classify)

In [92]:
# Categories shown in the chart, in drawing order.
cat_order = ["constant", "linear"]
# cat_order = ["constant", "linear", "quadratic", "interaction"]
DF["category"] = pd.Categorical(DF["category"], categories=cat_order, ordered=True)

# The outer ring is drawn in row order while the inner ring and the colour shades
# follow `cat_order`. Group the rows by category (stable, so terms keep their
# relative order) so every outer wedge sits over its own category wedge.
DF = DF.sort_values("category", kind="stable").reset_index(drop=True)

In [93]:
# Inner ring: summed |coefficient| per category, arranged in `cat_order`.
abs_by_category = DF.groupby("category", sort=False)["abs_coef"].sum()
inner = abs_by_category.reindex(cat_order)

# Outer ring: one weight per term, in row order.
outer_weights = DF["abs_coef"].to_numpy()

In [94]:
# Summed |coefficient| per category. `observed=False` keeps every declared category
# (empty ones sum to 0) regardless of the pandas default, and the reindex fixes the
# order to `cat_order`, which the wedge colours and labels are built from.
inner = (
    DF.groupby("category", observed=False)["abs_coef"]
    .sum()
    .reindex(cat_order)
)
# Independent copy of the per-term table.
outer = DF.copy()

In [95]:
# Base colour (hex) for each term category; lighter shades of these are derived per term.
base_colors = {
    "constant": "#C44E52",    # red
    "linear": "#55A868",      # green
    "quadratic": "#4C72B0",   # blue
    "interaction": "#8172B2"  # purple
}

def lighten(hex_color, amount):
    """Blend a colour towards white.

    Parameters
    ----------
    hex_color : colour accepted by ``matplotlib.colors.to_rgb``
    amount : fraction of the distance to white to move each channel
        (0 leaves the colour unchanged, 1 gives white).

    Returns
    -------
    tuple
        The three blended RGB channel values.
    """
    channels = np.array(mcolors.to_rgb(hex_color))
    gap_to_white = 1 - channels
    return tuple(channels + gap_to_white * amount)

# How many terms fall in each category, in `cat_order`.
terms_per_category = [(cat, int((DF["category"] == cat).sum())) for cat in cat_order]

# Outer ring: within a category the terms get shades of the base colour, lightened
# by evenly spaced amounts from 0.10 to 0.65. Empty categories contribute nothing.
outer_colors = [
    lighten(base_colors[cat], shade)
    for cat, n_terms in terms_per_category
    for shade in np.linspace(0.10, 0.65, n_terms)
]

# Inner ring: the unmodified base colour of each category.
inner_colors = list(map(base_colors.__getitem__, cat_order))

In [96]:
# Share of the total |coefficient| mass carried by each category.
inner_total = inner.sum()
inner_shares = inner / inner_total

# Inner wedge label: category name with its percentage on a second line.
inner_labels = ["{}\n({:.1%})".format(c, inner_shares.loc[c]) for c in cat_order]

# Legend entry per term: the term name followed by its mean coefficient.
outer_legend_labels = [
    f"{term} ({coef})" for term, coef in zip(DF["term"], DF["coef"])
]

In [97]:
fig, ax = plt.subplots(figsize=(6, 6), dpi=200)

# Both rings start at 12 o'clock, run clockwise and use thin white separators.
ring_layout = dict(startangle=90, counterclock=False)
white_edge = dict(edgecolor="white", linewidth=1)

# Outer ring: one unlabelled wedge per term.
wedges_outer, _ = ax.pie(
    outer_weights,
    radius=1,
    colors=outer_colors,
    labels=None,
    wedgeprops=dict(width=0.3, **white_edge),
    **ring_layout,
)

# Inner disc: one wedge per category, labelled inside the wedge.
wedges_inner, _ = ax.pie(
    inner.values,
    radius=0.7,
    colors=inner_colors,
    labels=inner_labels,
    labeldistance=0.4,
    textprops=dict(fontsize=16),
    wedgeprops=dict(width=0.7, **white_edge),
    **ring_layout,
)

ax.set_aspect("equal")
# Let the axes extend past the canvas edges so the chart fills the image.
fig.subplots_adjust(left=-0.1, right=1.1, top=1.1, bottom=-0.1)

pie_path = os.path.join("..", "figures", "[pie] " + data_name.replace('.csv', '.png'))
fig.savefig(pie_path)
plt.close(fig)

In [81]:
# Stand-alone legend image: an empty, axis-less canvas that only carries the
# legend for the outer-ring wedges.
fig, ax = plt.subplots(figsize=(12, 4), dpi=200)
ax.axis("off")

legend_style = dict(
    loc="center",
    ncol=5,
    frameon=False,
    fontsize=16,
    handlelength=1.5,
    columnspacing=1.5,
    labelspacing=1.0,
)
ax.legend(wedges_outer, outer_legend_labels, **legend_style)

# Use the whole canvas for the legend.
fig.subplots_adjust(left=0, right=1, top=1, bottom=0)

legend_path = os.path.join(
    "..", "figures", "[pie-legend] " + data_name.replace('.csv', '.png')
)
fig.savefig(legend_path)
plt.close(fig)

## Descriptive statistics (table)

In [None]:
# NOTE(review): `author` and `task` are not defined in any cell shown in this notebook;
# they must already exist in the kernel for this cell to run.
rawdata_folder = os.path.join("Data_Linguistic", author, task)
fn_regex = "zscored_sub_*.xlsx" if zscored == " (z-scored)" else "sub_*.xlsx"
data_paths = glob.glob(os.path.join(rawdata_folder, fn_regex))

# Number of rows in each participant workbook, keyed by the integer that ends the
# file name (the digits right before ".xlsx").
trial_counts = {}
for fp in data_paths:
    # Raw string with an escaped dot: the old pattern '([\d]*).xlsx' used an invalid
    # escape in a normal string and let "." match any character.
    id_match = re.search(r"(\d+)\.xlsx$", os.path.basename(fp))
    if id_match is None:
        raise ValueError(f"no participant number in file name: {fp}")
    trial_counts[int(id_match.group(1))] = len(pd.read_excel(fp))

# Attach the counts as column `nT` (NaN where no workbook was found) without turning
# SID into the index: later cells select 'SID' as a column and slice columns by
# position, both of which an in-place set_index would break. Workbooks whose ID is
# not in `data` are ignored instead of appending a new row.
sid_values = data["SID"] if "SID" in data.columns else data.index.to_series()
data["nT"] = sid_values.map(trial_counts).astype(float)

In [None]:
# Columns summarised in the descriptive table: coefficients first, then fit statistics.
coef_cols = ['X0', 'LogCF', 'NS', 'CON', 'PC', 'SC', 'SAR', 'IMG', 'AoA']
fit_cols = ['R_squared', 'Adjusted_R2', 'LogLik', 'AIC', 'AICc', 'BIC', 'NRMSE']  # , 'nT'
desc_cols = coef_cols + fit_cols

# Describe the selected columns, format every statistic as text with three
# decimals, keep four statistics and put one table row per column of `data`.
described = data[desc_cols].describe().astype('float')
described_text = described.map(lambda x: f"{x:.3f}")
data_desc = described_text.loc[['min', 'max', 'mean', 'std'], :].T

print(data_desc)

In [None]:
# Workbook name: the data file name without ".csv", prefixed with "[Desc] ".
dn = data_name.replace(".csv", "")
fp = os.path.join(data_folder, "[Desc] " + dn + ".xlsx")

# Write the table once; an existing workbook is left untouched.
already_exported = os.path.exists(fp)
if not already_exported:
    data_desc.to_excel(fp)

# Figures

In [None]:
# NOTE(review): `author` and `task` are not defined in any cell shown in this notebook;
# they must already exist in the kernel for this cell to run.
fig_folder = os.path.join("Figs_Linguistic", author, task)
# Create the folder (and any missing parents); a no-op when it already exists.
os.makedirs(fig_folder, exist_ok=True)

## Histogram chart

In [None]:
# Column to plot paired with the value range covered by its 30 bins.
# NOTE(review): the NRMSE upper bound of 871 is far larger than its lower bound of .25 —
# confirm it is intended.
hist_specs = [
    ('R_squared', (0, .52)),
    ('Adjusted_R2', (-.1, .32)),
    ('NRMSE', (.25, 871)),
    ('AIC', (147, 565)),
    ('AICc', (147, 565)),
    ('BIC', (147, 565)),
    ('nT', (100, 200)),
]

for targ_col, bin_range in hist_specs:
    # `nT` does not depend on the model, so its file name omits the model tag.
    model_tag = "" if targ_col == 'nT' else f"{mdl} fitting "
    fn = f"[histplot] {model_tag}{targ_col}.png"
    out_path = os.path.join(fig_folder, fn)

    # Figures that already exist on disk are not redrawn.
    if os.path.exists(out_path):
        continue

    sns.histplot(data=data, x=targ_col, binrange=bin_range, kde=True, bins=30)

    # Vertical markers: mean in red, median in light green.
    markers = (
        (data[targ_col].mean(), "red"),
        (data[targ_col].median(), "lightgreen"),
    )
    for position, colour in markers:
        plt.axvline(position, color=colour, linestyle="-")

    plt.tight_layout()
    plt.savefig(out_path, format='png', dpi=200)
    plt.close()

## Overlapping density plots

In [None]:
dn = data_name.replace(".csv", "")
fn = f"[coef] {dn}.png"

# Coefficient columns drawn as ridges.
x_labels = ['X0', 'LogCF', 'NS', 'CON', 'PC', 'SC', 'SAR', 'IMG', 'AoA']

# SID may be an ordinary column or (after a set_index) the index of `data`;
# bring it back as a column when needed so the selection below works either way.
coef_frame = data if 'SID' in data.columns else data.reset_index()

# Long format: one row per (SID, coefficient name, value).
long_data = (coef_frame
             .loc[:, ['SID'] + x_labels]
             .melt(id_vars="SID"))

# Order the labels from the largest to the smallest mean coefficient. The ranks are
# computed once here instead of once per label inside the sort key.
mean_rank = data.loc[:, x_labels].mean().rank(ascending=False)
sorted_x_labs = sorted(x_labels, key=lambda lab: mean_rank[lab])

# One palette colour per ridge.
colors = sns.cubehelix_palette(len(x_labels), rot=-.25, light=.7)
sns.set_theme(style="white")

## Create ridgeplots in Matplotlib
## see: https://matplotlib.org/matplotblog/posts/create-ridgeplots-in-matplotlib/

# One grid row per coefficient; the rows are pulled over each other at the end.
gs = grid_spec.GridSpec(len(x_labels), 1)
fig = plt.figure(figsize=(10, 6), dpi=200)

ax_objs = []
for i, xlab in enumerate(sorted_x_labs):
    ax = fig.add_subplot(gs[i:i+1, :])
    ax_objs.append(ax)

    # density curve of this coefficient across participants
    (long_data
     .query("variable == @xlab")["value"]
     .plot.kde(ax=ax, lw=0.5))

    # read the curve back through the public Line2D API (the KDE line is the only
    # line on this fresh axes) rather than the private `_x` / `_y` attributes
    x, y = ax.get_lines()[0].get_data()

    # filling the space beneath the distribution
    ax.fill_between(x, y, color=colors[i])

    # setting uniform x and y lims
    ax.set_xlim(-0.8, 1.1) # min(long_data["value"]), max(long_data["value"])
    ax.set_ylim(0, 8.5) # print(max(y))

    # make background transparent so the rows can overlap
    ax.patch.set_alpha(0)

    # remove axis ticks and labels (x tick labels stay on the last row only)
    ax.set_ylabel('')
    ax.set_yticklabels([])
    if xlab != sorted_x_labs[-1]:
        ax.set_xticklabels([])

    # remove borders
    for s in ["top", "right", "left", "bottom"]:
        ax.spines[s].set_visible(False)

    # add labels
    ax.text(-0.8, # min(long_data["value"])
            0.5, xlab,
            fontweight="bold", fontsize=14, ha="right")

    # mark median, mean, and zero
    ax.axvline(data[xlab].median(), color="lightgreen", linestyle="-")
    ax.axvline(data[xlab].mean(), color="red", linestyle="--")
    ax.axvline(0, color="black", linestyle="-", lw=.5)

# overlapping axes objects
gs.update(hspace= -0.5)

# plt.show()
plt.savefig(os.path.join(fig_folder, fn), format='png', bbox_inches='tight')
plt.close()

print(f"saved: {fn}")

## Plot clustering

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Keep only the regression-coefficient columns. Selecting by name instead of the
# fixed slice `iloc[:, 1:46]` (sized for a 45-term model) keeps the ID, the fit
# statistics and `nT` out of the embedding when the table has fewer coefficient columns.
# NOTE(review): assumes every result file uses these fit-statistic column names — confirm.
non_coef_cols = [
    "SID", "R_squared", "Adjusted_R2", "LogLik", "AIC", "AICc", "BIC", "NRMSE", "nT",
]
coefs = data.drop(columns=non_coef_cols, errors="ignore")

In [None]:
# Standardise every column of `coefs` (StandardScaler with default settings) before
# the dimensionality reductions below.
X_scaled = StandardScaler().fit_transform(coefs)

In [None]:
# Project the standardised data onto its first two principal components.
X_pca = PCA(
    n_components=2
).fit_transform(X_scaled)

In [None]:
# Two-dimensional UMAP embedding of the standardised data, using all available cores.
# NOTE(review): no random_state is passed, so the layout is not pinned to a seed.
X_umap = umap.UMAP(
    n_neighbors=15, min_dist=0.1, n_components=2, n_jobs=-1
).fit_transform(X_scaled)

In [None]:
# Two-dimensional t-SNE embedding of the standardised data.
# The initial layout is random, so the seed is fixed to make the embedding
# reproducible from one run to the next.
X_tsne = TSNE(
    n_components=2, perplexity=12, init='random', learning_rate='auto',
    random_state=0,
).fit_transform(X_scaled)

In [None]:
# Index of the embedding to show: 0 = PCA, 1 = UMAP, 2 = t-SNE.
x = 2
X_reduced = [X_pca, X_umap, X_tsne][x]

# Strongly transparent points so dense regions show up as darker patches.
plt.scatter(
    X_reduced[:, 0], X_reduced[:, 1],
    c='r', # cmap='coolwarm',
    alpha=0.1
)
plt.title(["PCA", "UMAP", "t-SNE"][x])
# Render the figure; the previous bare `plt.plot()` drew nothing and only echoed `[]`.
plt.show()