In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training functional-connectome table into memory.
train_connectome_csv = 'data/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv'
df = pd.read_csv(train_connectome_csv)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221
1,WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,...,0.217546,-0.014549,0.00044,-0.096451,0.454501,0.343916,0.167313,0.607656,0.550623,0.503176
2,4PAQp1M6EyAo,-0.116833,0.458408,0.260703,0.639031,0.769337,0.442528,0.63711,0.19201,0.520379,...,0.342487,-0.021141,-0.037836,0.075069,0.412712,0.292708,0.391005,0.461544,0.508912,0.624232
3,obEacy4Of68I,0.199688,0.752714,0.658283,0.575096,0.692867,0.645789,0.52275,0.412188,0.530843,...,0.103562,-0.178313,0.210983,-0.018666,0.436313,0.592982,0.216205,0.341272,0.440313,0.558193
4,s7WzzDcmDOhF,0.227321,0.613268,0.621447,0.562673,0.736709,0.589813,0.266676,0.359668,0.300771,...,-0.164956,0.007064,-0.120904,-0.488095,0.493575,-0.215361,0.210685,0.05585,0.119065,0.108273


In [4]:
# Inspect the column labels: 'participant_id' followed by the
# '<i>throw_<j>thcolumn' connectivity columns.
df.columns

Index(['participant_id', '0throw_1thcolumn', '0throw_2thcolumn',
       '0throw_3thcolumn', '0throw_4thcolumn', '0throw_5thcolumn',
       '0throw_6thcolumn', '0throw_7thcolumn', '0throw_8thcolumn',
       '0throw_9thcolumn',
       ...
       '195throw_196thcolumn', '195throw_197thcolumn', '195throw_198thcolumn',
       '195throw_199thcolumn', '196throw_197thcolumn', '196throw_198thcolumn',
       '196throw_199thcolumn', '197throw_198thcolumn', '197throw_199thcolumn',
       '198throw_199thcolumn'],
      dtype='object', length=19901)

In [None]:
# Feature matrix: every column of `df` except the participant identifier.
X_fmri_raw = df.drop('participant_id', axis=1)

In [None]:
def extract_stats_features(row, strong_threshold=0.7):
    """Summarise one row of numeric values as a small set of scalar statistics.

    Intended for use with ``DataFrame.apply(extract_stats_features, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One row of numeric values.
    strong_threshold : float, default 0.7
        Entries whose absolute value is strictly greater than this cutoff are
        counted in ``strong_corr_count``.

    Returns
    -------
    pandas.Series
        Keys: ``mean_conn``, ``std_conn``, ``skew_conn``, ``kurt_conn``,
        ``percentile_25``, ``percentile_50``, ``percentile_75`` and
        ``strong_corr_count``.

    Notes
    -----
    ``std_conn`` comes from ``np.std`` (population standard deviation, ddof=0),
    while skew and kurtosis come from pandas' ``Series.skew`` / ``Series.kurt``.
    """
    vals = row.values
    # Build the Series once and reuse it for both higher-order moments.
    vals_series = pd.Series(vals)
    # One percentile call for all three quartiles instead of three separate passes.
    q25, q50, q75 = np.percentile(vals, [25, 50, 75])
    return pd.Series({
        "mean_conn": np.mean(vals),
        "std_conn": np.std(vals),
        "skew_conn": vals_series.skew(),
        "kurt_conn": vals_series.kurt(),
        "percentile_25": q25,
        "percentile_50": q50,
        "percentile_75": q75,
        "strong_corr_count": np.sum(np.abs(vals) > strong_threshold),
    })
    
# Compute the per-row summary statistics, then attach each row's participant ID
# (aligned on the shared index) as the last column.
fmri_stats = (
    X_fmri_raw
    .apply(extract_stats_features, axis=1)
    .assign(participant_id=df['participant_id'])
)

In [None]:
# Check the dimensions of the summary-statistics frame.
fmri_stats.shape

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep, defined once so the model and the
# column labels cannot disagree.
# NOTE(review): 902 is presumably the 95%-variance figure reported by the
# explained-variance cell later in this notebook — confirm.
N_PCA_COMPONENTS = 902

pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
fmri_pca = pca.fit_transform(X_fmri_raw)

# Label columns from the actual output width, and reuse the input's index so the
# participant_id assignment below aligns row-for-row whatever index `df` carries.
fmri_pca_df = pd.DataFrame(
    fmri_pca,
    columns=[f"pca_{i}" for i in range(fmri_pca.shape[1])],
    index=X_fmri_raw.index,
)
fmri_pca_df['participant_id'] = df['participant_id']

In [None]:
# Combine the summary statistics with the PCA components, keyed on participant.
# validate='one_to_one' makes pandas raise if participant_id is duplicated on
# either side, instead of silently multiplying rows in the merged output.
fmri_features_final = pd.merge(
    fmri_stats,
    fmri_pca_df,
    on='participant_id',
    validate='one_to_one',
)
fmri_features_final.shape

In [None]:
# Persist the combined feature table; the DataFrame index is not written.
processed_connectome_csv = "data/TRAIN/TRAIN_PROCESSED_FUNCTIONAL_CONNECTOME_MATRICES.csv"
fmri_features_final.to_csv(processed_connectome_csv, index=False)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

# Fraction of total variance the retained components must explain.
VARIANCE_TARGET = 0.95

# Fit PCA on raw fMRI data with all components kept, so the whole
# explained-variance curve is available.
pca_all = PCA().fit(X_fmri_raw)
explained_var = np.cumsum(pca_all.explained_variance_ratio_)

# np.argmax on an all-False mask returns 0, which would wrongly report one
# component; fail loudly if the target is never reached.
assert explained_var[-1] >= VARIANCE_TARGET, "variance target is never reached"

# Find the smallest number of components to reach the target variance.
# argmax gives the 0-based position of the first True, hence the +1.
optimal_k = np.argmax(explained_var >= VARIANCE_TARGET) + 1
print(f"Optimal n_components to retain {VARIANCE_TARGET:.0%} variance: {optimal_k}")

# Plot against 1-based component counts so the x-axis matches its label and the
# vertical marker at optimal_k lands where the curve crosses the target.
component_counts = np.arange(1, len(explained_var) + 1)
fig, ax = plt.subplots()
ax.plot(component_counts, explained_var)
ax.set_xlabel("Number of components")
ax.set_ylabel("Cumulative explained variance")
ax.grid(True)
ax.axhline(VARIANCE_TARGET, color='red', linestyle='--')
ax.axvline(optimal_k, color='green', linestyle='--', label=f"{optimal_k} components")
ax.set_title("PCA Variance Explained")
ax.legend()
plt.show()