In [None]:
import getpass
import os
from pathlib import Path

import cx_Oracle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import NMF
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from models import Autoencoder
from utils import LassoAnalysis
from utils import translate_text

# Main 

## Preparing the data

In [None]:
# Locate the data directory. It is resolved as a sibling of the current
# working directory (<cwd>/../data), so the notebook must be started from the
# project's code directory rather than from the project root.
code_dir = Path(os.getcwd())
data_path = code_dir.parent / "data"
assert data_path.exists(), (
    f"Data directory not found at {data_path}. Make sure you're running this "
    "notebook from the project's code directory (a sibling of `data/`)."
)

# Load the questionnaire table; pandas opens and closes the file itself.
qns = pd.read_csv(data_path / "cbcl_data_remove_unrelated.csv", encoding="utf-8")

# Every column from position 2 onwards is used as a feature.
X = qns.iloc[:, 2:].values

# Fixed seed so the 60/20/20 train/validation/test split is reproducible
# across kernel restarts.
SEED = 42
X_train_raw, X_temp = train_test_split(X, test_size=0.4, random_state=SEED)
X_val_raw, X_test_raw = train_test_split(X_temp, test_size=0.5, random_state=SEED)

# Fit the scaler on the training split ONLY, then apply that same fitted
# transform everywhere else. Previously the scaler was first fitted on the
# full matrix to build X_scaled and then re-fitted on the training split, so
# X_scaled (a) leaked validation/test statistics and (b) was scaled
# differently from the data the model is trained on.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Full data set in the same scale as the training data (values outside the
# training range may fall slightly outside [0, 1]).
X_scaled = scaler.transform(X)

## Train and save the model

In [None]:
# Build the autoencoder with a 5-unit encoding and the three hidden-layer
# widths below, then train it on the train/validation splits given to the
# constructor.
autoencoder = Autoencoder(
    X_train,
    X_val,
    encoding_dim=5,
    layer1_neurons=69,
    layer2_neurons=58,
    layer3_neurons=53,
)
autoencoder.train()

# Evaluate on the full scaled matrix; evaluate_on_data yields four values that
# are unpacked in this order.
evaluation = autoencoder.evaluate_on_data(X_scaled)
(
    latent_factors,
    reconstruction_errors_test,
    explained_variance_ratios,
    explained_variance_ratio_test,
) = evaluation

In [None]:
# Save the trained model as an ONNX file. X_train is handed to the exporter
# as its first argument (presumably as the example input -- confirm against
# models.Autoencoder.export_to_onnx).
onnx_output_path = "../output/autoencoder_real_input.onnx"
autoencoder.export_to_onnx(X_train, onnx_path=onnx_output_path)

In [None]:
# Make sure both matrices really are NumPy arrays (the previous
# `latent_factors = latent_factors` was a no-op that converted nothing).
latent_factors = np.asarray(latent_factors)
X_scaled = np.asarray(X_scaled)

# Loadings: regress every scaled feature on the latent factors.
# LinearRegression accepts a 2-D target, so one multi-output fit replaces the
# per-feature loop; `coef_` then has shape (n_features, n_latent_factors),
# i.e. row i holds the coefficients the loop would have produced for feature i.
reg = LinearRegression().fit(latent_factors, X_scaled)
loadings = reg.coef_

# Transpose to (latent factors, original features).
loading_matrix = loadings.T

print("Loading matrix shape:", loading_matrix.shape)

## Draw the reconstruction error of the autoencoder model

In [None]:
# Scaled splits to compare, keyed by the label passed to the plotting helper.
datasets = dict(Train=X_train, Validation=X_val, Test=X_test)

# Draw the reconstruction errors for the splits above.
autoencoder.plot_reconstruction_errors(datasets)

## Explore the possible dimensions of the hidden layers

In [None]:
# Compare explained-variance ratios for several encoding sizes.
#
# The candidate models and their evaluation results live in loop-local names
# (`candidate`, `candidate_*`) on purpose: the earlier version re-bound the
# notebook-level `autoencoder` and `latent_factors`, so every later cell
# silently analysed the last exploratory model instead of the trained and
# exported one.
MAX_FACTORS = 10  # number of Factor_* columns reserved in the summary table
summary_columns = (
    ["dim"]
    + [f"Factor_{i+1}" for i in range(MAX_FACTORS)]
    + ["Total variance ratio"]
)

# One record (dict) per encoding size; the table is built once at the end
# instead of being grown with pd.concat inside the loop.
dim_records = []

for latent_dim in range(4, 6):
    print("dim:", latent_dim)

    # Train a fresh autoencoder with the default hidden-layer sizes.
    candidate = Autoencoder(X_train, X_val, encoding_dim=latent_dim)
    candidate.train()

    # Evaluate on the full scaled data; only the variance ratios are needed.
    (
        _candidate_latent,
        _candidate_errors,
        candidate_ratios,
        candidate_total_ratio,
    ) = candidate.evaluate_on_data(X_scaled)

    record = {"dim": latent_dim}
    record.update(
        {f"Factor_{i+1}": ratio for i, ratio in enumerate(candidate_ratios)}
    )
    record["Total variance ratio"] = candidate_total_ratio
    dim_records.append(record)

# reindex fixes the column order, adds NaN for factors a smaller model does
# not have, and drops anything beyond MAX_FACTORS (as the column selection in
# the previous implementation did).
dim_df = pd.DataFrame(dim_records).reindex(columns=summary_columns)

print(dim_df)

## Other approaches to extract factors: 

### NMF

In [None]:
# Questionnaire item columns (everything from column position 2 onwards),
# both as a DataFrame and as a plain NumPy array.
data_cleaned = qns.iloc[:, 2:]
data_cleaned_np = data_cleaned.to_numpy()

# Number of NMF components to extract.
rank = 5

# NMF decomposition: data_cleaned ~ W @ H
nmf_model = NMF(
    n_components=rank,
    init="random",
    solver="mu",
    max_iter=1000,
    random_state=42,
)
W = nmf_model.fit_transform(data_cleaned)  # basis matrix W
H = nmf_model.components_  # coefficient matrix H

# Reconstruction error as reported by the fitted model.
reconstruction_error = nmf_model.reconstruction_err_
print(f"Reconstruction error: {reconstruction_error}")

# Rebuild the data matrix from the two factors.
X_reconstructed = W @ H

# Frobenius norms of the data and of the residual, and their ratio.
X_norm = np.linalg.norm(data_cleaned, ord="fro")
reconstruction_error_frobenius = np.linalg.norm(
    data_cleaned - X_reconstructed, ord="fro"
)
relative_error = reconstruction_error_frobenius / X_norm
print(f"Relative Error: {relative_error}")

# "Explained variance" as computed here: sum of squared deviations of the
# reconstruction divided by that of the original data. Both deviations are
# taken around the single global mean of the respective matrix (no axis).
total_variance = np.square(data_cleaned_np - data_cleaned_np.mean()).sum()
reconstructed_variance = np.square(X_reconstructed - X_reconstructed.mean()).sum()
variance_explained = reconstructed_variance / total_variance
print(f"Variance explained by the NMF model: {variance_explained * 100:.2f}%")

# Attach the ID column (column position 1 of qns) to the basis matrix W.
W_with_id = pd.concat([qns.iloc[:, 1].rename("ID"), pd.DataFrame(W)], axis=1)

# Transpose of H.
H_transposed = H.T

### EFA

In [None]:
# Exploratory factor analysis: 5 factors, varimax rotation, fitted on the
# scaled item matrix.
fa = FactorAnalyzer(n_factors=5, rotation="varimax")
fa.fit(X_scaled)

# Eigenvalues returned by the fitted model (kept for inspection) and the
# rotated loadings.
ev, v = fa.get_eigenvalues()
factor_loadings = fa.loadings_

# Per-factor variance summary; the last entry of its third element is what is
# reported as the explained-variance percentage.
variance_explained = fa.get_factor_variance()
cumulative_ratio = variance_explained[2][-1]
print(f"varience explained: {cumulative_ratio:.2%}")

# Factor scores for every individual.
factor_scores = fa.transform(X_scaled)
factor_scores.shape

## Interpretability

### results from autoencoder

In [None]:
# Convert latent_factors to a NumPy array if it is a pandas DataFrame.
latent_factors = (
    latent_factors.values
    if isinstance(latent_factors, pd.DataFrame)
    else latent_factors
)
# NOTE: the regression targets here are the raw (unscaled) items X.
original_features = X if isinstance(X, np.ndarray) else X.values

n_original_features = original_features.shape[1]
n_latent_factors = latent_factors.shape[1]

# Regress every original feature on the latent factors. A single multi-output
# fit replaces the per-feature loop: `coef_` has shape
# (n_original_features, n_latent_factors), one row of coefficients per feature.
loadings = LinearRegression().fit(latent_factors, original_features).coef_

# Wrap the coefficients in a DataFrame for easier inspection.
loadings_df = pd.DataFrame(
    loadings,
    columns=[f"Latent_{j+1}" for j in range(n_latent_factors)],
    index=[f"Feature_{i+1}" for i in range(n_original_features)],
)

# Contribution of each latent factor to each original feature (analogous to
# PCA loadings): the loading matrix of the autoencoder.
print(loadings_df)

In [None]:
# Label the loading rows with the questionnaire item names.
loadings_df.index = qns.iloc[:, 2:].columns

# For every latent factor, order the items by absolute loading (largest
# first) and keep the item name next to the loading. Iterating over
# loadings_df.columns removes the dependency on `latent_dim`, which was only
# defined as a leftover loop variable of the dimension-exploration cell.
ranked_frames = []
for latent_name in loadings_df.columns:
    factor_loadings_col = loadings_df[latent_name]
    ranked_items = factor_loadings_col.abs().sort_values(ascending=False).index
    df0 = (
        factor_loadings_col.reindex(ranked_items)
        .to_frame(name=latent_name)
        .reset_index()
        .rename(columns={"index": "Row_Name"})
    )
    ranked_frames.append(df0)

# Concatenate once, side by side: Row_Name, Latent_1, Row_Name, Latent_2, ...
df = pd.concat(ranked_frames, axis=1)

# Every second column starting at 0, i.e. the item-name columns only.
df_even_columns = df.iloc[:, ::2]

details_autoencoder = translate_text(df_even_columns, 5, "en")
details_autoencoder

### results from NMF

In [None]:
# Directory holding the previously exported NMF result files. The original
# absolute location stays the default; set the NMF_OUTPUT_DIR environment
# variable to run the notebook on another machine.
NMF_OUTPUT_DIR = Path(
    os.environ.get(
        "NMF_OUTPUT_DIR",
        r"G:\ABCD\script\trail\trail_tsne_RF\factor analysis\output\NA",
    )
)

# Load the saved H matrix (first CSV column dropped) and label its rows with
# the questionnaire item names.
label = pd.read_csv(NMF_OUTPUT_DIR / "NMF_H.csv").iloc[:, 1:]
label.index = qns.iloc[:, 2:].columns

# For each of the five factors V1..V5, order the items by absolute weight
# (largest first) and keep the item name next to the weight.
nmf_frames = []
for i in range(5):
    factor_name = "V{0}".format(i + 1)
    factor_weights = label[factor_name]
    ranked_items = factor_weights.abs().sort_values(ascending=False).index
    df0 = (
        factor_weights.reindex(ranked_items)
        .to_frame(name=factor_name)
        .reset_index()
        .rename(columns={"index": "Row_Name{0}".format(i + 1)})
    )
    nmf_frames.append(df0)

# Concatenate once instead of growing the frame inside the loop.
df_NMF = pd.concat(nmf_frames, axis=1)

# All rows, every second column starting at 0 (the item-name columns).
df_NMF_even_columns = df_NMF.iloc[:, ::2]

details = translate_text(df_NMF_even_columns, 5, "en")
details

## Lasso regression model from questionnaire items to factors generated with NMF (from Toby)

In [None]:
# Directory holding the previously exported NMF result files. The original
# absolute location stays the default; set the NMF_OUTPUT_DIR environment
# variable to run the notebook on another machine.
NMF_OUTPUT_DIR = Path(
    os.environ.get(
        "NMF_OUTPUT_DIR",
        r"G:\ABCD\script\trail\trail_tsne_RF\factor analysis\output\NA",
    )
)

# Regularisation strength used for both the prediction and the heatmap plots.
LASSO_ALPHA = 0.125

# Predictors: the raw questionnaire item matrix. It gets its own name so the
# `qns` DataFrame is not overwritten by an ndarray (which made every earlier
# `qns.iloc[...]` cell fail when re-run).
lasso_items = X

# Targets: the saved NMF factor scores (first two CSV columns dropped).
scores = pd.read_csv(NMF_OUTPUT_DIR / "NMF_W.csv").iloc[:, 2:]

# Initialize the LassoAnalysis class
lasso_analysis = LassoAnalysis(lasso_items, scores)

# Perform the analysis
lasso_analysis.perform_analysis()

# Plot R² values
lasso_analysis.plot_r2_values()

# Plot factor predictions
lasso_analysis.plot_factor_predictions(alpha=LASSO_ALPHA)

# Plot heatmap of coefficients
lasso_analysis.plot_heatmap(alpha=LASSO_ALPHA)

## To be continued: 

### Complete pipeline for getting fMRI data

In [None]:
# set the path of Oracle Instant Client
os.environ["PATH"] = "C:\\oracle\\instantclient_23_7;" + os.environ["PATH"]
print(cx_Oracle.clientversion())

dsn = cx_Oracle.makedsn(
    "mindarvpc.cqahbwk3l1mb.us-east-1.rds.amazonaws.com", 1521, service_name="ORCL"
)
conn = cx_Oracle.connect(user="k21116947_1236370", password="<input>", dsn=dsn)

cursor = conn.cursor()

query = """
SELECT ENDPOINT
FROM S3_LINKS
WHERE ENDPOINT LIKE '%baseline%' AND ENDPOINT LIKE '%rsfMRI%' AND ENDPOINT LIKE '%NDARINV005V6D2C%' AND ENDPOINT LIKE '%MPROC%' 
"""
cursor.execute(query)

s3_samples = [row[0] for row in cursor.fetchall()]

cursor.close()
conn.close()

# for url in s3_samples:
#    print(url)

np.savetxt("data/s3_links.txt", s3_samples, fmt="%s")


!downloadcmd -dp 1236370 -t data/s3_links.txt -d ./data