In [27]:
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
import numpy as np

data_path = "G:\ABCD\script/trail/trail_tsne_RF"
# load data and drop the first column and the subject id
data = pd.read_csv(data_path + "/merged.csv").drop(columns=["Unnamed: 0", "src_subject_id"])
label_columns = data.columns[data.columns.str.startswith("cbcl")].tolist()

data = data[label_columns]

#out put directory
rotation = "oblimin"
output_dir = "G:/ABCD/script/trail/trail_tsne_RF/factor analysis/output/" + rotation


# EFA from previous research

In [8]:
# delete columns with low frequency (more than 99.5% of the values are 0)
low_frequency_columns = data.columns[data.apply(lambda col: (col == 0).mean() > 0.995)]
data_cleaned = data.drop(columns=low_frequency_columns)
print(f"Removed columns with low frequency: {low_frequency_columns.tolist()}")

#load corerlation matrix from polychoric correlation matrix

correlation_matrix = pd.read_csv(data_path + "/factor analysis/output/polychoric_correlation_matrix.csv", index_col=0)

# mark highly correlated pairs (r > 0.75)
high_corr_pairs = (correlation_matrix.abs() > 0.75).where(lambda x: np.triu(x, 1)).stack().index.tolist()
print(f"Highly correlated pairs (r > 0.75): {high_corr_pairs}")

Removed columns with low frequency: ['cbcl_q02_p', 'cbcl_q73_p', 'cbcl_q99_p', 'cbcl_q101_p', 'cbcl_q105_p']
Highly correlated pairs (r > 0.75): [('cbcl_q08_p', 'cbcl_q10_p'), ('cbcl_q08_p', 'cbcl_q78_p'), ('cbcl_q20_p', 'cbcl_q21_p'), ('cbcl_q21_p', 'cbcl_q106_p'), ('cbcl_q22_p', 'cbcl_q28_p'), ('cbcl_q23_p', 'cbcl_q28_p'), ('cbcl_q25_p', 'cbcl_q48_p'), ('cbcl_q53_p', 'cbcl_q55_p'), ('cbcl_q56c_p', 'cbcl_q56f_p'), ('cbcl_q57_p', 'cbcl_q97_p'), ('cbcl_q81_p', 'cbcl_q82_p')]


In [9]:
from collections import defaultdict

def find_connected_groups(pairs):
    # 建立图结构
    graph = defaultdict(set)
    for col1, col2 in pairs:
        graph[col1].add(col2)
        graph[col2].add(col1)
    
    # 深度优先搜索（DFS）找到所有连通分量
    visited = set()
    connected_groups = []

    def dfs(node, group):
        visited.add(node)
        group.add(node)
        for neighbor in graph[node]:
            if neighbor not in visited:
                dfs(neighbor, group)

    # 遍历所有节点，找到每个连通分量
    for node in graph:
        if node not in visited:
            group = set()
            dfs(node, group)
            connected_groups.append(tuple(sorted(group)))

    return connected_groups

# 使用函数
result = find_connected_groups(high_corr_pairs)
print("number of connected groups:", len(result))
print("columns for each connected group:", result)

number of connected groups: 8
columns for each connected group: [('cbcl_q08_p', 'cbcl_q10_p', 'cbcl_q78_p'), ('cbcl_q106_p', 'cbcl_q20_p', 'cbcl_q21_p'), ('cbcl_q22_p', 'cbcl_q23_p', 'cbcl_q28_p'), ('cbcl_q25_p', 'cbcl_q48_p'), ('cbcl_q53_p', 'cbcl_q55_p'), ('cbcl_q56c_p', 'cbcl_q56f_p'), ('cbcl_q57_p', 'cbcl_q97_p'), ('cbcl_q81_p', 'cbcl_q82_p')]


In [10]:
#create dataframe to store the final data
data_final = data_cleaned.copy()
for group in result:
    # calculate the average of the columns in the group
    data_final[f"avg_{'_'.join(group)}"] = data_cleaned[list(group)].mean(axis=1).round().astype(int)
    # delete the original columns
    data_final.drop(columns=list(group), inplace=True)

In [11]:
# #scale data_final
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# data_scaled = scaler.fit_transform(data_final)

# #make data_scaled to dataframe
# data_scaled = pd.DataFrame(data_scaled, columns=data_final.columns)
data_scaled = data_final

In [12]:
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
import matplotlib.pyplot as plt
import seaborn as sns


# Bartlett 和 KMO 测试
chi_square_value, p_value = calculate_bartlett_sphericity(data_scaled)
print(f"Bartlett's Test Chi-square: {chi_square_value}, p-value: {p_value}")
kmo_all, kmo_model = calculate_kmo(data_scaled)
print(f"KMO Test Score: {kmo_model}")

# factor analysis
fa = FactorAnalyzer(n_factors=16,  rotation= rotation, method = 'principal')
# fa = FactorAnalyzer(n_factors=16, rotation="equamax", method='principal')
# fa.fit(data_cleaned)
fa.fit(data_scaled)

# factor loadings
factor_loadings = fa.loadings_
# factor_loadings_df = pd.DataFrame(factor_loadings, columns=["Factor 1", "Factor 2", "Factor 3", "Factor 4", "Factor 5", "Factor 6"])
factor_loadings_df = pd.DataFrame(factor_loadings, columns=[f"Factor {i}" for i in range(1, 17)])
# factor_loadings_df = pd.DataFrame(factor_loadings, columns=["Factor 1", "Factor 2", "Factor 3", "Factor 4", "Factor 5"])
# factor_loadings_df.index = data_cleaned.columns
factor_loadings_df.index = data_final.columns
print("Factor Loadings DataFrame:\n", factor_loadings_df)

# variance explained
variance_explained = fa.get_factor_variance()
print("Variance Explained:\n", variance_explained)



Bartlett's Test Chi-square: 287595.40095527406, p-value: 0.0
KMO Test Score: 0.9628842767009993




Factor Loadings DataFrame:
                              Factor 1  Factor 2  Factor 3  Factor 4  Factor 5  \
cbcl_q01_p                   0.140634  0.123036  0.370795  0.027652 -0.066702   
cbcl_q03_p                   0.533706  0.046931  0.156962 -0.052718  0.062766   
cbcl_q04_p                   0.132776  0.025243  0.559464 -0.004890  0.067412   
cbcl_q05_p                   0.181576 -0.082961  0.199410  0.120356  0.072557   
cbcl_q06_p                  -0.012570 -0.032660 -0.012577 -0.019568  0.005859   
...                               ...       ...       ...       ...       ...   
avg_cbcl_q25_p_cbcl_q48_p   -0.021294 -0.027070  0.047929  0.076615 -0.025264   
avg_cbcl_q53_p_cbcl_q55_p   -0.060203  0.039356 -0.072492 -0.132508 -0.041524   
avg_cbcl_q56c_p_cbcl_q56f_p  0.024170  0.006781 -0.027914 -0.055772  0.671208   
avg_cbcl_q57_p_cbcl_q97_p    0.107472 -0.030714 -0.121707  0.103992  0.043553   
avg_cbcl_q81_p_cbcl_q82_p   -0.042711  0.003365  0.069821  0.005660 -0.098578   


In [13]:
variance_explained[1]

array([0.03245182, 0.01720842, 0.02686883, 0.01793263, 0.02223959,
       0.02282064, 0.01686017, 0.0254489 , 0.01819661, 0.01726673,
       0.02757612, 0.01682084, 0.02125505, 0.01760954, 0.01823394,
       0.01747789])

In [14]:
variance_explained[2][-1]

0.33626773081510103

In [15]:
for i in range(1, 17):
    # all loading values of the factor greater than 0.1
    factor_values = factor_loadings_df[f"Factor {i}"][factor_loadings_df[f"Factor {i}"] > 0.1]

    # print(f"Factor {i}:\n", factor_loadings_df[f"Factor {i}"].sort_values(ascending=False).head(20))

    # descending order
    print(f"Factor {i}:\n", factor_values.sort_values(ascending=False))

Factor 1:
 cbcl_q95_p                               0.576580
cbcl_q86_p                               0.562006
cbcl_q03_p                               0.533706
cbcl_q109_p                              0.519226
cbcl_q87_p                               0.509411
cbcl_q68_p                               0.483531
cbcl_q88_p                               0.474791
cbcl_q27_p                               0.386796
cbcl_q14_p                               0.377646
cbcl_q19_p                               0.361401
cbcl_q33_p                               0.354315
cbcl_q103_p                              0.268592
avg_cbcl_q22_p_cbcl_q23_p_cbcl_q28_p     0.265540
cbcl_q26_p                               0.244424
cbcl_q104_p                              0.235022
cbcl_q05_p                               0.181576
cbcl_q43_p                               0.178289
cbcl_q41_p                               0.166262
cbcl_q89_p                               0.155802
cbcl_q16_p                             

In [16]:
# filtered_columns = {}
result_df = pd.DataFrame()

for i in range(1, 17):
    # 筛选出符合条件的列名
    factor_values = factor_loadings_df[f"Factor {i}"][factor_loadings_df[f"Factor {i}"] > 0.1]
    # filtered_columns[f"Factor {i}"] = factor_values.index.tolist()  # 保存列名（或索引）
        # 创建一个临时数据框保存因子名、列名和加载值
    temp_df = pd.DataFrame({
        f"Factor {i} Variable": factor_values.index,     # 存储列名
        f"Factor {i} Loading": factor_values.values      # 存储加载值
    })
    
    # 将临时数据框合并到结果数据框
    result_df = pd.concat([result_df, temp_df], axis=1)


# # 保存为CSV文件
# filtered_columns_df.to_csv("filtered_factor_columns.csv", index=False)


# Edit and translate

In [30]:
import pandas as pd
from bs4 import BeautifulSoup
from googletrans import Translator
import re
import time

# 指定源语言和目标语言
translator = Translator()

# 解析 element.html 文件以获取列名和详细信息
with open("data/element.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# 创建一个字典来存储列名和对应的详细信息
column_details = {}
result_df = pd.DataFrame()

# 提取 cbcl_q 列名的正则表达式
cbcl_pattern = re.compile(r"(cbcl_q\d+[a-z]*_p)")

for i in range(1, 17):
    # 筛选出符合条件的加载值
    factor_values = factor_loadings_df[f"Factor {i}"][factor_loadings_df[f"Factor {i}"] > 0.1]
    
    original_text = []
    translated_text = []
    for column_name in factor_values.index:
        # 查找 column_name 中的所有 cbcl_q 字段
        cbcl_items = cbcl_pattern.findall(column_name)  # 提取所有符合 cbcl_qXX_p 或 cbcl_qXXh_p 格式的子串

        # 初始化存储每个 cbcl 字段详细信息的列表
        original = []
        details = []
        for cbcl_item in cbcl_items:
            # 获取每个 cbcl 字段的详细信息
            target = soup.find(lambda tag: tag.name == "td" and cbcl_item in tag.get_text(strip=True))
            if target:
                detail_info = target.find_next("td").get_text(strip=True)
                # 保存原始详细信息
                original.append(detail_info)
                
                # 翻译详细信息并添加到结果
                try:
                    translated_detail = translator.translate(detail_info, src='es', dest='en').text
                except AttributeError as e:
                    print(f"An error occurred: {e}")
                    translated_detail = detail_info
                details.append(translated_detail)
                time.sleep(0.25)

        # 将所有细节合并为单个字符串，并添加到列表中
        original_text.append("; ".join(original) if original else "N/A")
        translated_text.append("; ".join(details) if details else "N/A")

    # 创建一个临时数据框保存因子名、列名、加载值和详细信息
    temp_df = pd.DataFrame({
        f"Factor {i} Variable": factor_values.index,  # 存储列名
        f"Factor {i} Loading": factor_values.values,  # 存储加载值
        f"Factor {i} Detail": original_text,  # 映射详细信息
        f"Factor {i} Translated_Detail": translated_text  # 映射翻译后详细信息
    })

    # 按加载值降序排序
    sorted_df = temp_df.sort_values(by=f"Factor {i} Loading", ascending=False).reset_index(drop=True)
    # 将临时数据框合并到结果数据框
    result_df = pd.concat([result_df.reset_index(drop=True), sorted_df.reset_index(drop=True)], axis=1)

# 保存结果为CSV文件
result_df.to_csv(output_dir + "/interpretable_information_EN.csv", index=False)


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from googletrans import Translator
import re
import time

# 指定源语言和目标语言
translator = Translator()

# 解析 element.html 文件以获取列名和详细信息
with open("data/element.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# 创建一个字典来存储列名和对应的详细信息
column_details = {}
result_df = pd.DataFrame()

# 提取 cbcl_q 列名的正则表达式
cbcl_pattern = re.compile(r"(cbcl_q\d+[a-z]*_p)")

for i in range(1, 17):
    # 筛选出符合条件的加载值
    factor_values = factor_loadings_df[f"Factor {i}"][factor_loadings_df[f"Factor {i}"] > 0.1]
    
    original_text = []
    translated_text = []
    for column_name in factor_values.index:
        # 查找 column_name 中的所有 cbcl_q 字段
        cbcl_items = cbcl_pattern.findall(column_name)  # 提取所有符合 cbcl_qXX_p 或 cbcl_qXXh_p 格式的子串

        # 初始化存储每个 cbcl 字段详细信息的列表
        original = []
        details = []
        for cbcl_item in cbcl_items:
            # 获取每个 cbcl 字段的详细信息
            target = soup.find(lambda tag: tag.name == "td" and cbcl_item in tag.get_text(strip=True))
            if target:
                detail_info = target.find_next("td").get_text(strip=True)
                # 保存原始详细信息
                original.append(detail_info)
                
                # 翻译详细信息并添加到结果
                try:
                    translated_detail = translator.translate(detail_info, src='es', dest='zh-cn').text
                except AttributeError as e:
                    print(f"An error occurred: {e}")
                    translated_detail = detail_info
                details.append(translated_detail)
                time.sleep(0.25)

        # 将所有细节合并为单个字符串，并添加到列表中
        original_text.append("; ".join(original) if original else "N/A")
        translated_text.append("; ".join(details) if details else "N/A")

    # 创建一个临时数据框保存因子名、列名、加载值和详细信息
    temp_df = pd.DataFrame({
        f"Factor {i} Variable": factor_values.index,  # 存储列名
        f"Factor {i} Loading": factor_values.values,  # 存储加载值
        f"Factor {i} Detail": original_text,  # 映射详细信息
        f"Factor {i} Translated_Detail": translated_text  # 映射翻译后详细信息
    })

    # 按加载值降序排序
    sorted_df = temp_df.sort_values(by=f"Factor {i} Loading", ascending=False).reset_index(drop=True)
    # 将临时数据框合并到结果数据框
    result_df_CN = pd.concat([result_df.reset_index(drop=True), sorted_df.reset_index(drop=True)], axis=1)

# 保存结果为CSV文件
result_df_CN.to_csv(output_dir + "/interpretable_information_CN.csv", index=False)


In [32]:
EN_pattern = re.compile(r"Factor \d+ Translated_Detail")
selected_columns = [col for col in result_df.columns if EN_pattern.match(col)]

result_df[selected_columns].to_csv(output_dir + "/details.csv", index=False)

# Generate labels with the factors

In [19]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)
combined_label_cluster = kmeans.fit_predict(factor_scores_df[['Factor1', 'Factor2', 'Factor3']])


NameError: name 'factor_scores_df' is not defined