In [1]:
import pandas as pd
import numpy as np
import torch
import os
import pickle

In [2]:
# Load the lookup table used later to map each id to its Orthogroup label.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('mapping_dict.pkl', 'rb') as mapping_file:
    mapping_dict = pickle.load(mapping_file)
print("mapping_dict已成功从mapping_dict.pkl读取")

# Load the precomputed results; later cells iterate it as a sequence of
# items whose first field is an id and whose second field is the data vector.
with open('results_10_species.pkl', 'rb') as results_file:
    results = pickle.load(results_file)
print("results_10_species.pkl读取")

mapping_dict已成功从mapping_dict.pkl读取
results_10_species.pkl读取


In [3]:
# Identifier of every entry in `results` (first field of each item).
Data_id = np.array([entry[0] for entry in results])

# Clustering input: the second field of every item (the mean_representations_nm values).
data = np.array([entry[1] for entry in results])

# Orthogroup label for each id in Data_id; ids missing from mapping_dict map to None.
mapped_ids = {gene_id: mapping_dict.get(gene_id) for gene_id in Data_id}


In [4]:
from sklearn.cluster import KMeans


In [5]:
# Clustering hyperparameter: number of clusters to form.
num_clusters = 2000

# Build the (not yet fitted) KMeans estimator with a fixed random_state.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it explicitly.
kmeans = KMeans(
    random_state=0,
    n_clusters=num_clusters,
)

In [6]:
# Fit the model on `data` and keep the cluster index assigned to each row.
# For KMeans, fit(...).labels_ is exactly what fit_predict(...) returns.
labels = kmeans.fit(data).labels_

In [7]:
# Collect the per-row columns first, then assemble the result table:
# one row per entry with its id, assigned cluster label, and Orthogroup label.
file_names = [entry[0] for entry in results]
orthogroup_labels = [mapped_ids[gene_id] for gene_id in Data_id]

clustered_results = pd.DataFrame({
    'File Name': file_names,
    'Cluster Label': labels,
    'orthogroups': orthogroup_labels,
})

In [8]:
# Display the assembled table (the recorded output shows only the first and last rows).
clustered_results

Unnamed: 0,File Name,Cluster Label,orthogroups
0,Anol_caro_LOC100337545,1472,OG0000097
1,Anol_caro_LOC100551521,1738,
2,Anol_caro_LOC100337549,144,
3,Anol_caro_LOC100551552,940,OG0004176
4,Anol_caro_LOC100551564,1951,OG0018819
...,...,...,...
272050,Taen_gutt_ZRANB3,1470,OG0024434
272051,Taen_gutt_ZNF831,302,OG0024432
272052,Taen_gutt_ZWILCH,1004,OG0029775
272053,Taen_gutt_ZSWIM1,5,OG0006861


In [27]:
# Species prefix = first two '_'-separated tokens of the file name (e.g. 'Anol_caro').
name_tokens = clustered_results['File Name'].str.split('_')
clustered_results['species'] = name_tokens.str[:2].str.join('_')


In [28]:
# Display the table again, now including the 'species' column.
clustered_results

Unnamed: 0,File Name,Cluster Label,orthogroups,species
0,Anol_caro_LOC100337545,1472,OG0000097,Anol_caro
1,Anol_caro_LOC100551521,1738,,Anol_caro
2,Anol_caro_LOC100337549,144,,Anol_caro
3,Anol_caro_LOC100551552,940,OG0004176,Anol_caro
4,Anol_caro_LOC100551564,1951,OG0018819,Anol_caro
...,...,...,...,...
272050,Taen_gutt_ZRANB3,1470,OG0024434,Taen_gutt
272051,Taen_gutt_ZNF831,302,OG0024432,Taen_gutt
272052,Taen_gutt_ZWILCH,1004,OG0029775,Taen_gutt
272053,Taen_gutt_ZSWIM1,5,OG0006861,Taen_gutt


In [29]:
# Persist the clustering table to CSV (to_csv writes the DataFrame index as the first column by default).
# The cluster count in the file name is taken from num_clusters so the name cannot drift from
# the configured value; with num_clusters = 2000 the resulting path is unchanged.
# NOTE(review): the file name says "Minibatch_kmeans" but the labels in this notebook come from
# sklearn.cluster.KMeans — confirm which is intended before renaming the file.
clustered_results.to_csv(f'Minibatch_kmeans_result_10_species_n_{num_clusters}.csv')

In [30]:
# Number of rows assigned to each cluster, largest first.
cluster_sizes = clustered_results['Cluster Label'].value_counts()
cluster_sizes

Cluster Label
1851    393
302     392
809     382
632     371
172     369
       ... 
1825      5
1902      5
1045      5
1736      4
1333      2
Name: count, Length: 2000, dtype: int64

In [31]:
# Number of rows per Orthogroup label, largest first (rows without a label are not counted).
orthogroup_sizes = clustered_results['orthogroups'].value_counts()
orthogroup_sizes

orthogroups
OG0000000    238
OG0000001    232
OG0000002    221
OG0000003    216
OG0000004    211
            ... 
OG0030756      1
OG0030669      1
OG0029014      1
OG0010980      1
OG0019964      1
Name: count, Length: 35030, dtype: int64

In [32]:
# All rows whose Orthogroup label is OG0000055.
in_og55 = clustered_results['orthogroups'] == 'OG0000055'
clustered_results.loc[in_og55]

Unnamed: 0,File Name,Cluster Label,orthogroups,species
145,Anol_caro_LOC100556725,58,OG0000055,Anol_caro
169,Anol_caro_LOC100557654,58,OG0000055,Anol_caro
268,Anol_caro_LOC100561656,122,OG0000055,Anol_caro
473,Anol_caro_LOC103277990,1253,OG0000055,Anol_caro
637,Anol_caro_LOC103280102,1178,OG0000055,Anol_caro
...,...,...,...,...
19332,Anol_caro_LOC100565347,1932,OG0000055,Anol_caro
19421,Anol_caro_LOC103277484,58,OG0000055,Anol_caro
19688,Anol_caro_LOC103281210,1885,OG0000055,Anol_caro
19949,Anol_caro_LOC107983239,1005,OG0000055,Anol_caro


In [36]:
# Species breakdown of the rows labelled OG0000055.
in_og55 = clustered_results['orthogroups'] == 'OG0000055'
clustered_results.loc[in_og55, 'species'].value_counts()

species
Anol_caro    77
Croc_poro     1
Name: count, dtype: int64

In [33]:
# How the rows labelled OG0000055 are spread across cluster labels.
in_og55 = clustered_results['orthogroups'] == 'OG0000055'
clustered_results.loc[in_og55, 'Cluster Label'].value_counts()

Cluster Label
58      21
958     16
257      6
210      6
239      4
1178     3
122      3
426      2
871      1
1885     1
1932     1
614      1
1555     1
700      1
1116     1
828      1
1573     1
248      1
1011     1
1003     1
900      1
581      1
1779     1
1253     1
1005     1
Name: count, dtype: int64

In [34]:
# All rows assigned to cluster 58.
in_cluster_58 = clustered_results['Cluster Label'] == 58
clustered_results.loc[in_cluster_58]

Unnamed: 0,File Name,Cluster Label,orthogroups,species
145,Anol_caro_LOC100556725,58,OG0000055,Anol_caro
169,Anol_caro_LOC100557654,58,OG0000055,Anol_caro
201,Anol_caro_LOC100558991,58,OG0014845,Anol_caro
484,Anol_caro_LOC103278140,58,OG0002480,Anol_caro
1013,Anol_caro_LOC107983779,58,OG0000803,Anol_caro
...,...,...,...,...
251571,Taen_gutt_LOC105760196,58,OG0001019,Taen_gutt
257017,Taen_gutt_LOC115492498,58,OG0001019,Taen_gutt
257498,Taen_gutt_LOC121468881,58,OG0000279,Taen_gutt
260210,Taen_gutt_LOC121468938,58,OG0001019,Taen_gutt


In [35]:
# Orthogroup composition of cluster 58.
in_cluster_58 = clustered_results['Cluster Label'] == 58
clustered_results.loc[in_cluster_58, 'orthogroups'].value_counts()

orthogroups
OG0000055    21
OG0000637    14
OG0000007    10
OG0000430     9
OG0000473     8
OG0000087     8
OG0002480     6
OG0001291     5
OG0001019     4
OG0000234     3
OG0002307     3
OG0000040     3
OG0001134     3
OG0000017     3
OG0003419     2
OG0013651     2
OG0001292     2
OG0001281     2
OG0004107     2
OG0007070     2
OG0000803     2
OG0021515     1
OG0001285     1
OG0000091     1
OG0013841     1
OG0000780     1
OG0003195     1
OG0000003     1
OG0001126     1
OG0010897     1
OG0031756     1
OG0002723     1
OG0014845     1
OG0017253     1
OG0000195     1
OG0009323     1
OG0022573     1
OG0003018     1
OG0018607     1
OG0000935     1
OG0018328     1
OG0000279     1
Name: count, dtype: int64

In [16]:
# Show the current working directory. os.getcwd() is used instead of the bare `pwd`
# automagic so the cell does not depend on IPython's automagic setting and cannot be
# shadowed by a variable named `pwd`.
os.getcwd()

'/home/ftnws/project/Cross_spe_LLM/scripts'

In [26]:
# Rows that are both in cluster 58 and labelled OG0000055.
# NOTE(review): the recorded output has no 'species' column because this cell was
# executed (In [26]) before that column was added (In [27]); a fresh top-to-bottom
# run will include it.
in_cluster_58 = clustered_results['Cluster Label'] == 58
in_og55 = clustered_results['orthogroups'] == 'OG0000055'
clustered_results.loc[in_cluster_58 & in_og55]

Unnamed: 0,File Name,Cluster Label,orthogroups
145,Anol_caro_LOC100556725,58,OG0000055
169,Anol_caro_LOC100557654,58,OG0000055
2755,Anol_caro_LOC100551974,58,OG0000055
2886,Anol_caro_LOC100556859,58,OG0000055
3212,Anol_caro_LOC103277550,58,OG0000055
5864,Anol_caro_LOC103277521,58,OG0000055
6052,Anol_caro_LOC103280099,58,OG0000055
6088,Anol_caro_LOC103280564,58,OG0000055
8126,Anol_caro_LOC100553141,58,OG0000055
8195,Anol_caro_LOC100555552,58,OG0000055
