In [None]:
import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import pickle
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from process import *
from sample import run

In [None]:
# Parse the reference PDB. `sec` is used further down as the sampling input and as the
# comparison target; `seq` is only printed here. The middle return value is discarded.
reference_pdb = './data/test/4E3Q-S.pdb'
sec, _, seq = parse_pdb_for_secondary_structure(reference_pdb)
print(f'{seq}\n{sec}')

In [None]:
# Derive a mask sequence from the FASTA file and report how many of its positions are 'X'.
mask_source_fasta = './data/test/4E3Q-NR90-M2.fas'
finder = HighFrequencySequenceFinder(mask_source_fasta, threshold=65)
seq_mask = finder.get_mask_seq()
print(seq_mask)
masked_positions = seq_mask.count("X")
print(f'number of X : {masked_positions}/{len(seq_mask)}')

In [None]:
# Draw 10 samples using `sec` as input. No original data is supplied here
# (original_data=None); an earlier variant of this call passed `seq_mask` instead.
result = run(
    sec,
    cache_time='03-15-10',
    original_data=None,
    num_sample=10,
    classifier=False,
    scale=10,
)
result

In [None]:
# Dump every sampled sequence to a FASTA file; the record id is the sequence's
# position in `result`.
fasta_out = './4E3Q-65.FASTA'
with open(fasta_out, mode='w+') as f:
    for idx, sequence in enumerate(result):
        f.write(f'>{idx}\n{sequence}\n')

In [None]:
# Hand the sampled sequences to the folding helper; presumably it writes PDB files
# under the given directory (per the helper's name) — runs on the second CUDA device.
fold_and_savePDB(
    result,
    '/workspace/sample',
    device='cuda:1',
)

In [None]:
# Directory containing the folded structures to evaluate.
# NOTE(review): this run directory is hard-coded — confirm it is the run that produced
# `result`, because the PDB file names are used below as indices into `result`.
path = '/workspace/sample/03-07-10-03/'

# Keep only PDB files. os.listdir returns entries in arbitrary order, so sort them to
# make the ranking (and the tie-breaking in max() below) reproducible.
pdb_lst = sorted(os.path.join(path, pdb) for pdb in os.listdir(path) if pdb.endswith('.pdb'))
if not pdb_lst:
    # Fail with a clear message instead of the opaque ValueError max() raises on an empty dict.
    raise FileNotFoundError(f'No .pdb files found in {path}')

score_dict = {}   # PDB path -> similarity score against `sec`
predictions = {}  # PDB path -> first value returned by the parser, kept to avoid re-parsing
for pdb in pdb_lst:
    predict, _, sequence = parse_pdb_for_secondary_structure(pdb)
    score = calculate_sequence_similarity_percentage(predict, sec)
    predictions[pdb] = predict
    score_dict[pdb] = score

# Pick the PDB file with the HIGHEST score (first one in sorted order on ties).
max_score_pdb = max(score_dict, key=score_dict.get)
best_pdb_name = os.path.basename(max_score_pdb)  # file name only, without the directory
print(f"Best PDB File   : {best_pdb_name}")
print(f"Corresponding Score: {score_dict[max_score_pdb]}")

# Leave `predict` bound to the best structure's prediction for the plotting cell.
predict = predictions[max_score_pdb]
# The part of the file name before the first dot is used as the index into `result`.
idx = best_pdb_name.split('.')[0]
print(f'Prediction: {predict}\nCondition : {sec}\nSequence  : {result[int(idx)]}')

In [None]:
# Plot the best prediction against the percentages list computed from `sec`,
# with the error display enabled.
target_percentages = calculate_ss_percentages_list(sec)
plot_structure_comparison(predict, target_percentages, plot_error=True)

In [None]:
# Collect every score.
values = list(score_dict.values())

# Mean, maximum and minimum of the scores.
average_value, max_value, min_value = np.mean(values), np.max(values), np.min(values)

# Report the summary statistics.
print(f"平均值: {average_value}")
print(f"最大值: {max_value}")
print(f"最小值: {min_value}")


# Keep the scores that reach the threshold.
# NOTE(review): the comparison is inclusive (>=) while the printed label reads
# "greater than" — confirm which one is intended.
threshold=50
values_above_threshold = list(filter(lambda value: value >= threshold, score_dict.values()))

# How many scores reached the threshold.
count_above_threshold = len(values_above_threshold)

# Report the count.
print(f"大于阈值 {threshold} 的值的数量: {count_above_threshold}")



In [None]:
# Rank the PDB paths by score, highest first (equal scores keep their original order),
# then rebuild the mapping in that order.
ranked_paths = sorted(score_dict, key=score_dict.get, reverse=True)
sorted_dict_desc = {pdb_path: score_dict[pdb_path] for pdb_path in ranked_paths}
sorted_dict_desc

In [None]:
# Tabulate file name vs. score in ranked order and write the table as CSV inside `path`.
keys = [pdb_path.split('/')[-1] for pdb_path in sorted_dict_desc]  # file names only
values = list(sorted_dict_desc.values())

df = pd.DataFrame({'pdb': keys, 'score': values})
df_path = os.path.join(path, '4e3q-65.csv')
df.to_csv(df_path)

In [None]:
from Bio import SeqIO

# Read every record of the FASTA file and keep its sequence as a plain string.
# NOTE(review): this file lives in a different run directory than the PDB files scored
# above — confirm both belong to the same run, since PDB file names index into this list.
seq_list = []
for record in SeqIO.parse('/workspace/sample/02-22-07-43/sequence.fasta', 'fasta'):
    seq_list.append(str(record.seq))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scores of all evaluated structures.
scores = list(score_dict.values())

# Bin edges every 5 points over 0-100 inclusive (20 bins). The same edges drive both the
# histogram and the x ticks, so bars and tick marks always line up. Previously a separate
# `bins = range(0, 100, 5)` was defined but never passed to hist (and stopped at 95).
bins = np.arange(0, 100 + 1, 5)

# Density-normalised histogram of the scores.
fig, ax = plt.subplots()
ax.hist(scores, bins=bins, color='blue', alpha=0.7, density=True)
ax.set_xticks(bins)
ax.set_xlim(0, 100)
ax.set_title('Score Distribution')
ax.set_xlabel('Score')
ax.set_ylabel('Density')
plt.show()


In [None]:
import shutil
import time  # used for the run tag below; not imported explicitly anywhere else in the notebook

# Number of best-scoring structures to export.
top_n = 50

seq_lst = []
result_path = './result'
# Tag the export directory with month-day-hour-minute-second. The previous format
# ("%m-%d-%H-%S") skipped the minutes, so two runs within the same hour could map to the
# same directory name and make os.makedirs fail.
tag_time = time.strftime("%m-%d-%H-%M-%S", time.localtime())
# NOTE: this rebinds `path`, which earlier cells use for the scored PDB directory;
# re-running those cells afterwards will pick up the export directory instead.
path = os.path.join(result_path, tag_time)
os.makedirs(path)
fasta_file = os.path.join(path, 'seq.fasta')
with open(fasta_file, mode='w') as f:
    # sorted_dict_desc is ordered best-first, so the first `top_n` keys are the top hits.
    for key in list(sorted_dict_desc)[:top_n]:
        # The part of the file name before the first dot is the index into seq_list.
        idx = int(os.path.basename(key).split('.')[0])
        f.write(f'>{idx}\n')
        f.write(f'{seq_list[idx]}\n')
        # Copy the structure file next to the FASTA so sequences and PDBs stay together.
        shutil.copy(key, path)