In [61]:
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

# Load the dataset containing sequence features
data = pd.read_csv("alpha_p.csv") 

# 假设 `data` 是已加载的pandas DataFrame，并且包含了上述的列

class_counts = data['antigen.epitope'].value_counts()

# Find classes that have only one instance
single_classes = class_counts[class_counts == 1].index


# Remove rows where 'antigen.epitope' belongs to classes with only one instance
data_filtered = data[~data['antigen.epitope'].isin(single_classes)]

# 首先，根据`cdr3_a_aa`、`v_a_gene`和`j_a_gene`来计算距离矩阵
# 计算距离矩阵的具体代码将根据所选算法和数据类型有所不同

# 定义特征列和目标列
feature_columns = ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']
target_column = 'antigen.epitope'

# 提取特征和目标
X = data_filtered[feature_columns]  # 或是距离矩阵，取决于您的模型需求
y = data_filtered[target_column]
len(y)
len(X)
# 分层划分测试集和训练集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 以下是计算距离矩阵和模型训练的代码，这需要进一步的信息
X_train.to_csv('human_alpha_train.csv')
X_test.to_csv('human_alpha_test.csv')

In [3]:
single_classes

Index(['LGYGFVNYI', 'FPQSAPHGVVF', 'GLLDEDFYA', 'HQNPVTGLLL', 'RVSTLRVSL',
       'SYMIMEIE', 'TLATHGLAAV', 'LLFNKVTLA', 'MPYGYVLNEF', 'KQIYKTPPI',
       ...
       'FSWGAEGQRPGF', 'MDFARVHFISALHGSG', 'ENPVVHFFKNIVTP', 'LQPLALEGSLQKRG',
       'QPLALEGSLQKRG', 'SMGVTYEM', 'YMGVSYEM', 'YMGVVYEM', 'KMGVTYEM',
       'RPPIFIRRL'],
      dtype='object', length=184)

In [6]:
import numpy as np
from tcrdist.repertoire import TCRrep
#创造‘TCRrep’类
#alpha chain
data_alpha = pd.read_csv("human_alpha_train.csv")
tr_alpha = TCRrep(cell_df=data_alpha, 
                  organism='human', 
                  chains=['alpha'], 
                  db_file='alphabeta_gammadelta_db.tsv')
tr_alpha.compute_distances()

distance_matrix = tr_alpha.pw_alpha
#np.save('alpha_distance_people_matrix.npy', alpha_distance_matrix)
tr_alpha.pw_alpha


  self._validate_cell_df()


array([[  0, 138,   0, ...,  90,  78, 159],
       [138,   0, 138, ..., 129, 141, 141],
       [  0, 138,   0, ...,  90,  78, 159],
       ...,
       [ 90, 129,  90, ...,   0,  42, 138],
       [ 78, 141,  78, ...,  42,   0, 144],
       [159, 141, 159, ..., 138, 144,   0]], dtype=int16)

In [8]:
# 划分数据集
# 注意：对于K-NN，我们不需要转换为特征向量，但我们需要索引来跟踪训练集和测试集
indices = range(len(y))
indices_train, indices_test, y_train, y_test = train_test_split(indices, y, test_size=0.2, stratify=y, random_state=42)

# 使用训练集的索引创建训练用的距离矩阵
distance_matrix_train = distance_matrix[np.ix_(indices_train, indices_train)]

# 使用训练集和测试集的索引创建预测用的距离矩阵
distance_matrix_test = distance_matrix[np.ix_(indices_test, indices_train)]

# 训练K-NN模型，假设k=5
knn = KNeighborsClassifier(n_neighbors=5, metric='precomputed')
knn.fit(distance_matrix_train, y_train)

# 预测
y_pred = knn.predict(distance_matrix_test)

IndexError: index 2223 is out of bounds for axis 0 with size 2054

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# 假设 data 是加载的 DataFrame，包含 'cdr3_a_aa', 'v_a_gene', 'j_a_gene' 和 'antigen.epitope' 列

# 首先，我们需要将分类数据转换为数值数据，这里使用 LabelEncoder
le = LabelEncoder()

# 对 'cdr3_a_aa', 'v_a_gene', 'j_a_gene' 进行编码
data['cdr3_a_aa_encoded'] = le.fit_transform(data['cdr3_a_aa'])
data['v_a_gene_encoded'] = le.fit_transform(data['v_a_gene'])
data['j_a_gene_encoded'] = le.fit_transform(data['j_a_gene'])

# 提取特征和目标变量
X = data[['cdr3_a_aa_encoded', 'v_a_gene_encoded', 'j_a_gene_encoded']]
y = le.fit_transform(data['antigen.epitope'])  # 对目标变量也进行编码

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用 K-近邻算法创建模型实例
knn = KNeighborsClassifier(n_neighbors=3)

# 训练模型
knn.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = knn.predict(X_test)

# 打印分类报告和准确率
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      0.50      0.50         2
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.50      0.67      0.57         3
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          12       1.00      1.00      1.00         1
          15       0.33      1.00      0.50         1
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         0
          20       0.17      0.50      0.25         2
          24       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# 假设 data 是加载的 DataFrame，包含 'cdr3_a_aa', 'v_a_gene', 'j_a_gene' 和 'antigen.epitope' 列

# 首先，我们需要将分类数据转换为数值数据，这里使用 LabelEncoder
le = LabelEncoder()

# 对 'cdr3_a_aa', 'v_a_gene', 'j_a_gene' 进行编码
data_filtered['cdr3_a_aa_encoded'] = le.fit_transform(data_filtered['cdr3_a_aa'])
data_filtered['v_a_gene_encoded'] = le.fit_transform(data_filtered['v_a_gene'])
data_filtered['j_a_gene_encoded'] = le.fit_transform(data_filtered['j_a_gene'])

# 提取特征和目标变量
X = data_filtered[['cdr3_a_aa_encoded', 'v_a_gene_encoded', 'j_a_gene_encoded']]
y = le.fit_transform(data_filtered['antigen.epitope'])  # 对目标变量也进行编码

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用 K-近邻算法创建模型实例
knn = KNeighborsClassifier(n_neighbors=5)

# 训练模型
knn.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = knn.predict(X_test)

# 打印分类报告和准确率
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         6
           8       1.00      1.00      1.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.20      0.50      0.29         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         3
          17       0.25      1.00      0.40         1
          18       0.00      0.00      0.00         3
          19       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 创建 OneHotEncoder，对序列和基因名称进行独热编码
column_transformer = ColumnTransformer([
    ('cdr3_a_aa_ohe', OneHotEncoder(), ['cdr3_a_aa']),
    ('v_a_gene_ohe', OneHotEncoder(), ['v_a_gene']),
    ('j_a_gene_ohe', OneHotEncoder(), ['j_a_gene'])
], remainder='drop')  # drop 表示除了指定的列以外的其他列将被丢弃

# 对特征进行独热编码转换
X_encoded = column_transformer.fit_transform(data_filtered)

# 标签也需要转换为数值型
y_encoded = LabelEncoder().fit_transform(data_filtered['antigen.epitope'])

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# 创建并训练模型，这里我们仍然使用 KNN，但你也可以尝试其他模型
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = knn_model.predict(X_test)

# 输出模型表现
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.33      1.00      0.50         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         6
           8       1.00      1.00      1.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.33      0.50      0.40         2
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         3
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          19       0.33      0.50      0.40         2
          20       0.00      0.00      0.00         4
          21       0.00      0.00      0.00         2
          22       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# 假设 'data' 是一个 DataFrame，包含你的三个特征列和一个目标列 'antigen.epitope'

# 创建 OneHotEncoder 实例，自动忽略无法编码的数值型数据
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['cdr3_a_aa', 'v_a_gene', 'j_a_gene'])
], remainder='passthrough')

# 创建逻辑回归模型
logistic_model = LogisticRegression(max_iter=1000)

# 创建包含预处理和模型的管道
model_pipeline = Pipeline([
    ('encoder', column_transformer),
    ('classifier', logistic_model)
])

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    data[['cdr3_a_aa', 'v_a_gene', 'j_a_gene']],  # 特征数据
    data['antigen.epitope'],  # 目标数据
    test_size=0.2, 
    random_state=42
)

# 训练模型
model_pipeline.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model_pipeline.predict(X_test)

# 输出模型表现
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


ValueError: Found unknown categories ['CAVSGQGDDKIIF', 'CALMSARLMF', 'CWSPFGNEKLTF', 'CAGASPGGYGGSQGNLIF', 'CALSAPYSGGGADGLTF', 'CALSGANAGNMLTF', 'CAEGEAGTALIF', 'CAMSGYTNAGKSTF', 'CALKISGSGYALNF', 'CAVQAGNNNDMRF', 'CIALNARLMF', 'CAVQAGGNNRLAF', 'CALVYSGGYQKVTF', 'CAALANQAGTALIF', 'CAMREVNDYKLSF', 'CAGLNQGAQKLVF', 'CVVVRMDSSYKLIF', 'CAASSPSGGYQKVTF', 'CAVTGGGSQGNLIF', 'CLVGGAYTGGFKTIF', 'CALDDRGSTLGRLYF', 'CALRMIGGGSNYKLTF', 'CAVLPHGNNRLAF', 'CAVRGPMNTGFQKLVF', 'CIVRAPPDSWGKLQF', 'CAASLSGGGADGLTF', 'CAASLNTGKLIF', 'CALSENFIQGAQKLVF', 'CAVQAAREYNFNKFYF', 'CAGRTFDKIIF', 'CATEGDSGYSTLTF', 'CAEENAGNMLTF', 'CAVRDPLYNFNKFYF', 'CAEDAASTLTF', 'CAYRSVQGAQKLVF', 'CAASVLYGQNFVF', 'CVVRAGKLIF', 'CAASINSGNTPLVF', 'CVVNNAGNMLTF', 'CLVGGDNQGGKLIF', 'CLVGAGNMLTF', 'CALSNDYKLSF', 'CASGGGADGLTF', 'CAGNTGTASKLTF', 'CIVHTNSGGSNYKLTF', 'CAVDNARLMF', 'CAVKGSQGNLIF', 'CAGQLSGGSNYKLTF', 'CAASAGGNNRLAF', 'CAMSVNAGGTSYGKLTF', 'CASDDARLMF', 'CAESGGNNNDMRF', 'CAFFPYGQNFVF', 'CAVWDTGKLIF', 'CALSDRDGGTSYGKLTF', 'CALSEHTTDSWGKFQF', 'CATDGDSGAGSYQLTF', 'CAASAANFANDKLTF', 'CAAVHDYKLSF', 'CAFTNYNQGGKLIF', 'CAYRSGYMEYGNKLVF', 'CATEARMDSSYKLIF', 'CAFGRGNNDMRF', 'CAESEGKLIF', 'CAVGSNSGYALNF', 'CALSGSSVGAAGNKLTF', 'CAYRSSNFNEKLTF', 'CAVSSAGGFKTIF', 'CAVAHSGGYQKVTF', 'CAFMKHEDSGAGSYQLTF', 'CAMREDSIGNTPLVF', 'CALDNAGHMLTF', 'CAFMNHTGTASKLTF', 'CAVGEDSSYKLIF', 'CAEDNYGQNFVF', 'CAGRHGGTSYGKLTF', 'CAGYNSGGSNYKLTF', 'CAGQPGAGGSQGNLIF', 'CALQGWVRGADGLTF', 'CAVRDVNTGFQKLVF', 'CAGAGNTGKLIF', 'CAYNAGNMLTF', 'CAVNPGNQFYF', 'CAVGTAWRSGGGADGLTF', 'CAVRDYGQNFVF', 'CALSEGYNFNKFYF', 'CAVDISNAGNMLTF', 'CAYRSAFKLTF', 'CAVNPIGGYNKLIF', 'CAGHLFKAAGNKLTF', 'CAYYGGNQFYF', 'CPTLGGSNYKLTF', 'CALILNQAGTALIF', 'CAFTAAGNKLTF', 'CAVDGSQGNLIF', 'CAVSGYGGSQGNLIF', 'CAYRSAGGGTSYGKLTF', 'CAASHIQGAQKLVF', 'CAVGNAITSSSDKLIF', 'CAGPTTSGTYKYIF', 'CAAREDSSYKLIF', 'CAASLNSGGYQKVTF', 'CLKAGGFKTIF', 'CARPAAERDDKIIF', 'CVVNSWAGNQFYF', 'CAVGDGNNRLAF', 'CATYLTGNQFYF', 'CASISNTGNQFYF', 'CGADWKTSYDKVIF', 'CATDEAGRRALTF', 'CAYPYNNNDMRF', 'CAVSEISGTYKYIF', 'CILRDVSGGGSNYKLTF', 'CALSEAGYGGATNKLIF', 'CASKAAGTKLTF', 'CAGSYGGSQGNLIF', 'CAFETGNQFYF', 'CAFISTQGGSEKLVF', 'CAGLGNFGNEKLTF', 'CAVLQRRSGGSNYKLTF', 'CASTAGPNFGNEKLTF', 'CAWRGGGGADGLTF', 'CAAAASGGSYIPTF', 'CAETPTNDYKLSF', 'CAASARGNQGGKLIF', 'CAASAGSYNSDKLIF', 'CAFSGGSNYKLTF', 'CAMREGRYSSASKIIF', 'CALIQGAQKLVF', 'CLVGEAAGNKLTF', 'CALSEATSGTYKYIF', 'CAVFMDSNYQLIW', 'CAASKAAGNKLTF', 'CAEMNSGYSTLTF', 'CAVDTGTASKLTF', 'CAVRHTNAGKSTF', 'CAFTELNSGGSNYKLTF', 'CAGKSLFGTNAGKSTF', 'CAVYPGGSQGNLIF', 'CALEAGNKLTF', 'CAMNTGNQFYF', 'CAVSESRNRDDKIIF', 'CATDALGNGNEKLTF', 'CAVATGAAGNKLTF', 'CAVQGSQGNLIF', 'CAVDANNDMRF', 'CAVSESGGSYIPTF', 'CLVPSEQAGTALIF', 'CAVEPMEYGNKLVF', 'CAVSEGGATNKLIF', 'CAVTDDKIIF', 'CAVGGLSGANSKLTF', 'CAVRSDQAGTALIF', 'CAVPWGGNTGKLIF', 'CILRSSSGGGSNYKLTF', 'CGADFLMNRDDKIIF', 'CAAFDDKIIF', 'CARDAGNMLTF', 'CALFTGGGNKLTF', 'CASSGGNTPLVF', 'CAYVQDDKIIF', 'CAYIIIQGAQKLVF', 'CAVNALLGNQFYF', 'CVVNGNNNDMRF', 'CLVGDIGAAGNKLTF', 'CAAPNSGGSNYKLTF', 'CAESGGSNYKLTF', 'CAVQFMDSNYQLIW', 'CAVIKGYSTLTF', 'CAATSGTYKYIF', 'CAIQTGANNLFF', 'CAAQRANRDDKIIF', 'CALNKTHNNLTF', 'CAYRSHYTSGTYKYIF', 'CAVANQAGTALIF', 'CAVDVNDYKLSF', 'CALRSGYALNF', 'CAGQASQGNLIF', 'CALPREYGNKLVF', 'CAGQLQKAACNKLIF', 'CAVHTGARLMF', 'CAVDDLYSNYQLIW', 'CAVRAYGQNFVF', 'CAVEGAGSYQLTF', 'CAFCGGTSYGKLTF', 'CALSESGANSKLTF', 'CAETYTGNQFYF', 'CAVILRSNDYKLSF', 'CALSGGYQKVTF', 'CAENSNTGNQFYF', 'CVVRGMDSSYKLIF', 'CAAEAGNHRGSTLGRLYF', 'CAVYTGGFKTIF', 'CSKTSYDKVIF', 'CVAASYNTDKLIF', 'CALRGYGQNFVF', 'CILRDDNDMRF', 'CALDTARLMF', 'CAVERGGGNKLTF', 'CAFKGAGNKLTF', 'CAGGYGGSQGNLIF', 'CLVGDGGSFSGGYNKLIF', 'CALSEVQLMDSNYQLIW', 'CLLMEYGNKLVF', 'CAMRDPHLWSGATNKLIF', 'CAYREGAQKLVF', 'CVGGGGTSGGGADGLTF', 'CAESKRDGGATNKLIF', 'CAVRDNSITGGFKTIF', 'CAPRNAGGTSYGKLTF', 'CAVDSSASKIIF', 'CAVSDLEPNSSASKIIF', 'CAFEDSGGSNYKLTF', 'CAHNTGNQFYF', 'CVVIEGNKLVF', 'CAENGGGSTLGRLYF', 'CAERIQTGANNLFF', 'CAVKSWSGPGWGNQAGTALIF', 'CAGLKAAGNKLTF', 'CPFQTGANNLFF', 'CLVAGAGGYNKLIF', 'CAVQTLGNAGNMLTF', 'CAVRDINARLMF', 'CAGMDSNYQLIW', 'CAARGGADGLTF', 'CALSLYSGAGSYQLTF', 'CAVIPDFGNEKLTF', 'CALSENKLSF', 'CAITGGFKTIF', 'CVVNTGGSYIPTF', 'CAVRPGYSSASKIIF', 'CIVKTNSGGSNYKLTF', 'CSPQGGSEKLVF', 'CAEDQNARLMF', 'CAEKGGTALIF', 'CATDVAGRRALTF', 'CAVGGNDWNTDKLIF', 'CAPSAGTYNTDKLIF', 'CAYGANNLFF', 'CLVGDNDYKLSF', 'CAVTHRFHTASKLTF', 'CAVPSGSARQLTF', 'CAYRDDKIIF', 'CAGPYTGANSKLTF', 'CATDLKTSYDKVIF', 'CALSEFRGNTPLVF', 'CASGLPDTPLVF', 'CAPPEGGATNKLIF', 'CAVTDSWGKLQF', 'CAVEDTNSGYALNF', 'CADLNARLMF', 'CAVTAGGGNKLTF', 'CVGNSYGQNFVF', 'CACLTGTASKLTF', 'CAVRDPGNTDKLIF', 'CAVISGGGADGLTF'] in column 0 during transform

In [20]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.utils import to_categorical

# 假设 data 是你的 DataFrame，包含三个类别型特征和一个目标类别 'antigen.epitope'
data=data_filtered
# 将类别型特征编码为整数
label_encoders = {}
for column in ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# 对目标变量进行编码
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(data['antigen.epitope'])
y = to_categorical(y)  # 使用 one-hot 编码目标变量

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
    data[['cdr3_a_aa', 'v_a_gene', 'j_a_gene']], y, test_size=0.2, random_state=42
)

# 构建模型
input_layers = []
embedding_layers = []
for column in ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']:
    num_unique_values = int(data[column].nunique())
    embedding_dim = min(np.ceil(num_unique_values / 2), 50)
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(num_unique_values, int(embedding_dim), input_length=1)(input_layer)
    embedding_layer = Flatten()(embedding_layer)
    input_layers.append(input_layer)
    embedding_layers.append(embedding_layer)

# 合并嵌入层的输出
concat_layer = Concatenate()(embedding_layers)
dense_layer = Dense(64, activation='relu')(concat_layer)
output_layer = Dense(y.shape[1], activation='softmax')(dense_layer)

model = Model(inputs=input_layers, outputs=output_layer)

# 编译模型
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 训练模型
model.fit([X_train[column] for column in ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']], y_train, epochs=10, batch_size=32)

# 评估模型
model.evaluate([X_test[column] for column in ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']], y_test)


Epoch 1/10




[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1436 - loss: 4.9892 
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2883 - loss: 4.0931
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3624 - loss: 3.0302
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4373 - loss: 2.6667
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4979 - loss: 2.2825
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5460 - loss: 1.9912
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5803 - loss: 1.7862
Epoch 8/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6424 - loss: 1.5239
Epoch 9/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

[2.2510392665863037, 0.548638105392456]

In [21]:
# 训练模型
history = model.fit([X_train[column] for column in ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']], y_train, epochs=10, batch_size=32, validation_split=0.1)

# 评估模型，并获取测试集上的损失和准确度
test_loss, test_accuracy = model.evaluate([X_test[column] for column in ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']], y_test)

# 输出测试集上的准确度
print("Test Accuracy:", test_accuracy)


Epoch 1/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8297 - loss: 0.9052 - val_accuracy: 0.8835 - val_loss: 0.7283
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8387 - loss: 0.8838 - val_accuracy: 0.8981 - val_loss: 0.6575
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8959 - loss: 0.5878 - val_accuracy: 0.8981 - val_loss: 0.6021
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9077 - loss: 0.5442 - val_accuracy: 0.9126 - val_loss: 0.5570
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9256 - loss: 0.4347 - val_accuracy: 0.9175 - val_loss: 0.5309
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9268 - loss: 0.3667 - val_accuracy: 0.9223 - val_loss: 0.5054
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# 加载数据
# data = pd.read_csv('your_data.csv')  # 如果数据来自CSV文件

# 标准化特征数据，因为距离计算对于数据尺度敏感
scaler = StandardScaler()
features = data[['cdr3_a_aa', 'v_a_gene', 'j_a_gene']]
features_scaled = scaler.fit_transform(features)

# 计算特征的距离矩阵
distance_matrix = pairwise_distances(features_scaled, metric='euclidean')
# 目标变量
target = data['antigen.epitope']

# 划分数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(distance_matrix, target, test_size=0.2, random_state=42)

# 使用预计算核的 SVM
svm_model = SVC(kernel='precomputed')

# 训练模型
svm_model.fit(X_train, y_train)  # 注意：这里用的是训练集的距离矩阵

# 进行预测，这里我们需要提供测试集和训练集之间的距离矩阵
# 测试集和训练集之间的距离矩阵需要特别构造
X_test_train = distance_matrix[y_test.index, :][:, y_train.index]
y_pred = svm_model.predict(X_test_train)

# 输出模型的分类报告和准确度
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

ValueError: Precomputed matrix must be a square matrix. Input is a 2054x2568 matrix.

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split

# 数据标准化
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 计算整个数据集的距离矩阵
distance_matrix = pairwise_distances(features_scaled, metric='euclidean')

# 划分数据集
X_indices_train, X_indices_test, y_train, y_test = train_test_split(
    np.arange(distance_matrix.shape[0]), target, test_size=0.2, random_state=42
)

# 提取训练集的距离矩阵（方阵）
X_train = distance_matrix[np.ix_(X_indices_train, X_indices_train)]

# 提取测试数据和训练数据之间的距离矩阵
X_test = distance_matrix[np.ix_(X_indices_test, X_indices_train)]

from sklearn.svm import SVC

# 使用预计算核的 SVM
svm_model = SVC(kernel='precomputed')

# 使用训练集的距离矩阵训练模型
svm_model.fit(X_train, y_train)

# 使用测试集和训练集之间的距离矩阵进行预测
y_pred = svm_model.predict(X_test)

from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


                          precision    recall  f1-score   support

         AALALLLLDRLNQLE       0.00      0.00      0.00         1
         AAVVRFQEAANKQKQ       0.00      0.00      0.00         1
              ALDPHSGHFV       0.00      0.00      0.00         1
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSPVIPHI       0.00      0.00      0.00         6
               ALWGFFPVL       0.00      0.00      0.00         1
               ALYGFVPVL       0.00      0.00      0.00         1
            APFSEQEQPVLG       0.00      0.00      0.00         1
           APRGPHGGAASGL       0.00      0.00      0.00         2
               AVGSYVYSV       0.00      0.00      0.00         1
               CINGVCWTV       0.00      0.00      0.00         3
            CPSQEPMSIYVY       0.00      0.00      0.00         1
            DATYQRTRALVR       0.00      0.00      0.00         3
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
# 假设 data 是已经加载的 pandas DataFrame，包含 'antigen.epitope' 列

# 计算 'antigen.epitope' 列中每个特征的数量
epitope_counts = data['antigen.epitope'].value_counts()

epitope_counts


NLVPMVATV               467
GILGFVFTL               309
NEGVKAAW                209
FRDYVDRFYKTLRAEQASQE    139
TFEYVSQPFLMDLE          120
                       ... 
TPQDLNTML                 2
GYNSYSVSNSEKHIM           2
GAVGVGKSAL                2
RLPAKAPL                  2
GQVELGGGNAVEVCKGS         2
Name: antigen.epitope, Length: 155, dtype: int64

In [38]:
epitope_counts.to_csv('1.csv')

In [52]:
from sklearn.utils import resample

# Splitting the data into features and target variable
X = data.drop('antigen.epitope', axis=1)
y = data['antigen.epitope']

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Concatenate our training data back together
training_set = pd.concat([X_train, y_train], axis=1)

# Separate minority and majority classes
minority = training_set[training_set['antigen.epitope'] == training_set['antigen.epitope'].value_counts().idxmin()]
majority = training_set[training_set['antigen.epitope'] != training_set['antigen.epitope'].value_counts().idxmin()]

# Upsample minority class
minority_upsampled = resample(minority,
                              replace=True, # Sample with replacement
                              n_samples=len(majority), # Match number in majority class
                              random_state=42) # Reproducible results

# Combine majority and upsampled minority
upsampled = pd.concat([majority, minority_upsampled])

# Checking counts
upsampled['antigen.epitope'].value_counts(), upsampled.shape

upsampled

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,complex.id,gene,cdr3_a_aa,v_a_gene,j_a_gene,species,antigen.gene,antigen.species,vdjdb.score,count,cdr3_a_aa_encoded,v_a_gene_encoded,j_a_gene_encoded,antigen.epitope
2453,2453,80819,27286,TRA,881,4,16,HomoSapiens,KRAS,HomoSapiens,2,1,881,4,16,MTEYKLVVVGARGVGKSALTIQLI
424,424,5795,469,TRA,1247,31,34,HomoSapiens,NS3,HCV,3,1,1247,31,34,KLVALGINAV
1224,1224,23649,2060,TRA,1290,31,27,HomoSapiens,NS3,HCV,1,1,1290,31,27,RAQAPPPSW
2321,2321,80206,0,TRA,482,37,27,HomoSapiens,Gag,HIV-1,1,1,482,37,27,FRDYVDRFYKTLRAEQASQE
2044,2044,74896,0,TRA,673,8,11,HomoSapiens,Gag,HIV-1,1,1,673,8,11,KAFSPEVIPMF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,235,1785,363,TRA,1414,33,32,HomoSapiens,DQ2-GLIA-OMEGA1,Homo sapiens,3,1,1414,33,32,LQPFPQPELPYGSGGS
235,235,1785,363,TRA,1414,33,32,HomoSapiens,DQ2-GLIA-OMEGA1,Homo sapiens,3,1,1414,33,32,LQPFPQPELPYGSGGS
235,235,1785,363,TRA,1414,33,32,HomoSapiens,DQ2-GLIA-OMEGA1,Homo sapiens,3,1,1414,33,32,LQPFPQPELPYGSGGS
235,235,1785,363,TRA,1414,33,32,HomoSapiens,DQ2-GLIA-OMEGA1,Homo sapiens,3,1,1414,33,32,LQPFPQPELPYGSGGS


In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Ensure all necessary variables are defined and processed correctly
labels = data['antigen.epitope']
features = data[['cdr3_a_aa', 'v_a_gene', 'j_a_gene']]
features_encoded = pd.get_dummies(features)

# Splitting the dataset into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.15, random_state=42)

# Calculate class weights based on the training labels
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights = dict(zip(np.unique(y_train), class_weights))

# Training the Random Forest model with class weights
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train, y_train)

# Predicting the test results
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# Generating and displaying the classification report
classification_report_weighted = classification_report(y_test, y_pred_weighted)
classification_report_weighted

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 计算准确率
accuracy = accuracy_score(y_test, y_pred_weighted)
print(f'Accuracy: {accuracy:.2f}')

# 计算精确度
precision = precision_score(y_test, y_pred_weighted, average='macro', zero_division=0)
print(f'Precision: {precision:.2f}')

# 计算召回率
recall = recall_score(y_test, y_pred_weighted, average='macro', zero_division=0)
print(f'Recall: {recall:.2f}')

# 计算F1分数
f1 = f1_score(y_test, y_pred_weighted, average='macro', zero_division=0)
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.52
Precision: 0.20
Recall: 0.21
F1 Score: 0.19


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter

labels = data['antigen.epitope']
sequences = data['cdr3_a_aa']

# 3. 特征提取函数：计算序列中每个字符的频率
def feature_extraction(sequences):
    features = []
    for sequence in sequences:
        # 计算序列中每个字符的频率
        freqs = Counter(sequence)
        # 标准化频率（可选）
        total = sum(freqs.values())
        features.append({char: count / total for char, count in freqs.items()})
    return pd.DataFrame(features).fillna(0)

# 应用特征提取
features_encoded = feature_extraction(sequences)

# 4. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# 5. 训练逻辑回归模型
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 6. 进行预测
y_pred = model.predict(X_test)

# 7. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.24863883847549909
                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         1
               AAGIGILTV       0.00      0.00      0.00         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
         AAVVRFQEAANKQKQ       0.00      0.00      0.00         1
              ALDPHSGHFV       0.00      0.00      0.00         3
               ALHGGWTTK       0.00      0.00      0.00         2
               ALSPVIPHI       0.00      0.00      0.00         1
              ALWGPDPAAA       0.00      0.00      0.00         1
               ALYGFVPVL       0.00      0.00      0.00         1
             APARLERRHSA       0.00      0.00      0.00         1
            APFSEQEQPVLG       0.00      0.00      0.00         1
           APRGPHGGAASGL       0.00      0.00      0.00         2
               CINGVCWTV       0.00      0.00      0.00         4
            CPSQEPMSIYVY       0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 创建 OneHotEncoder，对序列和基因名称进行独热编码
column_transformer = ColumnTransformer([
    ('cdr3_a_aa_ohe', OneHotEncoder(), ['cdr3_a_aa']),
    ('v_a_gene_ohe', OneHotEncoder(), ['v_a_gene']),
    ('j_a_gene_ohe', OneHotEncoder(), ['j_a_gene']),
    ('vdjdb.score_ohe', OneHotEncoder(), ['vdjdb.score'])
], remainder='drop')  # drop 表示除了指定的列以外的其他列将被丢弃

# 对特征进行独热编码转换
X_encoded = column_transformer.fit_transform(data_filtered)

# 标签也需要转换为数值型
y_encoded = LabelEncoder().fit_transform(data_filtered['antigen.epitope'])

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# 创建并训练模型，这里我们仍然使用 KNN，但你也可以尝试其他模型
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = knn_model.predict(X_test)

# 输出模型表现
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.50      1.00      0.67         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         6
           8       1.00      1.00      1.00         1
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.50      0.50      0.50         2
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         3
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          19       0.50      0.50      0.50         2
          20       0.00      0.00      0.00         4
          21       0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
# 2. 特征提取函数：计算 `cdr3_a_aa` 中每个字符的频率
def sequence_features(sequences):
    features = []
    for sequence in sequences:
        freqs = Counter(sequence)
        total = sum(freqs.values())
        feature = {char: count / total for char, count in freqs.items()}
        features.append(feature)
    return pd.DataFrame(features).fillna(0)

# 应用特征提取
X_train_seq = sequence_features(data['cdr3_a_aa'])
X_test_seq = sequence_features(data['cdr3_a_aa'])

# 3. 独热编码 `v_a_gene` 和 `j_a_gene`
column_trans = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), ['v_a_gene', 'j_a_gene'])],
    remainder='passthrough'
)

X_train_encoded = column_trans.fit_transform(data[['v_a_gene', 'j_a_gene']])
X_test_encoded = column_trans.transform(data[['v_a_gene', 'j_a_gene']])

# 4. 合并所有特征
X_train = pd.concat([X_train_seq.reset_index(drop=True), pd.DataFrame(X_train_encoded.toarray())], axis=1)
X_test = pd.concat([X_test_seq.reset_index(drop=True), pd.DataFrame(X_test_encoded.toarray())], axis=1)
y_train = data['antigen.epitope']
y_test = data['antigen.epitope']

# 5. 训练逻辑回归模型
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 6. 进行预测
y_pred = model.predict(X_test)

# 7. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from collections import Counter


# 2. 特征提取函数：计算 `cdr3_a_aa` 中每个字符的频率
def sequence_features(sequences):
    features = []
    for sequence in sequences:
        freqs = Counter(sequence)
        total = sum(freqs.values())
        feature = {char: count / total for char, count in freqs.items()}
        features.append(feature)
    return pd.DataFrame(features).fillna(0)

# 应用特征提取
X_seq = sequence_features(data['cdr3_a_aa'])

# 3. 独热编码 `v_a_gene` 和 `j_a_gene`
column_trans = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), ['v_a_gene', 'j_a_gene'])],
    remainder='passthrough'
)

X_encoded = column_trans.fit_transform(data[['v_a_gene', 'j_a_gene']])

# 4. 合并所有特征
X = pd.concat([X_seq.reset_index(drop=True), pd.DataFrame(X_encoded.toarray())], axis=1)
X.columns = X.columns.astype(str)  # 将所有列名转换为字符串类型
y = data['antigen.epitope']

# 5. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. 训练逻辑回归模型
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 7. 进行预测
y_pred = model.predict(X_test)

# 8. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.41379310344827586
                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         1
               AAGIGILTV       0.00      0.00      0.00         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
         AAVVRFQEAANKQKQ       0.00      0.00      0.00         1
              ALDPHSGHFV       0.00      0.00      0.00         3
               ALHGGWTTK       0.00      0.00      0.00         2
               ALSPVIPHI       0.00      0.00      0.00         1
              ALWGPDPAAA       0.00      0.00      0.00         1
               ALYGFVPVL       0.00      0.00      0.00         1
             APARLERRHSA       0.00      0.00      0.00         1
            APFSEQEQPVLG       0.00      0.00      0.00         1
           APRGPHGGAASGL       0.00      0.00      0.00         2
               CINGVCWTV       0.00      0.00      0.00         4
            CPSQEPMSIYVY       1.00      1.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter


# 2. 特征提取函数：计算 `cdr3_a_aa` 中每个字符的频率
def sequence_features(sequences):
    features = []
    for sequence in sequences:
        freqs = Counter(sequence)
        total = sum(freqs.values())
        feature = {char: count / total for char, count in freqs.items()}
        features.append(feature)
    return pd.DataFrame(features).fillna(0)

# 应用特征提取
X_seq = sequence_features(data['cdr3_a_aa'])

# 3. 使用 LabelEncoder 转换 `v_a_gene` 和 `j_a_gene`
label_encoder_v = LabelEncoder()
label_encoder_j = LabelEncoder()

v_encoded = label_encoder_v.fit_transform(data['v_a_gene'])
j_encoded = label_encoder_j.fit_transform(data['j_a_gene'])

# 4. 合并所有特征
X = pd.concat([
    X_seq.reset_index(drop=True), 
    pd.DataFrame(v_encoded, columns=['v_a_gene']),
    pd.DataFrame(j_encoded, columns=['j_a_gene'])
], axis=1)

y = data['antigen.epitope']

# 5. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. 训练逻辑回归模型
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 7. 进行预测
y_pred = model.predict(X_test)

# 8. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.25226860254083483
                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         1
               AAGIGILTV       0.00      0.00      0.00         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
         AAVVRFQEAANKQKQ       0.00      0.00      0.00         1
              ALDPHSGHFV       0.00      0.00      0.00         3
               ALHGGWTTK       0.00      0.00      0.00         2
               ALSPVIPHI       0.00      0.00      0.00         1
              ALWGPDPAAA       0.00      0.00      0.00         1
               ALYGFVPVL       0.00      0.00      0.00         1
             APARLERRHSA       0.00      0.00      0.00         1
            APFSEQEQPVLG       0.00      0.00      0.00         1
           APRGPHGGAASGL       0.00      0.00      0.00         2
               CINGVCWTV       0.00      0.00      0.00         4
            CPSQEPMSIYVY       0.00      0.00

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  args=(X, target, sample_weight, l2_reg_strength, n_threads),
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
