In [1]:
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

# Read the raw table of alpha-chain sequence features.
data = pd.read_csv("mouse_data_alpha.csv")

# Column roles: three alpha-chain descriptors as inputs, the epitope as target.
# (The distance matrix is later derived from `cdr3_a_aa`, `v_a_gene` and
# `j_a_gene`; how it is computed depends on the chosen algorithm.)
feature_columns = ['cdr3_a_aa', 'v_a_gene', 'j_a_gene']
target_column = 'antigen.epitope'

# Number of rows per epitope label.
class_counts = data['antigen.epitope'].value_counts()

# Labels that are represented by exactly one row.
single_classes = class_counts.loc[class_counts == 1].index

# Keep only rows whose epitope label occurs more than once, and persist the
# result so that later cells can reload it from disk.
is_single_class = data['antigen.epitope'].isin(single_classes)
data_filtered = data[~is_single_class]
data_filtered.to_csv("mouse_data_alpha_filtered.csv")

# Feature frame and target series taken from the filtered table
# (a distance matrix may be used in place of X, depending on the model).
X = data_filtered[feature_columns]
y = data_filtered[target_column]

In [2]:
# Show the epitope labels that occur exactly once in the raw table; rows with
# these labels are the ones excluded from `data_filtered`.
single_classes

Index(['KAPYNFATM', 'INFDFNTI', 'KAPFNFATM', 'KAPANFATM', 'KAVYNFATM',
       'VSYYGPKTSPVQ', 'ADSLSFFSSSIKRGGGSLVP', 'LVERLYLVCGGEG', 'KAPYDYAPI',
       'RGPGRAFVTI', 'VVVGAVGVGK', 'VVGAVGVGK', 'KAVANFATM', 'SPAPRPLDL',
       'VPYMAEFGM', 'ANGVAFFLTPFKA', 'KVITFIDL', 'RGYVYQGL', 'SRGGASQYRPSQ',
       'SQYYYNSL', 'GAMKRHGLDNYRGYSLG', 'WIYVYRPMGCGGS',
       'PADPLAFFSSAIKGGGGSLV', 'ADGLAYFRSSFKGG', 'FEAQKAKANKAVDG', 'MPAGRPWDL',
       'FEAQKAKANKAV', 'ADLIAYLEQATKG', 'QLSDVPMDL', 'SPLDSLWWI', 'FLSPFWFDI',
       'SPAEAGFFL', 'QPAEGGFQL', 'SIYRYYGL', 'VVVGADGVGK'],
      dtype='object', name='antigen.epitope')

In [3]:
import numpy as np
from tcrdist.repertoire import TCRrep
# Build a `TCRrep` object for the alpha chain from the filtered table that was
# written to disk earlier.
data_alpha = pd.read_csv("mouse_data_alpha_filtered.csv")
tr_alpha = TCRrep(
    cell_df=data_alpha,
    organism='mouse',
    chains=['alpha'],
    db_file='alphabeta_gammadelta_db.tsv',
)
# NOTE(review): TCRrep may already compute distances on construction —
# confirm whether this explicit call is still required.
tr_alpha.compute_distances()

# Pairwise alpha-chain distances exposed by the TCRrep object.
alpha_distance_matrix = tr_alpha.pw_alpha

# Min-max scale the distances: subtract the smallest entry and divide by the
# spread between the largest and smallest entries.
min_value = np.min(alpha_distance_matrix)
max_value = np.max(alpha_distance_matrix)
value_range = max_value - min_value
normalized_alpha_distance_matrix = (alpha_distance_matrix - min_value) / value_range

# Persist the scaled matrix, then display it as the cell output.
np.save('alpha_distance_mouse_matrix.npy', normalized_alpha_distance_matrix)
normalized_alpha_distance_matrix


  self._validate_cell_df()


array([[0.        , 0.        , 0.67272727, ..., 0.59545455, 0.57272727,
        0.58181818],
       [0.        , 0.        , 0.67272727, ..., 0.59545455, 0.57272727,
        0.58181818],
       [0.67272727, 0.67272727, 0.        , ..., 0.61363636, 0.61363636,
        0.6       ],
       ...,
       [0.59545455, 0.59545455, 0.61363636, ..., 0.        , 0.53181818,
        0.27272727],
       [0.57272727, 0.57272727, 0.61363636, ..., 0.53181818, 0.        ,
        0.57272727],
       [0.58181818, 0.58181818, 0.6       , ..., 0.27272727, 0.57272727,
        0.        ]])

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
# Scaled pairwise distances, used below as a precomputed metric.
distance_matrix = normalized_alpha_distance_matrix

# Labels must line up row-for-row with the distance matrix. tcrdist computes
# the pairwise matrix on `tr_alpha.clone_df` (its deduplicated clone table),
# whose row order/count can differ from `data_filtered`, so prefer the labels
# stored there. Fall back to `data_filtered` only if the clone table does not
# carry the label column.
clone_table = getattr(tr_alpha, 'clone_df', None)
if clone_table is not None and 'antigen.epitope' in clone_table.columns:
    labels = clone_table['antigen.epitope'].values
else:
    labels = data_filtered['antigen.epitope'].values

# Fail loudly instead of silently pairing distances with the wrong labels.
if len(labels) != distance_matrix.shape[0]:
    raise ValueError(
        f"Label count ({len(labels)}) does not match distance matrix size "
        f"({distance_matrix.shape[0]}); labels and distances are misaligned."
    )

# Split row positions (not the data itself) so the same positions can index
# both axes of the square distance matrix.
indices = np.arange(distance_matrix.shape[0])
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

# Train block: train-to-train distances.
# Test block: distances from every test sample to every training sample.
X_train = distance_matrix[train_indices][:, train_indices]
X_test = distance_matrix[test_indices][:, train_indices]
y_train = labels[train_indices]
y_test = labels[test_indices]

# 1-nearest-neighbour classifier operating directly on the supplied distances.
knn = KNeighborsClassifier(n_neighbors=1, metric='precomputed')
knn.fit(X_train, y_train)

In [5]:
# Predict a label for every test sample from its distances to the training set.
y_pred = knn.predict(X_test)

# Overall accuracy, printed as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 summary.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Cell output: shape of the test-to-train distance block.
X_test.shape

Accuracy: 68.54%
               precision    recall  f1-score   support

ADLIAYLKQATKG       0.00      0.00      0.00         1
    ASNENMETM       0.70      0.81      0.75        26
    HGIRNASFI       0.68      0.60      0.64        25
    LSLRNPILV       0.50      0.36      0.42        11
    QLSPFPFDL       1.00      1.00      1.00         1
RVSYYGPKTSPVQ       0.00      0.00      0.00         0
    SQLLNAKYL       0.17      0.20      0.18         5
   SSLENFRAYV       0.75      0.75      0.75        24
     SSPPMFRV       0.67      0.77      0.71        26
    SSYRRPVGI       0.80      0.73      0.76        51
     TVYGFCLL       0.56      0.62      0.59         8

     accuracy                           0.69       178
    macro avg       0.53      0.53      0.53       178
 weighted avg       0.69      0.69      0.69       178



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(178, 711)

In [6]:
from sklearn.model_selection import GridSearchCV
# X_train is a square train-by-train distance matrix, so the estimator has to
# be told the input is precomputed. With the default metric each row of
# distances would be treated as a feature vector, and the selected k would not
# apply to the precomputed-distance k-NN model fitted earlier.
# Note: this rebinds `knn` to a new, unfitted estimator.
knn = KNeighborsClassifier(metric='precomputed')

# Candidate neighbourhood sizes: k = 1 .. 24.
param_grid = {'n_neighbors': np.arange(1, 25)}

# 5-fold cross-validated grid search over k.
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)

# Best-scoring k.
best_k = knn_gscv.best_params_['n_neighbors']
print(f"Best k value: {best_k}")



Best k value: 1


In [7]:
from sklearn.svm import SVC
# Build a linear-kernel SVM (other kernels could be tried as well) and fit it
# on the training block in one step; `fit` returns the estimator itself.
svm = SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the test block.
y_pred = svm.predict(X_test)

# Overall accuracy, printed as a raw fraction.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1 summary.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.6685393258426966
               precision    recall  f1-score   support

ADLIAYLKQATKG       0.00      0.00      0.00         1
    ASNENMETM       0.62      0.81      0.70        26
    HGIRNASFI       0.82      0.56      0.67        25
    LSLRNPILV       0.50      0.27      0.35        11
    QLSPFPFDL       0.50      1.00      0.67         1
RVSYYGPKTSPVQ       0.00      0.00      0.00         0
    SQLLNAKYL       0.50      0.20      0.29         5
   SSLENFRAYV       0.71      0.71      0.71        24
     SSPPMFRV       0.83      0.73      0.78        26
    SSYRRPVGI       0.67      0.75      0.70        51
     TVYGFCLL       0.42      0.62      0.50         8

     accuracy                           0.67       178
    macro avg       0.51      0.51      0.49       178
 weighted avg       0.68      0.67      0.66       178



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
