In [102]:
import pandas as pd
data=pd.read_csv('./vdjdb.txt',sep='\t', header=0)
columns_to_drop = ['complex.id','reference.id', 'method', 'meta','cdr3fix','web.method','web.method.seq','web.cdr3fix.nc','web.cdr3fix.unmp','mhc.a','mhc.b','mhc.class']
data.drop(columns=columns_to_drop, inplace=True)
data=data[data['vdjdb.score'] != 0]
data

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,antigen.epitope,antigen.gene,antigen.species,vdjdb.score
0,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,FLKEKGGL,Nef,HIV-1,2
1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,FLKEKGGL,Nef,HIV-1,2
2,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,FLKEKGGL,Nef,HIV-1,2
3,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,FLKEKGGL,Nef,HIV-1,2
4,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,FLKEKGGL,Nef,HIV-1,2
...,...,...,...,...,...,...,...,...,...
92686,TRB,CASSQGSGGNEQFF,TRBV4-3*01,TRBJ2-1*01,HomoSapiens,FPQPEQPFPWQP,Gluten,Wheat,2
92689,TRA,CAASVLYGSSNTGKLIF,TRAV29/DV5*01,TRAJ37*01,HomoSapiens,QLQPFPQPELPY,Gluten,Wheat,2
92690,TRB,CASSIVGSGGYNEQFF,TRBV19*01,TRBJ2-1*01,HomoSapiens,QLQPFPQPELPY,Gluten,Wheat,2
92767,TRA,CAPQGATNKLIF,TRAV12-2*01,TRAJ32*01,HomoSapiens,PQQPFPQPEQPFP,Gluten,Wheat,2


In [103]:
class_counts = data['antigen.epitope'].value_counts()

# Find classes that have only one instance
single_classes = class_counts[class_counts == 1].index


# Remove rows where 'antigen.epitope' belongs to classes with only one instance
data_filtered = data[~data['antigen.epitope'].isin(single_classes)]
data=data_filtered
data = data.dropna()
data.to_csv('data.csv')

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# 2. 为 `v_a_gene` 和 `j_a_gene` 以及 `cdr3_a_aa` 应用独热编码
column_trans = ColumnTransformer(
    [
        ('one_hot_encoder_vj', OneHotEncoder(), ['v.segm', 'j.segm']),
        ('one_hot_encoder_cdr3', OneHotEncoder(handle_unknown='ignore'), ['cdr3'])
    ],
    remainder='drop'
)

X_encoded = column_trans.fit_transform(data)
y = data['antigen.epitope']

# 3. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 4. 训练逻辑回归模型
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)

# 5. 进行预测
y_pred = model.predict(X_test)

# 6. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.4788391777509069
                          precision    recall  f1-score   support

               AAGIGILTV       0.25      0.33      0.29         3
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
              ALDPHSGHFV       0.00      0.00      0.00         3
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         3
              ALQIPFAMQM       0.00      0.00      0.00         2
               ALSKGVHFV       0.00      0.00      0.00         4
               ALSPVIPHI       1.00      1.00      1.00         4
               ALTPVVVTL       0.00      0.00      0.00         1
               ALWEIQQVV       0.00      0.00      0.00         2
               ALWGFFPVL       1.00      1.00      1.00         2
               ALYGFVPVL       0.00      0.00      0.00         1
           ANGVAFFLTPFKA       0.00      0.00      0.00         1
            APFSEQEQPVLG       0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Ensure all necessary variables are defined and processed correctly
labels = data['antigen.epitope']
features = data[['v.segm', 'j.segm','cdr3','gene','species','vdjdb.score']]
features_encoded = pd.get_dummies(features)

# Splitting the dataset into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# Calculate class weights based on the training labels
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights = dict(zip(np.unique(y_train), class_weights))

# Training the Random Forest model with class weights
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train, y_train)

# Predicting the test results
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# Generating and displaying the classification report
classification_report_weighted = classification_report(y_test, y_pred_weighted)
classification_report_weighted

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 计算准确率
accuracy = accuracy_score(y_test, y_pred_weighted)
print(f'Accuracy: {accuracy:.2f}')

# 计算精确度
precision = precision_score(y_test, y_pred_weighted, average='macro', zero_division=0)
print(f'Precision: {precision:.2f}')

# 计算召回率
recall = recall_score(y_test, y_pred_weighted, average='macro', zero_division=0)
print(f'Recall: {recall:.2f}')

# 计算F1分数
f1 = f1_score(y_test, y_pred_weighted, average='macro', zero_division=0)
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.58
Precision: 0.24
Recall: 0.23
F1 Score: 0.22


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# 2. 使用 OneHotEncoder 对所有分类特征进行独热编码
column_trans = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), ['v.segm', 'j.segm','cdr3','gene','species','vdjdb.score'])
    ],
    remainder='drop'
)

X_encoded = column_trans.fit_transform(data)
y = data['antigen.epitope']

# 3. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 4. 训练 KNN 模型
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# 5. 进行预测
y_pred = knn_model.predict(X_test)

# 6. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro'))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 Score:', f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred))



Accuracy: 0.5035167563094746
Precision: 0.1417566738317319
Recall: 0.15836085321808382
F1 Score: 0.1375705619694349
                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         5
               AAGIGILTV       0.14      0.50      0.22         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
         AAVVRFQEAANKQKQ       0.00      0.00      0.00         0
          ADGLAYFRSSFKGG       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.00      0.00      0.00         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       0.12      1.00      0.22         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# 2. 特征和标签选择
labels = data['antigen.epitope']
features = data[['v.segm', 'j.segm', 'cdr3', 'gene', 'species', 'vdjdb.score']]

# 3. 应用独热编码
onehot_encoder = OneHotEncoder()
features_encoded = onehot_encoder.fit_transform(features)

# 4. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# 5. 计算类别权重
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights = dict(zip(np.unique(y_train), class_weights))

# 6. 训练随机森林模型
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train, y_train)

# 7. 进行预测
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# 8. 生成和显示分类报告
classification_report_weighted = classification_report(y_test, y_pred_weighted, zero_division=0)
print(classification_report_weighted)

# 9. 计算和显示准确率、精确度、召回率和F1分数
accuracy = accuracy_score(y_test, y_pred_weighted)
precision = precision_score(y_test, y_pred_weighted, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred_weighted, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_weighted, average='macro', zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         5
               AAGIGILTV       0.50      0.50      0.50         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
           ADLIAYLEQATKG       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.00      0.00      0.00         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       1.00      1.00      1.00         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSKGVHFV       0.00      0.00      0.00         6
               ALSPVIPHI       0.71      1.00      0.83         5
         

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# 2. 特征和标签选择
labels = data['antigen.epitope']
features = data[['cdr3']]

# 3. 应用独热编码
onehot_encoder = OneHotEncoder()
features_encoded = onehot_encoder.fit_transform(features)

# 4. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# 5. 计算类别权重
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights = dict(zip(np.unique(y_train), class_weights))

# 6. 训练随机森林模型
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train, y_train)

# 7. 进行预测
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# 8. 生成和显示分类报告
classification_report_weighted = classification_report(y_test, y_pred_weighted, zero_division=0)
print(classification_report_weighted)

# 9. 计算和显示准确率、精确度、召回率和F1分数
accuracy = accuracy_score(y_test, y_pred_weighted)
precision = precision_score(y_test, y_pred_weighted, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred_weighted, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_weighted, average='macro', zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         5
               AAGIGILTV       0.00      0.00      0.00         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
           ADLIAYLEQATKG       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.00      0.00      0.00         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       1.00      1.00      1.00         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSKGVHFV       0.00      0.00      0.00         6
               ALSPVIPHI       1.00      1.00      1.00         5
         

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# 假设 'target' 是目标变量，其余的是特征
labels = data['antigen.epitope']
features = data[['v.segm', 'j.segm', 'cdr3', 'gene', 'species', 'vdjdb.score']]

# 应用独热编码
encoder = OneHotEncoder()
features_encoded = encoder.fit_transform(features)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# 应用SMOTE进行过采样处理
smote = SMOTE(random_state=42,k_neighbors=2)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 使用随机森林模型
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)

# 预测测试集结果
y_pred = rf.predict(X_test)

# 输出分类报告
print(classification_report(y_test, y_pred))


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, n_samples_fit = 2, n_samples = 2

In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN

# 读取数据
labels = data['antigen.epitope']
features = data[['v.segm', 'j.segm', 'cdr3', 'gene', 'species', 'vdjdb.score']]

# 应用独热编码
encoder = OneHotEncoder()
features_encoded = encoder.fit_transform(features)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# 应用SMOTEENN进行过采样和欠采样
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# 训练随机森林模型
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_resampled, y_resampled)

# 进行预测
y_pred = rf_classifier.predict(X_test)

# 输出分类报告
print(classification_report(y_test, y_pred))


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 5, n_samples = 5

In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks


In [90]:
# 使用聚类中心进行欠采样
def cluster_centroids_undersampling(X_train, y_train):
    cc = ClusterCentroids(random_state=42)
    X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
    return X_resampled, y_resampled


In [91]:
# 随机欠采样代码
def random_undersampling(X_train, y_train):
    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
    return X_resampled, y_resampled


In [92]:
# 使用Tomek Links进行欠采样
def tomek_links_undersampling(X_train, y_train):
    tl = TomekLinks()
    X_resampled, y_resampled = tl.fit_resample(X_train, y_train)
    return X_resampled, y_resampled


In [119]:
# 加载数据
labels = data['antigen.epitope']
features = data[['v.segm', 'j.segm', 'cdr3', 'gene', 'species', 'vdjdb.score']]

# 应用独热编码
encoder = OneHotEncoder()
features_encoded = encoder.fit_transform(features)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

# 应用随机欠采样
X_train_rus, y_train_rus = random_undersampling(X_train, y_train)

# 应用Cluster Centroids
X_train_cc, y_train_cc = cluster_centroids_undersampling(X_train, y_train)

# 应用Tomek Links
X_train_tl, y_train_tl = tomek_links_undersampling(X_train, y_train)


In [120]:
# 训练随机森林模型并打印分类报告
def train_and_evaluate(X_train, y_train, X_test, y_test):
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    print(classification_report(y_test, y_pred))

# 对每种方法训练和评估模型

print("Tomek Links Results:")
train_and_evaluate(X_train_tl, y_train_tl, X_test, y_test)


Tomek Links Results:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         5
               AAGIGILTV       0.20      0.50      0.29         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
         AAVVRFQEAANKQKQ       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.25      1.00      0.40         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       1.00      1.00      1.00         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSKGVHFV       0.00      0.00      0.00         6
               ALSPVIPHI       1.00      1.00      1.00         5
         

In [101]:
from sklearn.model_selection import GridSearchCV

# 设置参数网格
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 创建随机森林和GridSearchCV对象
forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# 训练使用 GridSearchCV
grid_search.fit(X_train, y_train)

# 打印最佳参数和最佳模型的得分
print("最佳参数：", grid_search.best_params_)
print("最高准确率：", grid_search.best_score_)

# 使用最佳模型预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))




KeyboardInterrupt: 

In [121]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
# 初始化随机过采样器
ros = RandomOverSampler(random_state=42)

# 应用随机过采样
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# 检查过采样后的类别分布
print("类别分布：\n", pd.Series(y_train_ros).value_counts())


类别分布：
 QYIKWPWYI           998
AVGVGKSAL           998
APFSEQEQPVLG        998
QYDPVAALF           998
HRRGSRSYV           998
                   ... 
HSKKKCDEL           998
MMWDRGLGMM          998
DRFYKTLRAEQASQEV    998
RPPIFIRRL           998
RLGEVRHPV           998
Name: antigen.epitope, Length: 453, dtype: int64


In [122]:

# 5. 计算类别权重
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_ros), y=y_train_ros)
weights = dict(zip(np.unique(y_train_ros), class_weights))

# 6. 训练随机森林模型
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train_ros, y_train_ros)

# 7. 进行预测
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# 8. 生成和显示分类报告
classification_report_weighted = classification_report(y_test, y_pred_weighted, zero_division=0)
print(classification_report_weighted)

# 9. 计算和显示准确率、精确度、召回率和F1分数
accuracy = accuracy_score(y_test, y_pred_weighted)
precision = precision_score(y_test, y_pred_weighted, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred_weighted, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_weighted, average='macro', zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         5
               AAGIGILTV       0.50      0.50      0.50         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
           ADLIAYLEQATKG       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.00      0.00      0.00         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       1.00      1.00      1.00         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSKGVHFV       0.00      0.00      0.00         6
               ALSPVIPHI       0.71      1.00      0.83         5
         

In [123]:
# 5. 训练逻辑回归模型
model = LogisticRegression(max_iter=1000)
model.fit(X_train_ros, y_train_ros)

# 6. 进行预测
y_pred = model.predict(X_test)

# 7. 评估模型
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5610260653702938
                          precision    recall  f1-score   support

               AAFKRSCLK       1.00      0.20      0.33         5
               AAGIGILTV       0.50      0.50      0.50         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
           ADLIAYLEQATKG       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.00      0.00      0.00         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       1.00      1.00      1.00         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSKGVHFV       0.00      0.00      0.00         6
               ALSPVIPHI       0.71      1.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [125]:
column_trans = ColumnTransformer(
    [
        ('one_hot_encoder_vj', OneHotEncoder(), ['v.segm', 'j.segm', 'cdr3', 'gene', 'species', 'vdjdb.score']),
        ('one_hot_encoder_cdr3', OneHotEncoder(handle_unknown='ignore'), ['cdr3'])
    ],
    remainder='drop'
)

X_encoded = column_trans.fit_transform(data)
y = data['antigen.epitope']

# 3. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
# 初始化随机过采样器
ros = RandomOverSampler(random_state=42)

# 应用随机过采样
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# 检查过采样后的类别分布
print("类别分布：\n", pd.Series(y_train_ros).value_counts())


类别分布：
 QYIKWPWYI           998
AVGVGKSAL           998
APFSEQEQPVLG        998
QYDPVAALF           998
HRRGSRSYV           998
                   ... 
HSKKKCDEL           998
MMWDRGLGMM          998
DRFYKTLRAEQASQEV    998
RPPIFIRRL           998
RLGEVRHPV           998
Name: antigen.epitope, Length: 453, dtype: int64


In [127]:
# 5. 计算类别权重
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_ros), y=y_train_ros)
weights = dict(zip(np.unique(y_train_ros), class_weights))
#*
# 6. 训练随机森林模型
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train_ros, y_train_ros)

# 7. 进行预测
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# 8. 生成和显示分类报告
classification_report_weighted = classification_report(y_test, y_pred_weighted, zero_division=0)
print(classification_report_weighted)

# 9. 计算和显示准确率、精确度、召回率和F1分数
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_pred_weighted)
precision = precision_score(y_test, y_pred_weighted, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred_weighted, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred_weighted, average='macro', zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

                          precision    recall  f1-score   support

               AAFKRSCLK       0.00      0.00      0.00         5
               AAGIGILTV       0.50      0.50      0.50         2
         AALALLLLDRLNQLE       0.00      0.00      0.00         1
               AALPILFQV       0.00      0.00      0.00         1
           ADLIAYLEQATKG       0.00      0.00      0.00         0
           ADLIAYLKQATKG       0.00      0.00      0.00         1
    ADSLSFFSSSIKRGGGSLVP       0.00      0.00      0.00         1
              ALDPHSGHFV       1.00      1.00      1.00         1
               ALGIGILTV       0.00      0.00      0.00         0
               ALHGGWTTK       0.00      0.00      0.00         1
               ALLLQLFTL       0.00      0.00      0.00         1
               ALLPGLPAA       0.00      0.00      0.00         1
               ALSKGVHFV       0.00      0.00      0.00         6
               ALSPVIPHI       1.00      1.00      1.00         5
         