In [None]:
# Predict epitopes for MusMusculus TCRs after combining alpha and beta chains.

import pandas as pd

# Load the tab-separated VDJdb export; the first row holds the column names.
data = pd.read_csv('./vdjdb.txt', sep='\t', header=0)

# Keep only the MusMusculus rows. The filtered frame has to be assigned back:
# a bare filter expression in the middle of a cell is neither displayed nor
# stored, so it would leave `data` unfiltered.
data = data[data['species'] == 'MusMusculus']

# Discard rows whose 'vdjdb.score' is 0.
data = data[data['vdjdb.score'] != 0]
data

In [None]:
# Metadata columns that are not used downstream. The MHC columns ('mhc.a',
# 'mhc.b', 'mhc.class') are intentionally NOT dropped: the feature list and the
# one-hot encoder later in this notebook select their merged '_x' / '_y'
# variants, which only exist if both chain tables still carry them.
columns_to_drop = ['antigen.species','antigen.gene','reference.id', 'method', 'meta','cdr3fix','web.method','web.method.seq','web.cdr3fix.nc','web.cdr3fix.unmp']

# Reassign instead of dropping in place, and ignore already-missing columns,
# so the cell can be re-run without raising KeyError.
data = data.drop(columns=columns_to_drop, errors='ignore')

# Remove every row that has a missing value in any remaining column.
df = data.dropna()

# Split the records by chain: 'TRA' rows and 'TRB' rows.
data_alpha = df[df['gene'] == 'TRA'].copy()
data_beta = df[df['gene'] == 'TRB'].copy()

# Rows with complex.id == 0 are excluded before pairing
# (presumably 0 marks records without a paired chain -- TODO confirm).
data_alpha = data_alpha[data_alpha['complex.id'] != 0]
data_beta = data_beta[data_beta['complex.id'] != 0]

# Inner-join the two chain tables on complex.id. Overlapping columns get the
# default suffixes: '_x' for the TRA side and '_y' for the TRB side.
data_combined = pd.merge(data_alpha, data_beta, on='complex.id')
data_combined = data_combined.drop_duplicates()

# Persist the de-duplicated *combined* table under the combined file name.
data_combined.to_csv('mouse_combined.csv')
data_combined

In [None]:
# Number of paired records per epitope (epitope taken from the '_x' side of the merge).
class_counts = data_combined['antigen.epitope_x'].value_counts()

# Epitopes that occur exactly once are removed from the table.
single_classes = class_counts.loc[class_counts.eq(1)].index
is_single_class = data_combined['antigen.epitope_x'].isin(single_classes)
data_combined = data_combined.loc[~is_single_class]

# NOTE(review): the original notes here mention computing a distance matrix from
# `cdr3_a_aa`, `v_a_gene` and `j_a_gene`, with the details depending on the chosen
# algorithm and data type. No distance matrix is computed in this cell.

# Feature columns and target column.
feature_columns = [
    'cdr3_x', 'cdr3_y',
    'v.segm_x', 'j.segm_x', 'v.segm_y', 'j.segm_y',
    'mhc.a_y', 'mhc.b_y', 'mhc.class_y',
    'mhc.a_x', 'mhc.b_x', 'mhc.class_x',
]
target_column = 'antigen.epitope_x'

# Extract the features and the target from the filtered table.
X = data_combined.loc[:, feature_columns]
y = data_combined.loc[:, target_column]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Columns handled by each of the two one-hot encoders.
segment_and_mhc_columns = [
    'v.segm_x', 'j.segm_x', 'v.segm_y', 'j.segm_y',
    'mhc.a_y', 'mhc.b_y', 'mhc.class_y',
    'mhc.a_x', 'mhc.b_x', 'mhc.class_x',
]
cdr3_columns = ['cdr3_x', 'cdr3_y']

# One-hot encode both column groups; every other column is dropped.
# Only the CDR3 encoder is configured to ignore categories unseen during fit.
column_trans = ColumnTransformer(
    transformers=[
        ('one_hot_encoder_vj', OneHotEncoder(), segment_and_mhc_columns),
        ('one_hot_encoder_cdr3', OneHotEncoder(handle_unknown='ignore'), cdr3_columns),
    ],
    remainder='drop',
)

# The encoder is fitted on the full combined table, before splitting.
X_encoded = column_trans.fit_transform(data_combined)

# Stratified 80/20 split into training and test sets.
split_parts = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.utils.class_weight import compute_class_weight

# 5. Compute 'balanced' class weights from the training labels.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
weights = {label: weight for label, weight in zip(train_classes, class_weights)}

# 6. Train a random forest that uses the per-class weights.
rf_classifier_weighted = RandomForestClassifier(random_state=30, class_weight=weights)
rf_classifier_weighted.fit(X_train, y_train)

# 7. Predict on the held-out test set.
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# 8. Build the classification report (stored, not printed).
classification_report_weighted = classification_report(
    y_test, y_pred_weighted, zero_division=0
)

# 9. Accuracy plus macro-averaged precision, recall and F1.
macro_kwargs = dict(average='macro', zero_division=0)
accuracy = accuracy_score(y_test, y_pred_weighted)
precision = precision_score(y_test, y_pred_weighted, **macro_kwargs)
recall = recall_score(y_test, y_pred_weighted, **macro_kwargs)
f1 = f1_score(y_test, y_pred_weighted, **macro_kwargs)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
# Pipeline summary from the original notes: one-hot encoding, random
# oversampling, random forest prediction.
# NOTE(review): this cell applies class weights; no oversampling step is visible here.