This notebook applies an XGBoost classification model to predict the responder status of HIV patients. Patients are classified using their HIV reverse transcriptase (RT) and protease (PR) sequences together with clinical data. The sequences have been embedded with the ProtT5 transformer. The model labels patients as "1" if they have shown improvement and "0" if they have not, which helps identify patients who are responding positively to treatment.


In [None]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.datasets import make_classification
import joblib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Load the raw training table. Its displayed columns are PatientID, Resp (label),
# PR Seq / RT Seq (nucleotide sequences), VL-t0 and CD4-t0
# (presumably baseline viral load and CD4 count — confirm against the data description).
train_df = pd.read_csv('/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/HIV training data.csv')
train_df

Unnamed: 0,PatientID,Resp,PR Seq,RT Seq,VL-t0,CD4-t0
0,1,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAGCTAAAGCCAGGAA...,4.30,145
1,2,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.60,224
2,3,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAAGGTAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.20,1017
3,4,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAGGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.70,206
4,5,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAGTAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.50,572
...,...,...,...,...,...,...
995,996,0,,CCCATTAGTCCTATTGARACTGTACCAGTAMAATTAAAGCCAGGAA...,3.15,354
996,997,0,,CCCATYAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.50,50
997,998,0,,CCCATYAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.10,369
998,999,0,,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.37,127


In [None]:
# Load the precomputed embedding table for the training RT sequences
# (presumably the ProtT5 output mentioned in the introduction — confirm how the file was produced).
train_RT = pd.read_csv('/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/embedding_1000_RT_train.csv')

In [None]:
# Display the embedding table to inspect its orientation before it is transposed for PCA.
train_RT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.3076,0.2925,0.3323,0.29470,0.3333,0.3198,0.3267,0.3184,0.33420,0.32960,...,0.33540,0.32810,0.33760,0.2940,0.27780,0.3313,0.29320,0.3300,0.2996,0.28600
1,-0.1120,-0.1315,-0.1415,-0.14940,-0.1362,-0.1567,-0.1387,-0.1299,-0.11993,-0.13460,...,-0.07940,-0.10345,-0.11017,-0.1345,-0.13180,-0.1196,-0.13750,-0.0939,-0.0997,-0.11926
2,0.2335,0.1552,0.1951,0.25510,0.2045,0.2058,0.2085,0.2341,0.19970,0.20480,...,0.24740,0.28370,0.19840,0.2026,0.22920,0.2233,0.25240,0.2133,0.2311,0.24610
3,0.4202,0.4675,0.4490,0.35280,0.4297,0.4465,0.4370,0.4783,0.44340,0.43100,...,0.40400,0.41920,0.43260,0.4375,0.44290,0.4043,0.43140,0.4448,0.4487,0.43160
4,-0.4194,-0.4146,-0.4304,-0.40840,-0.3943,-0.4204,-0.3977,-0.4421,-0.43580,-0.42400,...,-0.41500,-0.42430,-0.42000,-0.3872,-0.42360,-0.3767,-0.40330,-0.4314,-0.4380,-0.40200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506875,-0.1212,-0.0097,-0.1721,-0.02313,-0.1598,-0.1880,-0.1732,-0.0695,-0.16280,-0.17050,...,-0.10876,-0.07480,-0.15300,-0.1415,-0.14170,-0.1649,-0.23350,-0.0856,-0.1819,-0.14300
506876,-0.1771,-0.2377,-0.1873,-0.06340,-0.1995,-0.2229,-0.2688,-0.3027,-0.22570,-0.22050,...,-0.14880,-0.18970,-0.31570,-0.3630,-0.19230,-0.2512,-0.22360,-0.1740,-0.1971,-0.24870
506877,0.2720,0.3340,0.3310,0.13510,0.2898,0.2515,0.3313,0.3992,0.31620,0.29440,...,0.30350,0.34060,0.20000,0.4230,0.28780,0.2820,0.27660,0.3157,0.3506,0.37740
506878,-0.1855,-0.1979,-0.1750,-0.25900,-0.1860,-0.1677,-0.1406,-0.2437,-0.13950,-0.20740,...,-0.11730,-0.06730,-0.16860,-0.1644,-0.10370,-0.1540,-0.19210,-0.0882,-0.0974,-0.09955


In [None]:
# Training target: the 'Resp' column of the raw training table (values 0/1 in the displayed output).
train_label=train_df['Resp']
train_label

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: Resp, Length: 1000, dtype: int64

PCA

In [None]:
# Swap rows and columns so that each original column becomes one row (one PCA sample) in the fit below.
# NOTE: train_RT is rebound to its own transpose, so this cell is not idempotent —
# running it a second time flips the table back to its original orientation.
train_RT = train_RT.transpose()

PCA in parts

We performed PCA in parts because the RAM could not hold the training and test data at the same time. We therefore fitted PCA on the training data, saved the fitted PCA object, and later applied it to the test data using the saved object.

In [None]:
# Fit PCA on the (transposed) training embeddings and persist both the projected
# data and the fitted PCA object, so the test set can be projected later without
# holding both data sets in memory at once.
N_COMPONENTS = 100  # number of principal components kept

# random_state pins the randomized SVD solver that scikit-learn may select for
# large inputs, so re-running the cell reproduces the same components.
pca = PCA(n_components=N_COMPONENTS, random_state=42)
X_train_pca = pca.fit_transform(train_RT)

# Fail loudly instead of silently writing NaN labels if the embedding rows and
# the label vector do not describe the same number of patients.
if X_train_pca.shape[0] != len(train_label):
    raise ValueError(
        f"PCA output has {X_train_pca.shape[0]} rows but there are {len(train_label)} training labels"
    )

# Creating a DataFrame for the training data after PCA; column names are derived
# from the actual output width rather than a repeated literal.
columns = [f"PC{i+1}" for i in range(X_train_pca.shape[1])]
train_data_pca = pd.DataFrame(X_train_pca, columns=columns)
train_data_pca["Resp"] = train_label

# Saving the DataFrame of training data after PCA to a CSV file
train_data_pca.to_csv("new_pca_RT_train_final.csv", index=False)

# Saving the PCA object for future use (it is re-loaded to transform the test data)
joblib.dump(pca, 'pca_model_final.pkl')

In [None]:
# Load the raw test table. It has the same displayed columns as the training table,
# but its 'Resp' column holds the placeholder 'H' in the shown rows, not a 0/1 label.
test_df= pd.read_csv('/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/test_data.csv')
test_df

Unnamed: 0,PatientID,Resp,PR Seq,RT Seq,VL-t0,CD4-t0
0,1,H,NCTCTATTAGATACAGGAGCAGATGACACAGTATTAGAAGARATGG...,CCTATTAGTCCTATTGAAACTGTACCAGTRAAATTAAAGCCAGGAA...,5.60,69
1,2,H,NCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGA...,CCCATCAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.30,119
2,3,H,GGGCAAATAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.70,41
3,4,H,GGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.20,48
4,5,H,GGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATA...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.50,311
...,...,...,...,...,...,...
687,688,H,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.30,366
688,689,H,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.90,151
689,690,H,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.90,411
690,691,H,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.50,268


In [None]:
# Load the precomputed embedding table for the test RT sequences
# (presumably produced the same way as the training embeddings — confirm).
test_RT= pd.read_csv('/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/embedding_results_test_RT.csv')
test_RT

In [None]:
# Ground-truth test labels: read the label file and keep only its
# 'ResponderStatus' column as a Series.
test_label = pd.read_csv(
    '/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/test_lable.csv'
)['ResponderStatus']
test_label

0      1
1      0
2      1
3      0
4      1
      ..
687    1
688    1
689    1
690    1
691    1
Name: ResponderStatus, Length: 692, dtype: int64

In [None]:
# Transpose the test embeddings, mirroring the transpose applied to the training embeddings before PCA.
# NOTE: test_RT is rebound to its own transpose, so re-running this cell flips it back.
test_RT= test_RT.transpose()

Applying the fitted PCA to the test set

In [None]:
# Project the test embeddings with the PCA that was fitted on the training data.
# The training cell saves the fitted object as 'pca_model_final.pkl'; adjust the
# path below if the file was moved (for example onto Google Drive).
# NOTE: joblib.load unpickles the file, so only load a file you created yourself.
PCA_MODEL_PATH = 'pca_model_final.pkl'
pca = joblib.load(PCA_MODEL_PATH)

# Transform only (no re-fit), so train and test share the same components
X_test_pca = pca.transform(test_RT)

# Fail loudly instead of silently writing NaN labels if the embedding rows and
# the label vector do not describe the same number of patients.
if X_test_pca.shape[0] != len(test_label):
    raise ValueError(
        f"PCA output has {X_test_pca.shape[0]} rows but there are {len(test_label)} test labels"
    )
columns = [f"PC{i+1}" for i in range(X_test_pca.shape[1])]

# Creating a DataFrame for the test data after PCA
test_data_pca = pd.DataFrame(X_test_pca, columns=columns)
test_data_pca["ResponderStatus"] = test_label

# Saving the DataFrame of test data after PCA to a CSV file
test_data_pca.to_csv("pca_RT_test.csv", index=False)

In [None]:
# Stored PCA projection of the training RT embeddings; the 'Resp' label column
# is removed so that only the PC feature columns remain.
train_RT_PCA = pd.read_csv(
    '/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/מחברות-PR,RT,ALL FATIRES/final/pca_train_fit_RT_1000_seq.csv'
).drop(columns=['Resp'])
train_RT_PCA

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,PC100
0,19.468535,16.690811,39.408652,13.798016,1.545957,0.839309,1.291928,-1.123200,1.249711,-4.614261,...,-1.704164,0.437927,0.537436,2.644502,-1.485645,0.925236,1.315919,-0.469013,2.126996,0.450310
1,-17.164891,18.642016,-8.327178,0.815992,-2.454045,-20.420742,2.766523,14.267944,-8.109614,17.698266,...,-0.041639,-2.454362,-0.922559,0.216361,-4.471073,-0.397673,-3.046035,-0.667115,-1.308287,2.784812
2,-11.595662,20.524892,-10.055025,-4.423242,-7.918139,1.582351,0.388201,2.791286,-3.649752,-1.697527,...,3.679493,2.157192,2.519446,1.932880,-0.289343,2.733469,-1.500422,-0.236010,-2.858274,0.253565
3,98.324098,-23.737515,-9.951965,11.856986,-3.501389,2.479090,-1.278360,-1.445255,-2.562276,0.913382,...,0.298209,1.436515,-0.436911,3.708288,0.139090,-0.923414,-0.370323,0.612296,-2.744184,-0.574743
4,-9.539609,19.129397,-10.609693,-7.536527,-4.329402,-15.510846,17.040986,0.243370,2.612851,-3.512832,...,-2.253495,2.356092,1.062744,2.336709,-1.709266,0.220020,1.486058,-0.169757,0.022307,-2.237143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-13.066772,19.815587,-9.970176,13.138114,0.518502,-12.331159,-0.237316,8.328650,-8.414628,-4.884303,...,1.782784,-3.157702,-0.450952,-1.699169,-3.449650,2.218817,1.962050,-0.542329,2.506513,-0.954538
996,-8.882492,22.444915,-5.651161,-10.883909,-0.875066,4.132189,-17.132928,-1.102329,10.678666,-0.690484,...,2.821055,0.509456,-0.090572,2.125578,-1.533416,-1.133331,-2.608001,-1.362068,-1.268701,1.887736
997,-22.016576,1.729764,-6.724420,26.251746,17.976849,10.364271,3.145767,-5.488966,2.094159,1.997147,...,-0.325549,2.369301,0.654479,-0.664620,-2.053382,0.805898,-0.065541,-1.339175,0.821416,2.229429
998,-22.207060,-10.417325,-3.970456,4.376329,23.331556,8.280640,14.628940,-13.825987,12.061811,2.658102,...,-0.239960,-1.655548,-1.081321,-3.865851,4.487018,-1.123869,-0.909715,1.793193,-1.114843,-2.468145


In [None]:
# Stored PCA projection of the test RT embeddings. No label column is dropped here;
# the displayed frame contains only PC1..PC100.
test_RT_PCA= pd.read_csv('/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/מחברות-PR,RT,ALL FATIRES/final/pca_test_transform_RT_1000_seq.csv')
test_RT_PCA

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,PC100
0,96.738785,-22.183756,-7.615487,-11.342175,2.628930,-1.235586,-0.693187,-2.135332,8.242574,1.961689,...,1.307903,0.042740,2.218423,-1.780101,2.337994,1.997186,0.592466,-0.654364,-1.323716,-1.401459
1,96.555974,-22.333141,-8.779864,-8.581465,4.271654,0.901243,-6.476556,2.975004,0.430718,-0.159567,...,0.742278,1.531182,-1.226408,3.347541,-0.359025,3.140640,1.313073,-0.995234,-3.674432,-0.599512
2,-10.231989,17.473122,-8.752664,-21.265517,-2.302233,1.629605,3.593105,-7.310579,7.584506,9.525608,...,0.833450,-2.345209,-1.020997,-2.470617,0.706749,1.839725,-3.828726,2.296222,-0.960563,-1.848076
3,98.853773,-24.519099,-11.033540,6.475513,-0.415112,-1.420998,2.006645,0.229880,-1.189474,-0.313785,...,-1.361551,-2.513094,1.220160,2.237722,3.218172,1.200747,0.901825,1.010021,2.128166,-0.895052
4,-12.194039,19.650747,-8.716043,-0.502945,5.083546,-3.550506,-11.888924,9.207689,2.652391,-9.991116,...,0.797339,-1.649128,2.126768,-1.092917,0.526183,-0.867072,0.214048,-0.557311,-2.816051,-0.153161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,-42.312277,-46.918843,3.319111,8.526868,-11.544565,-0.018931,1.363414,5.300386,-9.635414,4.547047,...,-0.795898,1.025699,-0.765660,1.275599,-2.604298,-0.971486,-3.939393,-1.346038,0.690217,-0.659414
688,99.110047,-24.205776,-10.383340,-5.865726,2.541446,3.003599,-8.088880,5.430029,-2.508339,-2.970422,...,1.183093,-2.119951,3.069150,0.870366,-1.000629,0.299059,-2.630532,-0.901062,0.084201,0.359060
689,-14.159698,4.808339,-9.870978,-4.432865,8.483870,0.318714,23.842161,-6.011801,-0.947047,-0.709332,...,0.296958,-2.994017,0.414361,0.720005,-0.154325,1.490132,-1.110214,-0.284734,-4.827731,-1.236673
690,-23.972309,9.013892,-8.333592,8.330632,16.234753,-5.995284,-8.883605,12.700753,-2.496752,18.969742,...,0.484112,-0.750897,-3.587304,-0.544773,1.529914,2.205275,2.800087,-1.480479,-1.355578,-0.388843


In [None]:
# Stored PCA projection of the training PR embeddings, with the 'Resp' label
# column removed so that only the PC feature columns remain.
train_PR_PCA = pd.read_csv(
    '/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/PCA_train_test/pca_PR_train.csv'
).drop(columns=['Resp'])
train_PR_PCA

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,PC100
0,-1.885705,-0.651117,11.843484,4.869803,-1.839872,-1.211086,-1.339022,-5.662424,-2.658699,-2.294969,...,1.642744,1.330677,-0.572675,2.124031,2.439378,1.012206,-1.305197,-0.847549,-0.354215,-0.599726
1,8.164608,-11.079589,6.101052,1.325754,1.939282,-2.708731,1.215843,-3.561059,-7.453094,-3.030263,...,-0.355921,-0.000113,-0.412133,0.574142,1.053504,-0.123772,0.190279,0.687328,-0.959674,-0.022768
2,-8.200550,7.256495,-5.723636,3.398085,-4.638080,4.988092,0.015292,0.147888,-8.151237,0.377515,...,0.241881,-0.417225,-0.508514,-0.406622,-0.051119,-0.084291,-0.566928,0.938009,-0.905622,-0.086298
3,-8.060616,5.178030,-3.000635,5.470617,5.086985,2.029443,-0.753842,2.374189,-3.562362,-0.106443,...,0.890827,-0.192852,1.158719,-0.999682,0.770241,-1.580742,1.128355,0.996155,-0.273399,0.681298
4,-7.668880,4.791262,-2.451455,0.302582,6.092274,-0.743319,-0.482994,3.074220,-0.092215,-0.252762,...,0.767926,-0.053625,-0.334521,0.119323,-0.926653,-1.525049,0.217797,0.153457,-1.594505,0.821017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,59.871827,54.162080,8.097254,-3.236060,12.931488,16.105585,24.186987,-13.588010,2.861421,2.477015,...,1.183771,-0.090672,0.370140,1.937033,0.962456,1.919436,-0.779699,-0.142160,0.731794,0.580605
916,59.070576,54.637252,8.540747,-3.078992,13.713314,16.550111,24.143944,-13.256716,2.231031,2.765292,...,1.692978,-0.296690,-0.283723,-0.797104,-0.163010,-0.608893,-1.616867,-0.498342,1.067861,-0.585809
917,17.881770,-15.291705,-1.319019,5.066728,-1.266535,-2.437619,2.409814,-1.060089,0.126137,-1.614192,...,1.005991,-0.278236,0.737634,0.436368,-0.250314,1.563612,0.195921,3.044061,-0.980565,-0.241512
918,51.226825,45.769151,6.619134,1.362370,-1.413785,-7.432729,-7.532700,5.848887,-2.681675,-2.729530,...,0.257285,0.172115,0.029524,0.095887,-0.190369,-0.077163,0.070953,-0.321681,-0.145686,0.161915


In [None]:
# Stored PCA projection of the test PR embeddings, with the 'ResponderStatus'
# label column removed so that only the PC feature columns remain.
test_PR_PCA = pd.read_csv(
    '/content/drive/MyDrive/ביואינפורמטיקה/שנה ג/מיני פרויקט בביואינפורמטיקה/פרויקט/ProtT5(6)/PCA_train_test/new_pca_PR_test.csv'
).drop(columns=['ResponderStatus'])
test_PR_PCA

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,PC100
0,44.949972,36.283096,4.323080,1.222841,-3.865250,-6.983619,-12.330371,5.883948,-1.257910,-2.131967,...,-0.029822,-1.583009,0.262311,-0.020635,-0.119702,0.552338,-0.074957,1.664440,-0.218548,-0.504711
1,44.863813,36.305688,4.577772,1.401188,-3.401129,-7.047742,-12.377896,5.849542,-1.455263,-2.413868,...,0.053458,-1.460817,0.303536,-0.397558,-0.090858,0.484594,0.335562,1.392583,0.242092,-0.538237
2,47.175134,38.496341,6.090333,1.234224,-1.775033,-10.950471,-14.366040,7.925392,-4.684522,-3.866963,...,0.101347,0.304441,0.092659,-0.584909,0.657126,-0.211925,1.130855,0.470095,0.620590,0.386287
3,46.966885,38.041870,5.506160,0.762846,-3.888286,-12.688907,-13.793094,6.206620,-2.290117,-5.620418,...,-0.204126,0.205094,1.166862,-0.120951,0.860254,-0.775732,1.005704,0.060564,0.700499,-0.802037
4,51.879458,43.228122,5.614592,2.843887,-5.430430,-14.018890,-19.578671,13.031836,-4.698966,-7.655952,...,0.432650,-0.019286,0.408321,-0.040560,-0.044534,0.735486,0.261580,0.512268,0.046843,-0.401221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,-5.442459,-0.164754,9.961652,8.806644,1.117222,5.080343,-0.029762,4.568124,-2.209392,-1.181702,...,0.050899,-0.254937,-0.306368,0.167982,0.315031,-0.441005,-0.053706,-0.270162,-0.192095,-0.486890
688,-7.074809,3.850077,-3.121434,5.802342,3.533248,-1.684975,0.247388,-0.942935,0.578980,-4.038236,...,1.015583,-0.773760,-0.252666,0.915936,2.131247,0.585415,-1.009386,1.134345,-0.866198,-0.023927
689,-9.027360,7.034213,-5.314144,3.654288,-4.259074,4.866479,-0.513533,-0.854975,-8.957362,0.101756,...,0.546676,-0.676071,-0.160294,0.232966,-0.988431,-1.200631,-0.348696,-0.352314,-0.667969,0.058982
690,-3.823557,1.445404,-1.208816,1.941569,4.328978,-4.393157,-3.536302,-8.931817,-3.424385,-0.400304,...,-1.830979,0.248899,1.039982,-1.965809,0.541022,-0.747695,1.296238,-0.079008,1.090038,-0.117337


In [None]:
# Clinical features of the training patients: the 'VL-t0' and 'CD4-t0' columns.
train_clinical = train_df.loc[:, ['VL-t0', 'CD4-t0']]
train_clinical

Unnamed: 0,VL-t0,CD4-t0
0,4.30,145
1,3.60,224
2,3.20,1017
3,5.70,206
4,3.50,572
...,...,...
995,3.15,354
996,5.50,50
997,4.10,369
998,3.37,127


In [None]:
# Clinical features of the test patients: the 'VL-t0' and 'CD4-t0' columns.
test_clinical = test_df.loc[:, ['VL-t0', 'CD4-t0']]
test_clinical

Unnamed: 0,VL-t0,CD4-t0
0,5.60,69
1,5.30,119
2,5.70,41
3,5.20,48
4,5.50,311
...,...,...
687,5.30,366
688,4.90,151
689,4.90,411
690,4.50,268


Merging the columns of the sequences and clinical data of the training set after embedding and PCA

In [None]:
# Build the training feature matrix: PR principal components, then RT principal
# components, then the clinical columns.
# pd.concat(axis=1) aligns the frames on their row index, so rows that are absent
# from a shorter frame (the PR table has fewer rows than the others) become NaN.
# NOTE(review): this assumes row i of every frame belongs to the same patient —
# confirm that the PR table keeps the original patient order.
x_train = pd.concat([train_PR_PCA, train_RT_PCA, train_clinical], axis=1, ignore_index=True)

# ignore_index=True renumbers every column 0..k-1, so the clinical names are
# restored on the trailing columns. Their positions are derived from the frame
# widths rather than hard-coded, so the rename still applies if the number of
# principal components changes.
first_clinical_pos = train_PR_PCA.shape[1] + train_RT_PCA.shape[1]
x_train = x_train.rename(
    columns={first_clinical_pos + offset: name for offset, name in enumerate(train_clinical.columns)}
)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,VL-t0,CD4-t0
0,-1.885705,-0.651117,11.843484,4.869803,-1.839872,-1.211086,-1.339022,-5.662424,-2.658699,-2.294969,...,0.537436,2.644502,-1.485645,0.925236,1.315919,-0.469013,2.126996,0.450310,4.30,145
1,8.164608,-11.079589,6.101052,1.325754,1.939282,-2.708731,1.215843,-3.561059,-7.453094,-3.030263,...,-0.922559,0.216361,-4.471073,-0.397673,-3.046035,-0.667115,-1.308287,2.784812,3.60,224
2,-8.200550,7.256495,-5.723636,3.398085,-4.638080,4.988092,0.015292,0.147888,-8.151237,0.377515,...,2.519446,1.932880,-0.289343,2.733469,-1.500422,-0.236010,-2.858274,0.253565,3.20,1017
3,-8.060616,5.178030,-3.000635,5.470617,5.086985,2.029443,-0.753842,2.374189,-3.562362,-0.106443,...,-0.436911,3.708288,0.139090,-0.923414,-0.370323,0.612296,-2.744184,-0.574743,5.70,206
4,-7.668880,4.791262,-2.451455,0.302582,6.092274,-0.743319,-0.482994,3.074220,-0.092215,-0.252762,...,1.062744,2.336709,-1.709266,0.220020,1.486058,-0.169757,0.022307,-2.237143,3.50,572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,,,,...,-0.450952,-1.699169,-3.449650,2.218817,1.962050,-0.542329,2.506513,-0.954538,3.15,354
996,,,,,,,,,,,...,-0.090572,2.125578,-1.533416,-1.133331,-2.608001,-1.362068,-1.268701,1.887736,5.50,50
997,,,,,,,,,,,...,0.654479,-0.664620,-2.053382,0.805898,-0.065541,-1.339175,0.821416,2.229429,4.10,369
998,,,,,,,,,,,...,-1.081321,-3.865851,4.487018,-1.123869,-0.909715,1.793193,-1.114843,-2.468145,3.37,127


Filling the missing values with the column's median

In [None]:
# Impute each missing feature value with the median of its own column,
# where the medians are taken over the training feature matrix.
column_medians = x_train.median()
x = x_train.fillna(value=column_medians)
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,VL-t0,CD4-t0
0,-1.885705,-0.651117,11.843484,4.869803,-1.839872,-1.211086,-1.339022,-5.662424,-2.658699,-2.294969,...,0.537436,2.644502,-1.485645,0.925236,1.315919,-0.469013,2.126996,0.450310,4.30,145
1,8.164608,-11.079589,6.101052,1.325754,1.939282,-2.708731,1.215843,-3.561059,-7.453094,-3.030263,...,-0.922559,0.216361,-4.471073,-0.397673,-3.046035,-0.667115,-1.308287,2.784812,3.60,224
2,-8.200550,7.256495,-5.723636,3.398085,-4.638080,4.988092,0.015292,0.147888,-8.151237,0.377515,...,2.519446,1.932880,-0.289343,2.733469,-1.500422,-0.236010,-2.858274,0.253565,3.20,1017
3,-8.060616,5.178030,-3.000635,5.470617,5.086985,2.029443,-0.753842,2.374189,-3.562362,-0.106443,...,-0.436911,3.708288,0.139090,-0.923414,-0.370323,0.612296,-2.744184,-0.574743,5.70,206
4,-7.668880,4.791262,-2.451455,0.302582,6.092274,-0.743319,-0.482994,3.074220,-0.092215,-0.252762,...,1.062744,2.336709,-1.709266,0.220020,1.486058,-0.169757,0.022307,-2.237143,3.50,572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-3.478690,0.513879,-2.172637,-1.102549,0.413889,0.096499,-0.106507,0.150152,0.163058,-0.211260,...,-0.450952,-1.699169,-3.449650,2.218817,1.962050,-0.542329,2.506513,-0.954538,3.15,354
996,-3.478690,0.513879,-2.172637,-1.102549,0.413889,0.096499,-0.106507,0.150152,0.163058,-0.211260,...,-0.090572,2.125578,-1.533416,-1.133331,-2.608001,-1.362068,-1.268701,1.887736,5.50,50
997,-3.478690,0.513879,-2.172637,-1.102549,0.413889,0.096499,-0.106507,0.150152,0.163058,-0.211260,...,0.654479,-0.664620,-2.053382,0.805898,-0.065541,-1.339175,0.821416,2.229429,4.10,369
998,-3.478690,0.513879,-2.172637,-1.102549,0.413889,0.096499,-0.106507,0.150152,0.163058,-0.211260,...,-1.081321,-3.865851,4.487018,-1.123869,-0.909715,1.793193,-1.114843,-2.468145,3.37,127


Merging the columns of the sequences and clinical data of the testing set after embedding and PCA

In [None]:
# Build the test feature matrix in the same column order as the training matrix:
# PR principal components, then RT principal components, then the clinical columns.
# pd.concat(axis=1) aligns the frames on their row index.
# NOTE(review): this assumes row i of every frame belongs to the same patient — confirm.
x_test = pd.concat([test_PR_PCA, test_RT_PCA, test_clinical], axis=1, ignore_index=True)

# ignore_index=True renumbers every column 0..k-1, so the clinical names are
# restored on the trailing columns. Their positions are derived from the frame
# widths rather than hard-coded, so the rename still applies if the number of
# principal components changes.
first_clinical_pos = test_PR_PCA.shape[1] + test_RT_PCA.shape[1]
x_test = x_test.rename(
    columns={first_clinical_pos + offset: name for offset, name in enumerate(test_clinical.columns)}
)
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,VL-t0,CD4-t0
0,44.949972,36.283096,4.323080,1.222841,-3.865250,-6.983619,-12.330371,5.883948,-1.257910,-2.131967,...,2.218423,-1.780101,2.337994,1.997186,0.592466,-0.654364,-1.323716,-1.401459,5.60,69
1,44.863813,36.305688,4.577772,1.401188,-3.401129,-7.047742,-12.377896,5.849542,-1.455263,-2.413868,...,-1.226408,3.347541,-0.359025,3.140640,1.313073,-0.995234,-3.674432,-0.599512,5.30,119
2,47.175134,38.496341,6.090333,1.234224,-1.775033,-10.950471,-14.366040,7.925392,-4.684522,-3.866963,...,-1.020997,-2.470617,0.706749,1.839725,-3.828726,2.296222,-0.960563,-1.848076,5.70,41
3,46.966885,38.041870,5.506160,0.762846,-3.888286,-12.688907,-13.793094,6.206620,-2.290117,-5.620418,...,1.220160,2.237722,3.218172,1.200747,0.901825,1.010021,2.128166,-0.895052,5.20,48
4,51.879458,43.228122,5.614592,2.843887,-5.430430,-14.018890,-19.578671,13.031836,-4.698966,-7.655952,...,2.126768,-1.092917,0.526183,-0.867072,0.214048,-0.557311,-2.816051,-0.153161,5.50,311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,-5.442459,-0.164754,9.961652,8.806644,1.117222,5.080343,-0.029762,4.568124,-2.209392,-1.181702,...,-0.765660,1.275599,-2.604298,-0.971486,-3.939393,-1.346038,0.690217,-0.659414,5.30,366
688,-7.074809,3.850077,-3.121434,5.802342,3.533248,-1.684975,0.247388,-0.942935,0.578980,-4.038236,...,3.069150,0.870366,-1.000629,0.299059,-2.630532,-0.901062,0.084201,0.359060,4.90,151
689,-9.027360,7.034213,-5.314144,3.654288,-4.259074,4.866479,-0.513533,-0.854975,-8.957362,0.101756,...,0.414361,0.720005,-0.154325,1.490132,-1.110214,-0.284734,-4.827731,-1.236673,4.90,411
690,-3.823557,1.445404,-1.208816,1.941569,4.328978,-4.393157,-3.536302,-8.931817,-3.424385,-0.400304,...,-3.587304,-0.544773,1.529914,2.205275,2.800087,-1.480479,-1.355578,-0.388843,4.50,268


In [None]:
# Training target vector: the 'Resp' column of the raw training table.
y=train_df['Resp']
y

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: Resp, Length: 1000, dtype: int64

In [None]:
# Test target vector: the 'ResponderStatus' labels loaded earlier as test_label.
y_test=test_label
y_test

0      1
1      0
2      1
3      0
4      1
      ..
687    1
688    1
689    1
690    1
691    1
Name: ResponderStatus, Length: 692, dtype: int64

XGBoost Model

In [None]:
# Class-imbalance weight: number of negative (0) labels divided by the number
# of positive (1) labels in the training target.
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
scale_pos_weight

3.854368932038835

In [None]:
# Baseline classifier with default hyper-parameters plus the class-imbalance weight.
# NOTE: `model` is reassigned inside the training loop in the next cell, and no fit
# call on this particular instance is visible in the notebook.
model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss')

In [None]:
# Repeated hold-out evaluation: train XGBoost on N_RUNS different train/validation
# splits, score each fitted model on the fixed test set, and compare its accuracy
# with a coin-flip baseline.
import random  # only needed by this cell; the notebook's import cell does not provide it

N_RUNS = 30      # number of random train/validation splits to average over
BASE_SEED = 42   # makes the splits and the random baseline reproducible


def generate_random_binary_list(n, rng=random):
    """Return a list of n labels drawn uniformly from {0, 1} (chance-level baseline).

    rng is any object with a ``choice`` method; it defaults to the ``random``
    module, so existing one-argument calls keep working.
    """
    return [rng.choice([0, 1]) for _ in range(n)]


# Class-imbalance weight (#negatives / #positives). It is computed from the full
# training target y, so it does not change between iterations and is hoisted out
# of the loop.
scale_pos_weight = np.sum(y == 0) / np.sum(y == 1)

baseline_rng = random.Random(BASE_SEED)
rnd, results = [], []

for run in range(N_RUNS):
    # Hold out 20% of the training data as a validation set for early stopping.
    # stratify keeps the class ratio of the imbalanced target in both parts;
    # a different but fixed seed per run keeps the experiment reproducible.
    X_train, X_val, y_train, y_val = train_test_split(
        x, y, test_size=0.2, stratify=y, random_state=BASE_SEED + run
    )

    # Initialize the XGBoost classifier with tuned parameters
    model = xgb.XGBClassifier(
        booster='gbtree',
        n_estimators=1000,
        learning_rate=0.001,
        max_depth=3,
        min_child_weight=5,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.01,
        reg_lambda=1,
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss',
        early_stopping_rounds=50
    )

    # Train the model; early stopping monitors logloss on the validation split
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Predict on the test set
    y_pred = model.predict(x_test)

    # Evaluate the model
    model_accuracy = accuracy_score(y_test, y_pred)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\naccuracy_score:")
    print(model_accuracy)
    results.append(model_accuracy)

    # Chance-level reference: accuracy of uniformly random 0/1 predictions
    n = len(y_pred)
    random_binary_list = generate_random_binary_list(n, baseline_rng)
    baseline_accuracy = accuracy_score(y_test, random_binary_list)
    print(baseline_accuracy)
    rnd.append(baseline_accuracy)

# Average accuracy of the model and of the random baseline over all runs
print("Average:")
print ("results", sum(results) / len(results))
print("rnd", sum(rnd) / len(rnd))


Confusion Matrix:
[[231 115]
 [121 225]]

accuracy_score:
0.6589595375722543
0.5014450867052023
Confusion Matrix:
[[263  83]
 [169 177]]

accuracy_score:
0.6358381502890174
0.48988439306358383
Confusion Matrix:
[[260  86]
 [164 182]]

accuracy_score:
0.638728323699422
0.5144508670520231
Confusion Matrix:
[[269  77]
 [176 170]]

accuracy_score:
0.634393063583815
0.5028901734104047
Confusion Matrix:
[[232 114]
 [130 216]]

accuracy_score:
0.6473988439306358
0.476878612716763
Confusion Matrix:
[[241 105]
 [147 199]]

accuracy_score:
0.6358381502890174
0.4956647398843931
Confusion Matrix:
[[245 101]
 [150 196]]

accuracy_score:
0.6372832369942196
0.4667630057803468
Confusion Matrix:
[[230 116]
 [115 231]]

accuracy_score:
0.6661849710982659
0.5404624277456648
Confusion Matrix:
[[248  98]
 [151 195]]

accuracy_score:
0.6401734104046243
0.4884393063583815
Confusion Matrix:
[[252  94]
 [161 185]]

accuracy_score:
0.6315028901734104
0.48988439306358383
Confusion Matrix:
[[235 111]
 [128 218]]


We can see that the model's accuracy on the test set is about 64%, compared with roughly 50% for the random-guess baseline.