In [90]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [91]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

import json

import warnings

In [92]:
PATH = '/content/drive/Shareddrives/Interview Preparation/Data Science/Parkinson disease/data/'

In [97]:
# Importing data

train_proteins = pd.read_csv(PATH+'train_proteins.csv')
train_peptides = pd.read_csv(PATH+'train_peptides.csv')
train_clinical = pd.read_csv(PATH+'train_clinical_data.csv')

In [96]:
test_proteins = pd.read_csv(PATH+'test_proteins.csv')
test_peptides = pd.read_csv(PATH+'test_peptides.csv')
test_clinical = pd.read_csv(PATH+'test.csv')

submission = pd.read_csv(PATH+'sample_submission.csv')

In [100]:
submission

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0
...,...,...,...
59,50423_6_updrs_3_plus_24_months,0,6
60,50423_6_updrs_4_plus_0_months,0,6
61,50423_6_updrs_4_plus_6_months,0,6
62,50423_6_updrs_4_plus_12_months,0,6


# Data Preprocessing

## Function

In [None]:
features = ["QNC(UniMod_4)ELFEQLGEYK", "SIDVAC(UniMod_4)HPGYALPK", "NVVYTC(UniMod_4)NEGYSLIGNPVAR", "LVGGPMDASVEEEGVRR", "ADLSGITGAR", "P43652", "SPVDIC(UniMod_4)TAKPR", "AESPEVC(UniMod_4)FNEESPK", "LAARLEALKENGGAR", "LPPTSAHGNVAEGETKPDPDVTER", "TEIDKPSQMQVTDVQDNSISVK", "SKVPPPRDFHINLFR", "HTSVQTTSSGSGPFTDVR", "MYYSAVDPTKDIFTGLIGPM(UniMod_35)K", "SNLDEDIIAEENIVSR", "LADGGATNQGRVEIFYR", "FFLC(UniMod_4)QVAGDAK", "TLEAQLTPR", "EQPPSLTR", "PALEDLR", "FQSVFTVTR", "HLSLLTTLSNR", "LC(UniMod_4)TVATLR", "NPDPWAK", "QWAGLVEK", "C(UniMod_4)RDQLPYIC(UniMod_4)QFGIV", "P49908", "FNKPFVFLM(UniMod_35)IEQNTK", "LSSWVLLMK", "ASYGVKPR", "GSQTQSHPDLGTEGC(UniMod_4)WDQLSAPR", "KYLYEIAR", "GVASLFAGR", "TGAQELLR", "P11277", "MFTTAPDQVDKEDEDFQESNK", "VTTVASHTSDSDVPSGVTEVVVK", "LGQSLDC(UniMod_4)NAEVYVVPWEK", "P01834", "TKKQELSEAEQATR", "LAPLAEDVR", "P02656", "AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K", "GDKVWVYPPEKK", "TPENYPNAGLTMNYC(UniMod_4)R", "KQINDYVEKGTQGK", "SVPMVPPGIK", "C(UniMod_4)IIEVLSNALSK", "VTIPTDLIASSGDIIK", "SDDKVTLEER", "SSGQWQTPGATR", "P00748", "TM(UniMod_35)LLQPAGSLGSYSYR", "C(UniMod_4)C(UniMod_4)ESASEDC(UniMod_4)MAKELPEHTVK", "P01042", "KLSSWVLLM(UniMod_35)K", "LGC(UniMod_4)SLNQNSVPDIHGVEAPAR", "SLGNVIMVC(UniMod_4)R", "KSQPMGLWR", "LKEDAVSAAFK", "IFFYDSENPPASEVLR", "P16070", "ALYLQYTDETFR", "TPEVTC(UniMod_4)VVVDVSHEDPEVK", "LLELTGPK", "RQEC(UniMod_4)SIPVC(UniMod_4)GQDQVTVAMTPR", "P10645", "KEC(UniMod_4)C(UniMod_4)EKPLLEK", "C(UniMod_4)VNHYGGYLC(UniMod_4)LPK", "HLDSVLQQLQTEVYR", "LVNEVTEFAK", "P01621", "MASGAANVVGPK", "Q99674", "KLGQSLDC(UniMod_4)NAEVYVVPWEK", "GSESGIFTNTK", "P13611", "AVDTWSWGER", "LYGSEAFATDFQDSAAAK", "Q7Z3B1", "EGTC(UniMod_4)PEAPTDEC(UniMod_4)KPVK", "NILTSNNIDVKK", "NFGYTLR", "AATVGSLAGQPLQER", "FSVVYAK", "NSKFEDC(UniMod_4)C(UniMod_4)QEK", "C(UniMod_4)LVEKGDVAFVKHQTVPQNTGGK", "THLGEALAPLSK", "LETPDFQLFK", "GRTC(UniMod_4)PKPDDLPFSTVVPLK", "NTEILTGSWSDQTYPEGTQAIYK", "QRQEELC(UniMod_4)LAR", "GSPAINVAVHVFR", "LEGQEEEEDNRDSSMK", "IEIPSSVQQVPTIIK", "VNHVTLSQPK", "RTPITVVK", "KMTVTDQVNC(UniMod_4)PK", "MATLYSR", "QHVVYGPWNLPQSSYSHLTR", "O00391", "EVQPVELPNC(UniMod_4)NLVK", "LVMGIPTFGR", "C(UniMod_4)LPVTAPENGK", "HPNVFGFC(UniMod_4)R", "P05067", "NQEQVSPLTLLK", "SC(UniMod_4)C(UniMod_4)EEQNKVNC(UniMod_4)LQTR", "TKEEYGHSEVVEYYC(UniMod_4)NPR", "P17936", "MTVTDQVNC(UniMod_4)PK", "P00738", "DGGFC(UniMod_4)EVC(UniMod_4)K", "GNSYFMVEVK", "ISLPESLK", "GC(UniMod_4)SFLPDPYQK", "KREEAPSLRPAPPPISGGGYR", "PIVTSPYQIHFTK", "P25311", "ELSSFIDKGQELC(UniMod_4)ADYSENTFTEYKK", "VDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSK", "LRTEGDGVYTLNNEKQWINK", "AQGFTEDTIVFLPQTDK", "IEKVEHSDLSFSK", "APEAQVSVQPNFQQDK", "NRDHDTFLAVR", "Q12841", "P41222", "HPDEAAFFDTASTGK", "Q13332", "DGLKYHYLIR", "Q9BY67", "QVVAGLNFR", "QTALVELVK", "SLEDLQLTHNKITK", "P00747", "NFPPSQDASGDLYTTSSQLTLPATQC(UniMod_4)LAGK", "P00441", "DLLLPQPDLR", "P01861", "C(UniMod_4)SYTEDAQC(UniMod_4)IDGTIEVPK", "EYC(UniMod_4)GVPGDGDEELLR", "MDYPKQTQVSVLPEGGETPLFK", "YGLDSDLSC(UniMod_4)K", "LSPEDYTLK", "LC(UniMod_4)MGSGLNLC(UniMod_4)EPNNKEGYYGYTGAFR", "VAAGAFQGLR", "Q9UKV8", "Q96BZ4", "TPSGLYLGTC(UniMod_4)ER", "HQPQEFPTYVEPTNDEIC(UniMod_4)EAFRK", "ELDLNSVLLK", "KSASDLTWDNLK", "P23083", "LGPGMADIC(UniMod_4)K", "NLHGDGIALWYTR", "O14498", "TMSEVGGSVEDLIAK", "SEYPSIK", "HQPQEFPTYVEPTNDEIC(UniMod_4)EAFRKDPK", "QYTDSTFRVPVER", "KDSGFQMNQLR", "FKDLGEENFK", "P61916", "LVDGKGVPIPNKVIFIR", "SLDFTELDVAAEK", "LSINTHPSQKPLSITVR", "FTFEYSR", "SDVVYTDWK", "P07195", "EDC(UniMod_4)NELPPRR", "ITGYIIK", "FSC(UniMod_4)MC(UniMod_4)PQGYQVVR", "P01009", "P16870", "TNTNVNC(UniMod_4)PIEC(UniMod_4)FMPLDVQADREDSRE", "C(UniMod_4)TSTGWIPAPR", "ELDESLQVAER", "LGVRPSQGGEAPR", "VLTPTQVK", "RPC(UniMod_4)FSALEVDETYVPK", "TLKIENVSYQDKGNYR", "WSRPQAPITGYR", "YLQEIYNSNNQK", "IMNGEADAMSLDGGFVYIAGK", "HYDGSYSTFGER", "TVQAVLTVPK", "TTPPVLDSDGSFFLYSK", "AVSEKEVDSGNDIYGNPIKR", "VEPLRAELQEGAR", "P32754", "ALSIGFETC(UniMod_4)R", "LVFFAEDVGSNK", "VHVSEEGTEPEAMLQVLGPKPALPAGTEDTAKEDAANR", "SSALDMENFR", "LC(UniMod_4)DNLSTK", "P02753", "KPALEDLR", "LFAC(UniMod_4)SNK", "NPDSSTTGPWC(UniMod_4)YTTDPTVR", "IALVITDGR", "QHM(UniMod_35)DSDSSPSSSSTYC(UniMod_4)NQMMR", "EDSLEAGLPLQVR", "C(UniMod_4)C(UniMod_4)AAADPHEC(UniMod_4)YAK", "C(UniMod_4)FSGQC(UniMod_4)ISK", "WLPSSSPVTGYR", "GNQWVGYDDQESVK", "LKC(UniMod_4)DEWSVNSVGK", "WQEEMELYR", "DGGFC(UniMod_4)EVC(UniMod_4)KK", "DEPPQSPWDR", "EFTRPEEIIFLR", "LAC(UniMod_4)C(UniMod_4)VVGVC(UniMod_4)GPGLWER", "VFNTPEGVPSAPSSLK", "Q9UNU6", "TVVQPSVGAAAGPVVPPC(UniMod_4)PGR", "QTVSWAVTPK", "AKWEMPFDPQDTHQSR", "P00736", "SGIPIVTSPYQIHFTK", "VMPIC(UniMod_4)LPSKDYAEVGR", "GLETSLAEC(UniMod_4)TFTK", "ASEGGFTATGQR", "LLDSLPSDTR", "P00734", "P02671", "ATEHLSTLSEK", "IVSSAMEPDREYHFGQAVR", "AYLEEEC(UniMod_4)PATLRK", "TVAAPSVFIFPPSDEQLK", "FTDSENVC(UniMod_4)QER", "VPQVSTPTLVEVSR", "RGYQLSDVDGVTC(UniMod_4)EDIDEC(UniMod_4)ALPTGGHIC(UniMod_4)SYR", "P14618", "GMFNIQHC(UniMod_4)K", "P01594", "VTLTC(UniMod_4)VAPLSGVDFQLR", "STNLHDYGMLLPC(UniMod_4)GIDK", "P01031", "VGGVQSLGGTGALR", "AGLLRPDYALLGHR", "GINVALANGK", "LGADM(UniMod_35)EDVC(UniMod_4)GR", "GSFAC(UniMod_4)QC(UniMod_4)PPGYQK", "LSPLGEEMR", "P02747", "C(UniMod_4)KPVNTFVHEPLVDVQNVC(UniMod_4)FQEK", "WYFDVTEGK", "Q7Z5P9", "AYLEEEC(UniMod_4)PATLR", "SFNRGEC(UniMod_4)", "TVAAC(UniMod_4)NLPIVR", "TEHLSTLSEK", "LTPLYELVK", "YWGVASFLQK", "P06727", "AIQLTYNPDESSKPNMIDAATLK", "ILEVVNQIQDEER", "P35542", "TPSAAYLWVGTGASEAEK", "AYKSELEEQLTPVAEETR", "FTNIGPDTMR", "FTEC(UniMod_4)C(UniMod_4)QAADK", "THLAPYSDELRQR", "C(UniMod_4)LAPLEGAR", "ADSGEGDFLAEGGGVR", "DKLAAC(UniMod_4)LEGNC(UniMod_4)AEGLGTNYR", "P13591", "HGGLYHENMR", "DQPFTILYR", "P05155", "P24592", "TPLPPTSAHGNVAEGETKPDPDVTER", "ELSSFIDKGQELC(UniMod_4)ADYSENTFTEYK", "P05060", "VKDLATVYVDVLK", "P36222", "NTGIIC(UniMod_4)TIGPASR", "ALFLETEQLK", "SIAQYWLGC(UniMod_4)PAPGHL", "VLNQELR", "P04211", "IC(UniMod_4)VGC(UniMod_4)PR", "P08571", "AVC(UniMod_4)SQEAMTGPC(UniMod_4)R", "P07711", "YSLTYIYTGLSK", "LVDKFLEDVKK", "P09486", "C(UniMod_4)LAFEC(UniMod_4)PENYRR", "TFHEASEDC(UniMod_4)ISR", "LHLDYIGPC(UniMod_4)K", "TAKDALSSVQESQVAQQAR", "SGSDEVQVGQQR", "P02792", "HGPGLIYR", "NSPLDEENLTQENQDR", "KC(UniMod_4)C(UniMod_4)VEC(UniMod_4)PPC(UniMod_4)PAPPVAGPSVFLFPPKPK", "NIINSDGGPYVC(UniMod_4)R", "SETKDLLFRDDTVC(UniMod_4)LAK", "O15240", "O94919", "KTSLEDFYLDEER", "VTSIQDWVQK", "LC(UniMod_4)MAALK", "SGEATDGARPQALPEPMQESK", "P04196", "P08133", "NSLFEYQK", "P01033", "IC(UniMod_4)LEDNVLM(UniMod_35)SGVK", "LDIDSPPITAR", "P43121", "Q15904", "EGVQKEDIPPADLSDQVPDTESETR", "ADDLGKGGNEESTKTGNAGSR", "P04156", "AADDTWEPFASGK", "GQSISVTSIRPC(UniMod_4)AAETQ", "P20933", "SASDLTWDNLKGK", "O00533", "YQC(UniMod_4)YC(UniMod_4)YGR", "P08253", "FSC(UniMod_4)FQEEAPQPHYQLR", "TQVNTQAEQLR", "C(UniMod_4)LVNLIEK", "Q9UBR2", "YLFLNGNK", "EILSVDC(UniMod_4)STNNPSQAK", "LVAYYTLIGASGQR", "EGQEC(UniMod_4)GVYTPNC(UniMod_4)APGLQC(UniMod_4)HPPKDDEAPLR", "AKAYLEEEC(UniMod_4)PATLRK", "IPIEDGSGEVVLSRK", "C(UniMod_4)REILSVDC(UniMod_4)STNNPSQAK", "AAFTEC(UniMod_4)C(UniMod_4)QAADK", "ESAYLYAR", "P01019", "ETLLQDFR", "GAAPPKQEFLDIEDP", "GEVQAMLGQSTEELRVR", "M(UniMod_35)ADEAGSEADHEGTHSTKR", "DVRDYFMPC(UniMod_4)PGR", "GAQTQTEEEMTR", "SGINC(UniMod_4)PIQK", "P27169", "VTIKPAPETEKRPQDAK", "P02647", "PTLVEVSR", "GSPAINVAVHVFRK", "DSGEGDFLAEGGGVR", "TFYEPGEEITYSC(UniMod_4)KPGYVSR", "DYFMPC(UniMod_4)PGR", "SLGEC(UniMod_4)C(UniMod_4)DVEDSTTC(UniMod_4)FNAK", "QQTEWQSGQR", "O14773", "FTQVTPTSLSAQWTPPNVQLTGYR", "LAVYQAGAR", "VGFYESDVMGR", "Q14118", "RVWELSK", "DSGFQM(UniMod_35)NQLR", "KDSGFQM(UniMod_35)NQLR", "SSFVAPLEK", "Q16270", "GMDSC(UniMod_4)KGDSGGAFAVQDPNDKTK", "P10643", "DC(UniMod_4)SLPYATESK", "NLQPASEYTVSLVAIKGNQESPK", "AGC(UniMod_4)VAESTAVC(UniMod_4)R", "VYC(UniMod_4)DMNTENGGWTVIQNR", "YFIDFVAR", "C(UniMod_4)FLAFTQTK", "ALMSPAGMLR", "GDFSSANNRDNTYNR", "IGADFLAR", "SNSSMHITDC(UniMod_4)R", "VHTEC(UniMod_4)C(UniMod_4)HGDLLEC(UniMod_4)ADDR", "HTFMGVVSLGSPSGEVSHPR", "ATEDEGSEQKIPEATNRR", "QKWEAEPVYVQR", "SPAINVAVHVFR", "VLEPTLK", "P14174", "P61278", "NTFAEVTGLSPGVTYYFK", "QSGLYFIKPLK", "TMQALPYSTVGNSNNYLHLSVLR", "HTFM(UniMod_35)GVVSLGSPSGEVSHPR", "LC(UniMod_4)MGSGLNLC(UniMod_4)EPNNK", "SRYPVC(UniMod_4)GSDGTTYPSGC(UniMod_4)QLR", "VKDISEVVTPR", "IASFSQNC(UniMod_4)DIYPGKDFVQPPTK", "EPTMYGEILSPNYPQAYPSEVEK", "SPVGVQPILNEHTFC(UniMod_4)AGMSK", "FLATTPNSLLVSWQPPR", "ASYLDC(UniMod_4)IR", "NFPSPVDAAFR", "LSPIYNLVPVK", "P01859", "APLIPMEHC(UniMod_4)TTR", "ALQDQLVLVAAK", "C(UniMod_4)APFFYGGC(UniMod_4)GGNR", "NILDRQDPPSVVVTSHQAPGEK", "AIPVTQYLK", "KAADDTWEPFASGK", "SSPVVIDASTAIDAPSNLR", "TPLTATLSK", "ILGPLSYSK", "DFADIPNLR", "AQTTVTC(UniMod_4)M(UniMod_35)ENGWSPTPR", "VQPYLDDFQKK", "FM(UniMod_35)ETVAEK", "AAVYHHFISDGVR", "SVIPSDGPSVAC(UniMod_4)VK", "QHMDSDSSPSSSSTYC(UniMod_4)NQMMR", "P04180", "SC(UniMod_4)VGETTESTQC(UniMod_4)EDEELEHLR", "P02649", "Q08380", "GYTQQLAFR", "ILAGSADSEGVAAPR", "TLVVHEKADDLGKGGNEESTK", "TFISPIK", "HSIFTPETNPR", "LDELRDEGK", "Q13449", "P02766", "P98160", "P01860", "MKYWGVASFLQK", "FSGTWYAMAK", "MGNFPWQVFTNIHGR", "EC(UniMod_4)EEIIR", "QELSEAEQATR", "SGIEC(UniMod_4)QLWR", "P07998", "YLGEEYVK", "NWGLGGHAFC(UniMod_4)R", "TTPEPC(UniMod_4)ELDDEDFR", "P51884", "P06396", "GC(UniMod_4)PTEEGC(UniMod_4)GER", "Q06481", "EVGPTNADPVC(UniMod_4)LAK", "THLPEVFLSK", "P07225", "AVLPTGDVIGDSAK", "TLLSNLEEAK", "O75326", "O60888", "P02765", "M(UniMod_35)LTPEHVFIHPGWK", "NLAVSQVVHK", "VHVSEEGTEPEAMLQVLGPKPALPAGTEDTAKEDAANRK", "P11142", "TFTC(UniMod_4)TAAYPESK", "RLGMFNIQHC(UniMod_4)K", "SPFEQHIK", "LRENELTYYC(UniMod_4)C(UniMod_4)KK", "SC(UniMod_4)ESNSPFPVHPGTAEC(UniMod_4)C(UniMod_4)TK", "KLYDYC(UniMod_4)DVPQC(UniMod_4)AAPSFDC(UniMod_4)GKPQVEPK", "EVNVSPC(UniMod_4)PTQPC(UniMod_4)QLSK", "LLRDPADASEAHESSSR", "INENTGSVSVTR", "ALGISPFHEHAEVVFTANDSGPRR", "MC(UniMod_4)PQLQQYEMHGPEGLR", "P36980", "GAYPLSIEPIGVR", "YGQTIRPIC(UniMod_4)LPC(UniMod_4)TEGTTR", "P01876", "LLPAQLPAEKEVGPPLPQEAVPLQK", "RLPIGSYF", "VLDLSC(UniMod_4)NR", "P04075", "SFQTGLFTAAR", "ATEDEGSEQKIPEATNR", "P06454", "YYC(UniMod_4)FQGNQFLR", "GEVQAMLGQSTEELR", "P09871", "Q96S96", "AIGAVPLIQGEYMIPC(UniMod_4)EK", "ETYGEMADC(UniMod_4)C(UniMod_4)AK", "LLEVPEGR", "FSPATHPSEGLEENYC(UniMod_4)RNPDNDPQGPWC(UniMod_4)YTTDPEKR", "VPEARPNSMVVEHPEFLK", "SGSAHEYSSSPDDAIFQSLAR", "KQTALVELVK", "TEGDGVYTLNNEK", "P05090", "Q8IWV7", "EQLSLLDRFTEDAKR", "MADEAGSEADHEGTHSTKR", "TSLEDFYLDEER", "SKEFQLFSSPHGK", "RLEAGDHPVELLAR", "P02790", "P02652", "GLYDVVSVLR", "ARAEAQEAEDQQAR", "LGADMEDVC(UniMod_4)GR", "P49588", "GSPSGEVSHPR", "Q9Y646", "GKRPYQEGTPC(UniMod_4)SQC(UniMod_4)PSGYHC(UniMod_4)K", "VDGALC(UniMod_4)MEK", "VLSIAQAHSPAFSC(UniMod_4)EQVR", "YYTYLIMNK", "GYC(UniMod_4)APGMEC(UniMod_4)VK", "AGAAAGGPGVSGVC(UniMod_4)VC(UniMod_4)K", "VFSNGADLSGVTEEAPLK", "AVISPGFDVFAK", "WKNFPSPVDAAFR", "KC(UniMod_4)SYTEDAQC(UniMod_4)IDGTIEVPK", "DTDTGALLFIGK", "AKLEEQAQQIR", "ASLVPMEHC(UniMod_4)ITR", "DIPMNPMC(UniMod_4)IYR", "LIVHNGYC(UniMod_4)DGR", "LQSLFDSPDFSK", "P60174", "DPTFIPAPIQAK", "IC(UniMod_4)ANVFC(UniMod_4)GAGR", "DQTVSDNELQEMSNQGSK", "SASDLTWDNLK", "RPGGEPSPEGTTGQSYNQYSQR", "M(UniMod_35)C(UniMod_4)PQLQQYEMHGPEGLR", "Q02818", "P19021", "NLNEKDYELLC(UniMod_4)LDGTR", "FIYGGC(UniMod_4)GGNR", "VNGSPVDNHPFAGDVVFPR", "O75144", "GVALADFNR", "P23142", "GYPGVQAPEDLEWER", "TPC(UniMod_4)QQELDQVLER", "C(UniMod_4)C(UniMod_4)TESLVNR", "GATLALTQVTPQDER", "RGEQC(UniMod_4)VDIDEC(UniMod_4)TIPPYC(UniMod_4)HQR", "NPDGDVGGPWC(UniMod_4)YTTNPR", "Q562R1", "SLPSEASEQYLTK", "AGDFLEANYMNLQR", "SELEEQLTPVAEETR", "GNPEPTFSWTK", "P02679", "GLPAPIEK", "SLAPYAQDTQEK", "P01034", "AELQC(UniMod_4)PQPAA", "DVC(UniMod_4)KNYAEAK", "KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK", "NILTSNNIDVK", "AKPALEDLR", "Q13283", "HSTIFENLANK", "SSNLIILEEHLK", "YANC(UniMod_4)HLAR", "P10909", "Q92520", "TMQENSTPRED", "VPFDAATLHTSTAMAAQHGMDDDGTGQK", "ILLQGTPVAQMTEDAVDAER", "ALANSLAC(UniMod_4)QGK", "TM(UniMod_35)QENSTPRED", "P04275", "QLNEINYEDHK", "MELERPGGNEITR", "DVLLEAC(UniMod_4)C(UniMod_4)ADGHR", "ALVQQMEQLR", "VEKPTADAEAYVFTPNMIC(UniMod_4)AGGEK", "TSESGELHGLTTEEEFVEGIYK", "P06681", "IVQIYKDLLR", "KVEQAVETEPEPELR", "M(UniMod_35)YLGYEYVTAIR", "P08493", "HPNSPLDEENLTQENQDR", "AGLAASLAGPHSIVGR", "DGWSAQPTC(UniMod_4)IK", "RNTEILTGSWSDQTYPEGTQAIYK", "NIGEFC(UniMod_4)GK", "LQDLYSIVR", "TPC(UniMod_4)TVSC(UniMod_4)NIPVVSGKEC(UniMod_4)EEIIR", "YVNKEIQNAVNGVK", "SVIPSDGPSVAC(UniMod_4)VKK", "VC(UniMod_4)PFAGILENGAVR", "P05546", "P01833", "P01344", "ASHEEVEGLVEK", "SWFEPLVEDM(UniMod_35)QR", "LQAEAFQAR", "DLLFKDSAHGFLK", "DWHGVPGQVDAAMAGR", "P01877", "P13987", "LINDYVK", "KQELSEAEQATR", "SLGSPSGEVSHPR", "P80748", "P01024", "AQC(UniMod_4)GGGLLGVR", "LLC(UniMod_4)QC(UniMod_4)LGFGSGHFR", "LMVELHNLYR", "QDGSVDFGR", "ITTTSPWMFPSR", "MYLGYEYVTAIR", "IEC(UniMod_4)VSAETTEDC(UniMod_4)IAK", "GQEDASPDRFSNTDYAVGYMLR", "P12109", "P04217", "ADDKETC(UniMod_4)FAEEGK", "SYTITGLQPGTDYK", "YIETDPANR", "P00746", "SSNTYTLTDVRR", "SVLGQLGITK", "VFSNGADLSGVTEEAPLKLSK", "RLEGQEEEEDNRDSSMK", "PNHAVVTR", "AIGYLNTGYQR", "TLYSSSPR", "P39060", "KTLLSNLEEAK", "EIQNAVNGVK", "M(UniMod_35)ASGAANVVGPK", "KASYLDC(UniMod_4)IR", "TGDEITYQC(UniMod_4)R", "HGTC(UniMod_4)AAQVDALNSQKK", "P10451", "ILGQQVPYATK", "GGETSEMYLIQPDSSVKPYR", "SASLHLPK", "TYETTLEK", "QPSSAFAAFVK", "YHDRDVWKPEPC(UniMod_4)R", "P40925", "EQLTPLIK", "TFTLLDPK", "FMETVAEK", "SDVVYTDWKK", "WSGQTAIC(UniMod_4)DNGAGYC(UniMod_4)SNPGIPIGTR", "LQSIGTENTEENRR", "YLYEIAR", "MPYEC(UniMod_4)GPSLDVC(UniMod_4)AQDER", "Q14515", "IDQTVEELRR", "KVESELIKPINPR", "MVQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR", "NFPPSQDASGDLYTTSSQLTLPATQC(UniMod_4)PDGK", "AQTTVTC(UniMod_4)MENGWSPTPR", "WSSTSPHRPR", "ISYGNDALMPSLTETK", "IYISGMAPRPSLAK", "PPTSAHGNVAEGETKPDPDVTER", "FQPTLLTLPR", "DLATVYVDVLK", "SLAELGGHLDQQVEEFRR", "TVFGTEPDMIR", "YVMLPVADQDQC(UniMod_4)IR", "VFQEPLFYEAPR", "KYFIDFVAR", "GYHLNEEGTR", "IPTTFENGR", "QQTHMLDVMQDHFSR", "O14791", "P07339", "LAVTTHGLPC(UniMod_4)LAWASAQAK", "NSVIKVPMMNSK", "YYWGGQYTWDMAK", "ATFGC(UniMod_4)HDGYSLDGPEEIEC(UniMod_4)TK", "QSTNAYPDLR", "KATEDEGSEQKIPEATNR", "KLC(UniMod_4)MAALK", "VYAC(UniMod_4)EVTHQGLSSPVTK", "TPLGDTTHTC(UniMod_4)PR", "QINDYVEK", "SDVMYTDWK", "SFPTYC(UniMod_4)QQK", "SDVMYTDWKK", "LLDNWDSVTSTFSK", "Q9Y6R7", "AGALNSNDAFVLK", "AATGEC(UniMod_4)TATVGKR", "ISASAEELR", "C(UniMod_4)AEENC(UniMod_4)FIQK", "C(UniMod_4)MC(UniMod_4)PAENPGC(UniMod_4)R", "QINDYVEKGTQGK", "C(UniMod_4)PFPSRPDNGFVNYPAKPTLYYK", "P02760", "ADRDQYELLC(UniMod_4)LDNTR", "P19827", "Q12907", "ATLGPAVRPLPWQR", "GPSVFPLAPSSK", "IYISGMAPR", "TC(UniMod_4)VADESAENC(UniMod_4)DKSLHTLFGDK", "VMDKYTFELSR", "KFPSGTFEQVSQLVK", "Q99969", "SSGLVSNAPGVQIR", "P04004", "GEAAGAVQELAR", "GLGEISAASEFK", "P07858", "Q9UHG2", "ESLQQMAEVTR", "EWVAIESDSVQPVPR", "WEAEPVYVQR", "VDNALQSGNSQESVTEQDSK", "VSPTDC(UniMod_4)SAVEPEAEK", "QIGSVYR", "P31997", "RIPIEDGSGEVVLSRK", "P09104", "KLINDYVK", "VPVAVQGEDTVQSLTQGDGVAK", "MMAVAADTLQR", "VIAVNEVGR", "LGMFNIQHC(UniMod_4)K", "P07602", "Q6UXD5", "P02750", "GYQLSDVDGVTC(UniMod_4)EDIDEC(UniMod_4)ALPTGGHIC(UniMod_4)SYR", "VTGVVLFR", "Q8NBJ4", "WYEIEKIPTTFENGR", "KLSSWVLLMK", "IKPVFIEDANFGR", "P02774", "VKSPELQAEAK", "C(UniMod_4)C(UniMod_4)ESASEDC(UniMod_4)MAK", "KTLLSNLEEAKK", "ATWSGAVLAGR", "LSSWVLLM(UniMod_35)K", "C(UniMod_4)LKDGAGDVAFVK", "FNALQYLR", "IDQNVEELKGR", "Q12805", "LVGGPM(UniMod_35)DASVEEEGVRR", "LYEQLSGK", "YGLVTYATYPK", "ADQVC(UniMod_4)INLR", "IVSSAM(UniMod_35)EPDREYHFGQAVR", "QQETAAAETETR", "VQAAVGTSAAPVPSDNH", "M(UniMod_35)VQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR", "P04207", "P61626", "FVVTDGGITR", "AYQGVAAPFPK", "MC(UniMod_4)VDVNEC(UniMod_4)QR", "NKFDPSLTQR", "NSWGEEWGMGGYVK", "KVLLDGVQNPR", "NNEGTYYSPNYNPQSR", "VRGGEGTGYFVDFSVR", "C(UniMod_4)PNPPVQENFDVNKYLGR", "IQPSGGTNINEALLR", "FNKNNEGTYYSPNYNPQSR", "LSYEGEVTK", "LYHSEAFTVNFGDTEEAKK", "Q92823", "AQAGANTRPC(UniMod_4)PS", "QKPDGVFQEDAPVIHQEMIGGLR", "TC(UniMod_4)VADESAENC(UniMod_4)DK", "DASGVTFTWTPSSGK", "P01591", "O15394", "GEC(UniMod_4)WC(UniMod_4)VNPNTGK", "QGIPFFGQVR", "LLAGDHPIDLLLR", "TYLGNALVC(UniMod_4)TC(UniMod_4)YGGSR", "Q9NYU2", "P00450", "LKC(UniMod_4)DEWSVNSVGKIEC(UniMod_4)VSAETTEDC(UniMod_4)IAK", "YKAAFTEC(UniMod_4)C(UniMod_4)QAADK", "KVPQVSTPTLVEVSR", "P54289", "FDEFFSEGC(UniMod_4)APGSK", "GWVTDGFSSLK", "INHC(UniMod_4)RFDEFFSEGC(UniMod_4)APGSKK", "VVEESELAR", "GATTSPGVYELSSR", "Q96PD5", "TLLSNLEEAKK", "YTTEIIK", "TNFDNDIALVR", "LSITGTYDLK", "VTAAPQSVC(UniMod_4)ALR", "P04406", "EIGELYLPK", "SSQGGSLPSEEK", "KSPELQAEAK", "WGYC(UniMod_4)LEPK", "M(UniMod_35)ELERPGGNEITR", "DDNPNLPR", "LLIYDASNR", "AVVVHAGEDDLGR", "O00584", "SC(UniMod_4)DIPVFMNAR", "YPSLSIHGIEGAFDEPGTK", "ATVVYQGER", "DC(UniMod_4)HLAQVPSHTVVAR", "TELLPGDRDNLAIQTR", "LSNENHGIAQR", "P16152", "Q92876", "Q6UX71", "LHDRNTYEKYLGEEYVK", "AQPVQVAEGSEPDGFWEALGGK", "LGPLVEQGRVR", "FLENEDRR", "P02751", "P43251", "ALGISPFHEHAEVVFTANDSGPR", "P13521", "DNWVFC(UniMod_4)GGK", "TQVNTQAEQLRR", "Q99683", "FDEFFSEGC(UniMod_4)APGSKK", "AGKEPGLQIWR", "ALTDMPQMR", "IC(UniMod_4)LEDNVLMSGVK", "VQPYLDDFQK", "QKVEPLRAELQEGAR", "GGSTSYGTGSETESPR", "VHTEC(UniMod_4)C(UniMod_4)HGDLLEC(UniMod_4)ADDRADLAK", "TGYYFDGISR", "P13671", "P01608", "DLGEENFK", "MNFRPGVLSSR", "STSSFPC(UniMod_4)PAGHFNGFR", "SILENLR", "C(UniMod_4)VC(UniMod_4)PVSNAMC(UniMod_4)R", "P08697", "LNMHMNVQNGKWDSDPSGTK", "LIADLGSTSITNLGFR", "YAMVYGYNAAYNR", "VSEADSSNADWVTK", "QGVNDNEEGFFSAR", "SEGSSVNLSPPLEQC(UniMod_4)VPDRGQQYQGR", "AEAQEAEDQQAR", "YVM(UniMod_35)LPVADQDQC(UniMod_4)IR", "FLVNLVK", "STTPDITGYR", "VASYGVKPR", "HQTVPQNTGGKNPDPWAK", "KIYPTVNC(UniMod_4)QPLGMISLMK", "NLREGTC(UniMod_4)PEAPTDEC(UniMod_4)KPVK", "P16035", "FYNQVSTPLLR", "ALSSEWKPEIR", "SMEQNGPGLEYR", "P19652", "LFGGNFAHQASVAR", "LGPLVEQGR", "DAC(UniMod_4)GC(UniMod_4)C(UniMod_4)PMC(UniMod_4)AR", "QEPERNEC(UniMod_4)FLQHKDDNPNLPR", "RTHLPEVFLSK", "LRENELTYYC(UniMod_4)C(UniMod_4)K", "LYGSEAFATDFQDSAAAKK", "IYLYTLNDNAR", "QLKEHAVEGDC(UniMod_4)DFQLLKLDGK", "P02452", "Q8N2S1", "C(UniMod_4)QC(UniMod_4)DELC(UniMod_4)SYYQSC(UniMod_4)C(UniMod_4)TDYTAEC(UniMod_4)KPQVTR", "P02655", "TATSEYQTFFNPR", "RELDESLQVAER", "LEDMEQALSPSVFK", "FLENEDR", "GRPGPQPWC(UniMod_4)ATTPNFDQDQR", "KC(UniMod_4)STSSLLEAC(UniMod_4)TFR", "FDGILGMAYPR", "P18065", "P02768", "P08603", "Q14624", "DDKETC(UniMod_4)FAEEGKK", "P02749", "IFSFDGKDVLR", "FEHC(UniMod_4)NFNDVTTR", "LRTEGDGVYTLNNEK", "IWDVVEK", "C(UniMod_4)LAFEC(UniMod_4)PENYR", "C(UniMod_4)STSSLLEAC(UniMod_4)TFR", "HQPQEFPTYVEPTNDEIC(UniMod_4)EAFR", "THPHFVIPYR", "VTEIWQEVMQR", "P13473", "Q96KN2", "SGLSTGWTQLSK", "TDASDVKPC(UniMod_4)", "VDSGNDVTDIADDGC(UniMod_4)PKPPEIAHGYVEHSVR", "GAQTLYVPNC(UniMod_4)DHR", "ITYGETGGNSPVQEFTVPGSK", "YIFHNFMER", "ASQSVSSYLAWYQQKPGQAPR", "QFTSSTSYNR", "ADDKETC(UniMod_4)FAEEGKK", "KPVEEYANC(UniMod_4)HLAR", "MPC(UniMod_4)AELVREPGC(UniMod_4)GC(UniMod_4)C(UniMod_4)SVC(UniMod_4)AR", "GEAGAPGEEDIQGPTK", "SNAQGIDLNR", "P01717", "SC(UniMod_4)DTPPPC(UniMod_4)PR", "WC(UniMod_4)AVSEHEATK", "APHGPGLIYR", "VEPYGENFNK", "LEGEAC(UniMod_4)GVYTPR", "THLAPYSDELR", "C(UniMod_4)PNPPVQENFDVNK", "DKETC(UniMod_4)FAEEGKK", "SC(UniMod_4)DNPYIPNGDYSPLR", "DTVIKPLLVEPEGLEK", "KVTYTSQEDLVEKK", "ANRPFLVFIR", "P01008", "P01011", "DSGFQMNQLR", "ATVNPSAPR", "Q14508", "SEALAVDGAGKPGAEEAQDPEGK", "P01023", "SGELEQEEER", "SYLSMVGSC(UniMod_4)C(UniMod_4)TSASPTVC(UniMod_4)FLKER", "P02675", "KQINDYVEK", "YPNC(UniMod_4)AYR", "P02748", "QYNVGPSVSKYPLR", "DGAGDVAFVK", "TFYSC(UniMod_4)TTEGR", "P04433", "P14314", "LGNQEPGGQTALK", "LSKELQAAQAR", "VLLDGVQNPR", "Q9HDC9", "DSGRDYVSQFEGSALGK", "LVWEEAMSR", "P00751", "VMTPAVYAPYDVK", "P08294", "DNC(UniMod_4)C(UniMod_4)ILDER", "NGFYPATR", "TASDFITK", "LQPLDFKENAEQSR", "P30086", "KAEEEHLGILGPQLHADVGDKVK", "P05452", "DQTVSDNELQEM(UniMod_35)SNQGSK", "LC(UniMod_4)QDLGPGAFR", "ELLESYIDGR", "HYEGSTVPEK", "P02787", "LVGGPMDASVEEEGVR", "SVPPSASHVAPTETFTYEWTVPK", "VSTLPAITLK", "P07333", "RLYGSEAFATDFQDSAAAK", "VGYVSGWGR", "C(UniMod_4)TTPPPSSGPTYQC(UniMod_4)LK", "C(UniMod_4)DSSPDSAEDVRK", "DTSC(UniMod_4)VNPPTVQNAYIVSR", "AGLQVYNK", "P55290", "TSAHGNVAEGETKPDPDVTER", "VAVVQYSGTGQQRPER", "RVDTVDPPYPR", "EVPLNTIIFMGR", "EFQLFSSPHGK", "VFAVSHGR", "P05156", "GLSAEPGWQAK", "LSYTC(UniMod_4)EGGFR", "QTQVSVLPEGGETPLFK", "HQFLLTGDTQGR", "ALEQDLPVNIK", "IGDQWDKQHDMGHMMR", "LEPYADQLR", "Q9UBX5", "EAEEETTNDNGVLVLEPARK", "IAPQLSTEELVSLGEK", "P01857", "WELALGR", "KLVPFATELHER", "YGFIEGHVVIPR", "P04216", "LATVGELQAAWR", "LGEVNTYAGDLQK", "ATTVTGTPC(UniMod_4)QDWAAQEPHR", "ISLPESLKR", "NPC(UniMod_4)QDPYILTPENR", "FLC(UniMod_4)TGGVSPYADPNTC(UniMod_4)R", "FMQAVTGWK", "SPELQAEAK", "O43505", "P02763", "Q13740", "C(UniMod_4)LVEKGDVAFVK", "ISC(UniMod_4)TIANR", "VPMMSDPK", "EHVAHLLFLR", "M(UniMod_35)TVTDQVNC(UniMod_4)PK", "SC(UniMod_4)DKTHTC(UniMod_4)PPC(UniMod_4)PAPELLGGPSVFLFPPKPK", "P17174", "P08123", "HRLEDMEQALSPSVFK", "LEEQAQQIR", "SLHTLFGDK", "SPMYSIITPNILR", "STGGISVPGPMGPSGPR", "VRQGQGQSEPGEYEQR", "P20774", "LTASAPGYLAITK", "YPGPQAEGDSEGLSQGLVDREK", "Q9NQ79", "QTHQPPAPNSLIR", "LAAAVSNFGYDLYR", "LFDSDPITVTVPVEVSR", "RPDSLQHVLLPVLDR", "YVGGQEHFAHLLILR", "LVGYLDR", "C(UniMod_4)NLLAEK", "EVVSLTEAC(UniMod_4)C(UniMod_4)AEGADPDC(UniMod_4)YDTR", "ASGSPEPAISWFR", "YIVSGTPTFVPYLIK", "P61769", "Q8NE71", "LDEVKEQVAEVR", "NANTFISPQQR", "IGDTWSK", "VC(UniMod_4)SQYAAYGEK", "P01780", "Q99435", "DQGNQEQDPNISNGEEEEEKEPGEVGTHNDNQER", "HGSPVDIC(UniMod_4)TAKPR", "KLSENTDFLAPGVSSFTDSNQQESITK", "LVPPMEEDYPQFGSPK", "P19823", "SNSSM(UniMod_35)HITDC(UniMod_4)R", "EGYYGYTGAFR", "Q13451", "IPIEDGSGEVVLSR", "P36955", "DLLFRDDTVC(UniMod_4)LAK", "GLVSWGNIPC(UniMod_4)GSK", "P08637", "MDASLGNLFAR", "AVGDKLPEC(UniMod_4)EADDGC(UniMod_4)PKPPEIAHGYVEHSVR", "LEAGDHPVELLAR", "ANAGKPKDPTFIPAPIQAK", "HKVYAC(UniMod_4)EVTHQGLSSPVTK", "TMLLQPAGSLGSYSYR", "Q16610", "LSELIQPLPLER", "FQNALLVR", "KLVGYLDR", "AEFAEVSK", "EIVLTQSPATLSLSPGER", "VEHSDLSFSK", "Q6UXB8", "KGGETSEMYLIQPDSSVKPYR", "YNSQNQSNNQFVLYR", "IHWESASLLR", "HYTNPSQDVTVPC(UniMod_4)PVPPPPPC(UniMod_4)C(UniMod_4)HPR", "QC(UniMod_4)VPTEPC(UniMod_4)EDAEDDC(UniMod_4)GNDFQC(UniMod_4)STGR", "SIVVSPILIPENQR", "KPVDEYKDC(UniMod_4)HLAQVPSHTVVAR", "AFPALTSLDLSDNPGLGER", "C(UniMod_4)VDVDEC(UniMod_4)APPAEPC(UniMod_4)GK", "ALDFAVGEYNK"]

In [104]:
def prepare_dataset(train_proteins, train_peptides, selected_features = features):
    """
    Preprocess the protein and peptide dataframes to create a dataset ready for modeling.

    Parameters:
        train_proteins (pd.DataFrame): Protein dataframe.
        train_peptides (pd.DataFrame): Peptide dataframe.

    Returns:
        pd.DataFrame: Preprocessed feature and target dataset ready for modeling.
    """

    # Step 1: Grouping 
    df_protein_grouped = train_proteins.groupby(['patient_id','visit_id','UniProt'])['NPX'].mean().reset_index()
    df_peptide_grouped = train_peptides.groupby(['patient_id','visit_id','Peptide'])['PeptideAbundance'].mean().reset_index()
  
    # Step 2: Pivoting
    df_protein = df_protein_grouped.pivot(index='visit_id',columns = 'UniProt', values = 'NPX').rename_axis(columns=None).reset_index()
    df_peptide = df_peptide_grouped.pivot(index='visit_id',columns = 'Peptide', values = 'PeptideAbundance').rename_axis(columns=None).reset_index()
    
    # Step 3: Merging
    pro_pep_df = df_protein.merge(df_peptide, on = ['visit_id'], how = 'left')

    # Step 4: Include Patient Id and Visit Month
    pro_pep_df['patient_id'] = pro_pep_df['visit_id'].str.split('_').str[0]
    pro_pep_df['visit_month'] = pro_pep_df['visit_id'].str.split('_').str[1]

    # Step 5: Handling missing values - fill median groupby patient
    warnings.filterwarnings("ignore", category=RuntimeWarning, message="Mean of empty slice") # Filter out the specific warning

    unique_protein = np.unique(train_proteins['UniProt'])
    unique_peptide = np.unique(train_peptides['Peptide'])
    for i in unique_protein: # Proteins
      pro_pep_df[i] = pro_pep_df.groupby('patient_id')[i].transform(lambda x: x.fillna(x.median()))
    for i in unique_peptide: # Peptides
      pro_pep_df[i] = pro_pep_df.groupby('patient_id')[i].transform(lambda x: x.fillna(x.median()))
    
    warnings.filterwarnings("default", category=RuntimeWarning) # Restore the warning filter to its default state

    # Step 6: Save visit_id, visit_month and patient_id separately and remove them for next steps
    df_details = pro_pep_df[['visit_id', 'patient_id', 'visit_month']]
    pro_pep_df_features_only = pro_pep_df.drop(columns=['visit_id', 'patient_id', 'visit_month'])
    
    # Step 7: Remove one of the highly correlated features with high percentage of missing values
    pro_pep_df_features_only = pro_pep_df_features_only[selected_features]
    pro_pep_df = pd.concat([df_details, pro_pep_df_features_only])

    return pro_pep_df
    
    # Step 9: Imputer missing values for remaining features
    if dataset=='train':
      train_clinical = train_clinical[['visit_id', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']]
      melted_df = pd.melt(train_clinical, id_vars=['visit_id'], value_vars=['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'], var_name='updrs_test', value_name='rating')
      pro_pep_cli_df = pro_pep_df.merge(melted_df, on=['visit_id'], how='inner')
      
      details_df = pro_pep_cli_df[['visit_id',	'patient_id',	'visit_month', 'updrs_test']]
      pro_pep_df_features_target = pro_pep_cli_df.drop(columns=['visit_id',	'patient_id',	'visit_month', 'updrs_test']) # Get only features and target
      pro_pep_df_features_target = data_imputer(pro_pep_df_features_target)
    
    elif dataset=='test':
      train_clinical = train_clinical[['visit_id', 'updrs_test']]
      pro_pep_cli_df = pro_pep_df.merge(train_clinical, on=['visit_id'], how='inner')
      
      details_df = pro_pep_cli_df[['visit_id',	'patient_id',	'visit_month', 'updrs_test']]
      pro_pep_df_features_target = pro_pep_cli_df.drop(columns=['visit_id',	'patient_id',	'visit_month', 'updrs_test']) # Get only features and target
      pro_pep_df_features_target = data_imputer(pro_pep_df_features_target)
    
    # Step 10: Merge with details
    pro_pep_cli_df = pd.concat([details_df, pro_pep_df_features_target], axis=1)

    # Step 11: Feature Reduction
    if feature_reduction_method=='PCA':
      target_variance = 0.95 # Set target variance
      if dataset == 'train':
        details_df = pro_pep_cli_df[['visit_id',	'patient_id',	'visit_month', 'updrs_test']]
        features_df = feature_reduction_pca(pro_pep_cli_df.drop(columns=['visit_id',	'patient_id',	'visit_month', 'updrs_test', 'rating']), target_variance=target_variance, dataset=dataset, n_components=n_components)
        features_df = pd.concat([features_df, details_df], axis=1)
        target_df = pro_pep_cli_df[['rating']]
        return features_df, target_df
      
      elif dataset == 'test':
        details_df = pro_pep_cli_df[['visit_id',	'patient_id',	'visit_month', 'updrs_test']]
        features_df = feature_reduction_pca(pro_pep_cli_df.drop(columns=['visit_id',	'patient_id',	'visit_month', 'updrs_test']), target_variance=target_variance, dataset=dataset, n_components=n_components)
        features_df = pd.concat([features_df, details_df], axis=1)
        return features_df



# Change Imputer if necessary
def data_imputer(pro_pep_df_features_only):

  """
    Perform KNN imputation on the input dataframe to fill in missing values.

    Parameters:
        pro_pep_df_features_only (pd.DataFrame): Input dataframe with features only.

    Returns:
        pd.DataFrame: Imputed dataframe with missing values filled in using KNN imputation.
    """

  imputer = KNNImputer(n_neighbors=5) # Create instance for Imputer
  pro_pep_df_imputed = pd.DataFrame(imputer.fit_transform(pro_pep_df_features_only), columns=pro_pep_df_features_only.columns)   # Perform KNN imputation on the dataframe 

  return pro_pep_df_imputed


# Change feature reduction technique if necessary
def feature_reduction_pca(features_df, target_variance, dataset, n_components):

    """
    Perform Principal Component Analysis (PCA) on the features DataFrame.

    Parameters:
        - features_df (DataFrame): DataFrame containing the features to be transformed.
        - target_variance (float): Target cumulative variance to be explained by the selected components.

    Returns:
        - selected_features_df (DataFrame): DataFrame containing the selected features that explain the target variance.

    """

    # Perform PCA on the features DataFrame
    pca = PCA()
    pca.fit(features_df)
    if dataset == 'train':
      # Calculate the cumulative variance explained by each principal component
      cum_variance = np.cumsum(pca.explained_variance_ratio_)

      # Find the index of the first principal component that exceeds the target variance threshold
      n_components = np.argmax(cum_variance >= target_variance) + 1

      print(f'Number of selected PCA Components :{n_components}')

    # Apply PCA transformation to the features DataFrame with the selected number of components
    pca = PCA(n_components=n_components)
    transformed_features = pca.fit_transform(features_df)

    # Create a DataFrame with the selected features that explain the target variance
    selected_features_df = pd.DataFrame(transformed_features, columns=[f'PC{i+1}' for i in range(n_components)])

    return selected_features_df

In [101]:
# Format clinical data

targets = {}
for event in range(1, 5):
    for month in [0, 6, 12, 24]:
        train_clinical[f'updrs_{event}_plus_{month}_months'] = 0

for p_id in train_clinical.patient_id.unique():
    patient_data = train_clinical[train_clinical.patient_id == p_id]
    month_ranges = []
    month_intervals = [0, 6, 12, 24]
    for visit_month in patient_data.visit_month.values:
        month_ranges.append([visit_month, visit_month + 6, visit_month + 12, visit_month + 24])
    for idx, month_range in enumerate(month_ranges):
        for event in range(1, 5):
            updrs_values = patient_data[patient_data.visit_month.isin(month_range)][f'updrs_{event}'].to_list()
            if len(updrs_values) == 4:
                for e, value in enumerate(updrs_values):
                    m = month_range[0]
                    patient_data.loc[patient_data.visit_month == m, [f'updrs_{event}_plus_{month_intervals[e]}_months']] = value
            else:
                patient_data = patient_data[~patient_data.visit_month.isin(month_range)]
    targets[p_id] = patient_data

formatted_clin = pd.concat(targets.values(), ignore_index=True).set_index('visit_id').iloc[:, 7:]
formatted_clin.shape

(954, 16)

In [103]:
# Call function
features_df = prepare_dataset(train_proteins, train_peptides, selected_features = features)
features_df.shape

(2226, 1081)

In [106]:
# Merge with targets

df = formatted_clin.merge(features_df, on='visit_id', how='inner')
df

Unnamed: 0,visit_id,updrs_1_plus_0_months,updrs_1_plus_6_months,updrs_1_plus_12_months,updrs_1_plus_24_months,updrs_2_plus_0_months,updrs_2_plus_6_months,updrs_2_plus_12_months,updrs_2_plus_24_months,updrs_3_plus_0_months,...,KGGETSEMYLIQPDSSVKPYR,YNSQNQSNNQFVLYR,IHWESASLLR,HYTNPSQDVTVPC(UniMod_4)PVPPPPPC(UniMod_4)C(UniMod_4)HPR,QC(UniMod_4)VPTEPC(UniMod_4)EDAEDDC(UniMod_4)GNDFQC(UniMod_4)STGR,SIVVSPILIPENQR,KPVDEYKDC(UniMod_4)HLAQVPSHTVVAR,AFPALTSLDLSDNPGLGER,C(UniMod_4)VDVDEC(UniMod_4)APPAEPC(UniMod_4)GK,ALDFAVGEYNK
0,55_0,10,8,10,16,6,10,10,9,15.0,...,,,,,,,,,,
1,55_6,8,10,7,14,10,10,13,13,34.0,...,,,,,,,,,,
2,55_12,10,7,16,17,10,13,9,18,41.0,...,,,,,,,,,,
3,55_36,17,12,17,23,18,20,16,21,51.0,...,,,,,,,,,,
4,942_6,8,5,6,4,2,2,5,4,21.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,64674_12,9,3,12,11,9,6,12,18,18.0,...,,,,,,,,,,
476,64674_24,12,12,11,11,12,18,18,17,26.0,...,,,,,,,,,,
477,65043_0,2,3,4,4,6,6,7,8,16.0,...,,,,,,,,,,
478,65043_12,4,6,4,2,7,7,8,7,14.0,...,,,,,,,,,,


In [None]:
features_df.to_csv(PATH+'features_df.csv', index=False)
target_df.to_csv(PATH+'target_df.csv', index=False)

# Modeling

In [None]:
def smape(y_true, y_pred):
    """
    Calculate the Symmetric Mean Absolute Percentage Error (SMAPE).

    Parameters:
    - y_true: numpy array or list of true values
    - y_pred: numpy array or list of predicted values

    Returns:
    - smape_score: SMAPE score as a float value
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    smape_score = np.mean(numerator / denominator) * 100.0
    return smape_score

def train_and_evaluate_models(feature_df, target):
    """
    Train and evaluate different regression models using the SMAPE score.

    Parameters:
    - feature_df: pandas DataFrame containing the feature data
    - target: pandas Series containing the target values

    Returns:
    - best_model: the best trained model based on the SMAPE score
    """

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(feature_df, target, test_size=0.2, random_state=42)

    # Initialize the models
    lr = LinearRegression()
    rfr = RandomForestRegressor(random_state=42)
    gbr = GradientBoostingRegressor(random_state=42)
    svr = SVR()

    # Train the models
    lr.fit(X_train, y_train)
    rfr.fit(X_train, y_train)
    gbr.fit(X_train, y_train)
    svr.fit(X_train, y_train)

    # Evaluate the models using SMAPE score
    lr_pred = lr.predict(X_test)
    rfr_pred = rfr.predict(X_test)
    gbr_pred = gbr.predict(X_test)
    svr_pred = svr.predict(X_test)

    lr_smape = smape(y_test, lr_pred)
    rfr_smape = smape(y_test, rfr_pred)
    gbr_smape = smape(y_test, gbr_pred)
    svr_smape = smape(y_test, svr_pred)

    # Find the best model based on SMAPE score
    smape_scores = {
        'Linear Regression': lr_smape,
        'Random Forest': rfr_smape,
        'Gradient Boosting': gbr_smape,
        'SVR': svr_smape
    }

    best_model = min(smape_scores, key=smape_scores.get)

    print("SMAPE Scores:")
    for model, score in smape_scores.items():
        print(f"{model}: {score:.2f}")

    print("Best Model:", best_model)

    # Return the best trained model
    if best_model == 'Linear Regression':
        return lr
    elif best_model == 'Random Forest':
        return rfr
    elif best_model == 'Gradient Boosting':
        return gbr
    elif best_model == 'SVR':
        return svr
    else:
        return None

def find_best_models(feature_df, target_df):
  final_df = pd.concat([feature_df, target_df], axis=1)

  updrs_1 = final_df[final_df['updrs_test'] == 'updrs_1']
  updrs_2 = final_df[final_df['updrs_test'] == 'updrs_2']
  updrs_3 = final_df[final_df['updrs_test'] == 'updrs_3']
  updrs_4 = final_df[final_df['updrs_test'] == 'updrs_4']

  updrs_list = [updrs_1, updrs_2, updrs_3, updrs_4]
  trained_models = {}

  for i, updrs in enumerate(updrs_list):
    name = 'updrs_'+str(i+1)
    trained_models[name] = train_and_evaluate_models(updrs.drop(columns=['visit_id', 'patient_id', 'visit_month', 'updrs_test', 'rating']), updrs['rating'])
    print(trained_models)
    
  return trained_models

In [None]:
trained_models = find_best_models(features_df, target_df)

SMAPE Scores:
Linear Regression: 75.72
Random Forest: 73.73
Gradient Boosting: 73.77
SVR: 72.93
Best Model: SVR
{'updrs_1': SVR()}
SMAPE Scores:
Linear Regression: 104.76
Random Forest: 104.77
Gradient Boosting: 105.42
SVR: 104.66
Best Model: SVR
{'updrs_1': SVR(), 'updrs_2': SVR()}
SMAPE Scores:
Linear Regression: 96.62
Random Forest: 98.73
Gradient Boosting: 99.55
SVR: 97.23
Best Model: Linear Regression
{'updrs_1': SVR(), 'updrs_2': SVR(), 'updrs_3': LinearRegression()}
SMAPE Scores:
Linear Regression: 107.84
Random Forest: 110.13
Gradient Boosting: 109.58
SVR: 115.22
Best Model: Linear Regression
{'updrs_1': SVR(), 'updrs_2': SVR(), 'updrs_3': LinearRegression(), 'updrs_4': LinearRegression()}


In [None]:
trained_models

{'updrs_1': SVR(),
 'updrs_2': SVR(),
 'updrs_3': LinearRegression(),
 'updrs_4': LinearRegression()}

# Test

In [None]:
test_proteins['visit_id'].unique()

array(['50423_0', '3342_6'], dtype=object)

In [None]:
test_features_df = prepare_dataset(test_proteins, test_peptides, test_clinical, feature_reduction_method='PCA', dataset='test', n_components=6)
test_features_df.shape

  visit_id   O00391    O00533   O00584   O14498   O14773   O14791   O15031  \
0   3342_6  14564.0  616804.0  61418.1  27830.1  14959.2  2853.12      NaN   
1  50423_0  33127.9  490742.0  43615.3      NaN  16486.6  2882.42  11963.7   

     O15240   O15394  ...  YVGGQEHFAHLLILR  \
0  223373.0  58863.0  ...        3657330.0   
1  124344.0  57688.1  ...        4137590.0   

   YVM(UniMod_35)LPVADQDQC(UniMod_4)IR  YVMLPVADQDQC(UniMod_4)IR  \
0                              47100.5                 1007150.0   
1                              15217.0                   83253.9   

   YVNKEIQNAVNGVK  YWGVASFLQK  YYC(UniMod_4)FQGNQFLR  YYTYLIMNK  \
0         74598.6    127126.0               210763.0    46207.2   
1        110941.0    112251.0               221372.0    46557.4   

   YYWGGQYTWDMAK  patient_id  visit_month  
0       22031.60        3342            6  
1        8527.23       50423            0  

[2 rows x 1316 columns]


AttributeError: ignored

In [None]:
test_features_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,visit_id,patient_id,visit_month,updrs_test
0,74607940.0,9.640644e-11,0.0,0.0,0.0,0.0,3342_6,3342,6,updrs_1
1,74607940.0,9.640644e-11,0.0,0.0,0.0,0.0,3342_6,3342,6,updrs_2
2,74607940.0,9.640644e-11,-0.0,-0.0,-0.0,-0.0,3342_6,3342,6,updrs_3
3,74607940.0,9.640644e-11,-0.0,-0.0,-0.0,-0.0,3342_6,3342,6,updrs_4
4,-74607940.0,9.640644e-11,0.0,-0.0,-0.0,-0.0,50423_0,50423,0,updrs_1
5,-74607940.0,9.640644e-11,-0.0,0.0,-0.0,-0.0,50423_0,50423,0,updrs_2
6,-74607940.0,9.640644e-11,-0.0,-0.0,0.0,-0.0,50423_0,50423,0,updrs_3
7,-74607940.0,9.640644e-11,-0.0,-0.0,-0.0,0.0,50423_0,50423,0,updrs_4


In [None]:
[x.split('_')[:2] for x in submission['prediction_id']]

[['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['3342', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['50423', '0'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['3342', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '6'],
 ['50423', '

In [None]:
test_clinical

Unnamed: 0,visit_id,visit_month,patient_id,updrs_test,row_id,group_key
0,3342_0,0,3342,updrs_1,3342_0_updrs_1,0
1,3342_0,0,3342,updrs_2,3342_0_updrs_2,0
2,3342_0,0,3342,updrs_3,3342_0_updrs_3,0
3,3342_0,0,3342,updrs_4,3342_0_updrs_4,0
4,50423_0,0,50423,updrs_1,50423_0_updrs_1,0
5,50423_0,0,50423,updrs_2,50423_0_updrs_2,0
6,50423_0,0,50423,updrs_3,50423_0_updrs_3,0
7,50423_0,0,50423,updrs_4,50423_0_updrs_4,0
8,3342_6,6,3342,updrs_1,3342_6_updrs_1,6
9,3342_6,6,3342,updrs_2,3342_6_updrs_2,6


In [None]:
melted_df = pd.melt(train_clinical, id_vars=['visit_id'], value_vars=['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'], var_name='updrs_test', value_name='rating')

# Sort the melted DataFrame by visit_id
melted_df = melted_df.sort_values('visit_id')
melted_df = melted_df.sort_values(['visit_id', 'updrs_test'])

# Reset the index
melted_df = melted_df.reset_index(drop=True)
melted_df

Unnamed: 0,visit_id,updrs_test,rating
0,10053_0,updrs_1,3.0
1,10053_0,updrs_2,0.0
2,10053_0,updrs_3,13.0
3,10053_0,updrs_4,0.0
4,10053_12,updrs_1,4.0
...,...,...,...
10455,942_84,updrs_4,0.0
10456,942_96,updrs_1,6.0
10457,942_96,updrs_2,9.0
10458,942_96,updrs_3,39.0
