In [None]:
# Colab-only setup: mount Google Drive at /content/drive.
# PATH (defined in a later cell) points inside this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

import json

import warnings

In [None]:
# Directory that holds the input CSV files. File names are appended with plain
# string concatenation (PATH + 'name.csv'), so the trailing slash is required.
PATH = '/content/drive/Shareddrives/Interview Preparation/Data Science/Parkinson disease/data/'

In [None]:
# Read the three training tables from PATH (proteins, peptides, clinical),
# in that order, into their module-level names.
train_proteins, train_peptides, train_clinical = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train_proteins.csv', 'train_peptides.csv', 'train_clinical_data.csv')
)

In [None]:
# Read the test tables and the sample submission file from PATH.
test_proteins, test_peptides, test_clinical, submission = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('test_proteins.csv', 'test_peptides.csv', 'test.csv', 'sample_submission.csv')
)

# Data Preprocessing

## Functions

In [None]:
features = ["O00391", "O00533", "O00584", "O14498", "O14773", "O14791", "O15240", "O15394", "O43505", "O60888", "O75144", "O75326", "O94919", "P00441", "P00450", "P00734", "P00736", "P00738", "P00746", "P00747", "P00748", "P00751", "P01008", "P01009", "P01011", "P01019", "P01023", "P01024", "P01031", "P01033", "P01034", "P01042", "P01344", "P01591", "P01594", "P01608", "P01621", "P01717", "P01780", "P01833", "P01834", "P01857", "P01859", "P01860", "P01861", "P01876", "P01877", "P02452", "P02647", "P02649", "P02652", "P02655", "P02656", "P02671", "P02675", "P02679", "P02747", "P02748", "P02749", "P02750", "P02751", "P02753", "P02760", "P02763", "P02765", "P02766", "P02768", "P02774", "P02787", "P02790", "P02792", "P04004", "P04075", "P04156", "P04180", "P04196", "P04207", "P04211", "P04216", "P04217", "P04275", "P04406", "P04433", "P05060", "P05067", "P05090", "P05155", "P05156", "P05408", "P05452", "P05546", "P06310", "P06396", "P06454", "P06681", "P06727", "P07195", "P07225", "P07333", "P07339", "P07602", "P07711", "P07858", "P07998", "P08123", "P08133", "P08253", "P08294", "P08493", "P08571", "P08603", "P08637", "P08697", "P09104", "P09486", "P09871", "P10451", "P10643", "P10645", "P10909", "P11142", "P11277", "P12109", "P13473", "P13521", "P13591", "P13611", "P13671", "P13987", "P14174", "P14314", "P14618", "P16035", "P16070", "P16152", "P16870", "P17174", "P17936", "P18065", "P19021", "P19652", "P19823", "P19827", "P20774", "P20933", "P23083", "P23142", "P24592", "P25311", "P27169", "P30086", "P31997", "P32754", "P35542", "P36222", "P36955", "P36980", "P39060", "P40925", "P41222", "P43121", "P43251", "P43652", "P49588", "P49908", "P51884", "P54289", "P55290", "P60174", "P61278", "P61626", "P61769", "P61916", "P80748", "P98160", "Q02818", "Q06481", "Q08380", "Q12805", "Q12841", "Q12907", "Q13283", "Q13332", "Q13449", "Q13451", "Q13740", "Q14118", "Q14508", "Q14515", "Q14624", "Q15904", "Q16270", "Q16610", "Q562R1", "Q6UX71", "Q6UXB8", "Q6UXD5", "Q7Z3B1", 
"Q7Z5P9", "Q8IWV7", "Q8N2S1", "Q8NBJ4", "Q8NE71", "Q92520", "Q92823", "Q92876", "Q96BZ4", "Q96KN2", "Q96PD5", "Q96S96", "Q99435", "Q99674", "Q99683", "Q99829", "Q99832", "Q99969", "Q9BY67", "Q9HDC9", "Q9NQ79", "Q9NYU2", "Q9UBR2", "Q9UBX5", "Q9UHG2", "Q9UKV8", "Q9UNU6", "Q9Y646", "Q9Y6R7", "AADDTWEPFASGK", "AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K", "AAFTEC(UniMod_4)C(UniMod_4)QAADK", "AATGEC(UniMod_4)TATVGKR", "AATVGSLAGQPLQER", "AAVYHHFISDGVR", "ADDKETC(UniMod_4)FAEEGK", "ADDKETC(UniMod_4)FAEEGKK", "ADDLGKGGNEESTKTGNAGSR", "ADLSGITGAR", "ADQVC(UniMod_4)INLR", "ADRDQYELLC(UniMod_4)LDNTR", "ADSGEGDFLAEGGGVR", "AEAQEAEDQQAR", "AEFAEVSK", "AELQC(UniMod_4)PQPAA", "AESPEVC(UniMod_4)FNEESPK", "AFPALTSLDLSDNPGLGER", "AGAAAGGPGVSGVC(UniMod_4)VC(UniMod_4)K", "AGALNSNDAFVLK", "AGC(UniMod_4)VAESTAVC(UniMod_4)R", "AGDFLEANYMNLQR", "AGKEPGLQIWR", "AGLAASLAGPHSIVGR", "AGLLRPDYALLGHR", "AGLQVYNK", "AIGAVPLIQGEYMIPC(UniMod_4)EK", "AIGYLNTGYQR", "AIPVTQYLK", "AIQLTYNPDESSKPNMIDAATLK", "AKAYLEEEC(UniMod_4)PATLRK", "AKLEEQAQQIR", "AKPALEDLR", "AKWEMPFDPQDTHQSR", "ALANSLAC(UniMod_4)QGK", "ALDFAVGEYNK", "ALEQDLPVNIK", "ALFLETEQLK", "ALGISPFHEHAEVVFTANDSGPR", "ALGISPFHEHAEVVFTANDSGPRR", "ALMSPAGMLR", "ALQDQLVLVAAK", "ALSIGFETC(UniMod_4)R", "ALSSEWKPEIR", "ALTDMPQMR", "ALVQQMEQLR", "ALYLQYTDETFR", "ANAGKPKDPTFIPAPIQAK", "ANRPFLVFIR", "APEAQVSVQPNFQQDK", "APHGPGLIYR", "APLIPMEHC(UniMod_4)TTR", "AQAGANTRPC(UniMod_4)PS", "AQC(UniMod_4)GGGLLGVR", "AQGFTEDTIVFLPQTDK", "AQPVQVAEGSEPDGFWEALGGK", "AQTTVTC(UniMod_4)M(UniMod_35)ENGWSPTPR", "AQTTVTC(UniMod_4)MENGWSPTPR", "ARAEAQEAEDQQAR", "ASEGGFTATGQR", "ASGSPEPAISWFR", "ASHEEVEGLVEK", "ASLVPMEHC(UniMod_4)ITR", "ASQSVSSYLAWYQQKPGQAPR", "ASYGVKPR", "ASYLDC(UniMod_4)IR", "ATEDEGSEQKIPEATNR", "ATEDEGSEQKIPEATNRR", "ATEHLSTLSEK", "ATFGC(UniMod_4)HDGYSLDGPEEIEC(UniMod_4)TK", "ATLGPAVRPLPWQR", "ATTVTGTPC(UniMod_4)QDWAAQEPHR", "ATVNPSAPR", "ATVVYQGER", "ATWSGAVLAGR", "AVC(UniMod_4)SQEAMTGPC(UniMod_4)R", "AVDTWSWGER", 
"AVGDKLPEC(UniMod_4)EADDGC(UniMod_4)PKPPEIAHGYVEHSVR", "AVISPGFDVFAK", "AVLPTGDVIGDSAK", "AVSEKEVDSGNDIYGNPIKR", "AVVVHAGEDDLGR", "AYKSELEEQLTPVAEETR", "AYLEEEC(UniMod_4)PATLR", "AYLEEEC(UniMod_4)PATLRK", "AYQGVAAPFPK", "C(UniMod_4)AEENC(UniMod_4)FIQK", "C(UniMod_4)APFFYGGC(UniMod_4)GGNR", "C(UniMod_4)C(UniMod_4)AAADPHEC(UniMod_4)YAK", "C(UniMod_4)C(UniMod_4)ESASEDC(UniMod_4)MAK", "C(UniMod_4)C(UniMod_4)ESASEDC(UniMod_4)MAKELPEHTVK", "C(UniMod_4)C(UniMod_4)TESLVNR", "C(UniMod_4)C(UniMod_4)VEC(UniMod_4)PPC(UniMod_4)PAPPVAGPSVFLFPPKPK", "C(UniMod_4)DSSPDSAEDVRK", "C(UniMod_4)FLAFTQTK", "C(UniMod_4)FSGQC(UniMod_4)ISK", "C(UniMod_4)IIEVLSNALSK", "C(UniMod_4)KPVNTFVHEPLVDVQNVC(UniMod_4)FQEK", "C(UniMod_4)LAFEC(UniMod_4)PENYR", "C(UniMod_4)LAFEC(UniMod_4)PENYRR", "C(UniMod_4)LAPLEGAR", "C(UniMod_4)LKDGAGDVAFVK", "C(UniMod_4)LPVTAPENGK", "C(UniMod_4)LVEKGDVAFVK", "C(UniMod_4)LVEKGDVAFVKHQTVPQNTGGK", "C(UniMod_4)LVNLIEK", "C(UniMod_4)MC(UniMod_4)PAENPGC(UniMod_4)R", "C(UniMod_4)NLLAEK", "C(UniMod_4)PFPSRPDNGFVNYPAKPTLYYK", "C(UniMod_4)PNPPVQENFDVNK", "C(UniMod_4)PNPPVQENFDVNKYLGR", "C(UniMod_4)QC(UniMod_4)DELC(UniMod_4)SYYQSC(UniMod_4)C(UniMod_4)TDYTAEC(UniMod_4)KPQVTR", "C(UniMod_4)RDQLPYIC(UniMod_4)QFGIV", "C(UniMod_4)REILSVDC(UniMod_4)STNNPSQAK", "C(UniMod_4)STSSLLEAC(UniMod_4)TFR", "C(UniMod_4)SYTEDAQC(UniMod_4)IDGTIEVPK", "C(UniMod_4)TSTGWIPAPR", "C(UniMod_4)TTPPPSSGPTYQC(UniMod_4)LK", "C(UniMod_4)VC(UniMod_4)PVSNAMC(UniMod_4)R", "C(UniMod_4)VDVDEC(UniMod_4)APPAEPC(UniMod_4)GK", "C(UniMod_4)VNHYGGYLC(UniMod_4)LPK", "DAC(UniMod_4)GC(UniMod_4)C(UniMod_4)PMC(UniMod_4)AR", "DALSSVQESQVAQQAR", "DASGVTFTWTPSSGK", "DC(UniMod_4)GSVDGVIKEVNVSPC(UniMod_4)PTQPC(UniMod_4)QLSK", "DC(UniMod_4)HLAQVPSHTVVAR", "DC(UniMod_4)SLPYATESK", "DDKETC(UniMod_4)FAEEGKK", "DDNPNLPR", "DEPPQSPWDR", "DFADIPNLR", "DGAGDVAFVK", "DGGFC(UniMod_4)EVC(UniMod_4)K", "DGGFC(UniMod_4)EVC(UniMod_4)KK", "DGLKYHYLIR", "DGWSAQPTC(UniMod_4)IK", "DIPMNPMC(UniMod_4)IYR", "DKETC(UniMod_4)FAEEGKK", 
"DKLAAC(UniMod_4)LEGNC(UniMod_4)AEGLGTNYR", "DLATVYVDVLK", "DLGEENFK", "DLLFKDSAHGFLK", "DLLFRDDTVC(UniMod_4)LAK", "DLLLPQPDLR", "DNC(UniMod_4)C(UniMod_4)ILDER", "DNWVFC(UniMod_4)GGK", "DPTFIPAPIQAK", "DQGNQEQDPNISNGEEEEEKEPGEVGTHNDNQER", "DQPFTILYR", "DQTVSDNELQEM(UniMod_35)SNQGSK", "DQTVSDNELQEMSNQGSK", "DRLDEVKEQVAEVR", "DSGEGDFLAEGGGVR", "DSGFQM(UniMod_35)NQLR", "DSGFQMNQLR", "DSGRDYVSQFEGSALGK", "DTDTGALLFIGK", "DTSC(UniMod_4)VNPPTVQNAYIVSR", "DTVIKPLLVEPEGLEK", "DVC(UniMod_4)KNYAEAK", "DVLLEAC(UniMod_4)C(UniMod_4)ADGHR", "DVRDYFMPC(UniMod_4)PGR", "DWHGVPGQVDAAMAGR", "DYFMPC(UniMod_4)PGR", "DYVSQFEGSALGK", "EAEEETTNDNGVLVLEPARK", "EC(UniMod_4)EEIIR", "EDC(UniMod_4)NELPPRR", "EDSLEAGLPLQVR", "EFQLFSSPHGK", "EFTRPEEIIFLR", "EGQEC(UniMod_4)GVYTPNC(UniMod_4)APGLQC(UniMod_4)HPPKDDEAPLR", "EGTC(UniMod_4)PEAPTDEC(UniMod_4)KPVK", "EGVQKEDIPPADLSDQVPDTESETR", "EGYYGYTGAFR", "EHVAHLLFLR", "EIGELYLPK", "EILSVDC(UniMod_4)STNNPSQAK", "EIQNAVNGVK", "EIVLTQSPATLSLSPGER", "EKLQDEDLGFL", "ELDESLQVAER", "ELDLNSVLLK", "ELLESYIDGR", "ELSSFIDKGQELC(UniMod_4)ADYSENTFTEYK", "ELSSFIDKGQELC(UniMod_4)ADYSENTFTEYKK", "EPQVYTLPPSRDELTK", "EPTMYGEILSPNYPQAYPSEVEK", "EQLSLLDRFTEDAKR", "EQLTPLIK", "EQPPSLTR", "ESAYLYAR", "ESLQQMAEVTR", "ETLLQDFR", "ETYGEMADC(UniMod_4)C(UniMod_4)AK", "EVGPTNADPVC(UniMod_4)LAK", "EVNVSPC(UniMod_4)PTQPC(UniMod_4)QLSK", "EVPLNTIIFMGR", "EVQPVELPNC(UniMod_4)NLVK", "EVVSLTEAC(UniMod_4)C(UniMod_4)AEGADPDC(UniMod_4)YDTR", "EWVAIESDSVQPVPR", "EYC(UniMod_4)GVPGDGDEELLR", "FDEFFSEGC(UniMod_4)APGSK", "FDEFFSEGC(UniMod_4)APGSKK", "FDGILGMAYPR", "FEHC(UniMod_4)NFNDVTTR", "FFLC(UniMod_4)QVAGDAK", "FIYGGC(UniMod_4)GGNR", "FKDLGEENFK", "FLATTPNSLLVSWQPPR", "FLC(UniMod_4)TGGVSPYADPNTC(UniMod_4)R", "FLENEDR", "FLENEDRR", "FLVNLVK", "FM(UniMod_35)ETVAEK", "FMETVAEK", "FMQAVTGWK", "FNALQYLR", "FNKNNEGTYYSPNYNPQSR", "FNKPFVFLM(UniMod_35)IEQNTK", "FQNALLVR", "FQPTLLTLPR", "FQSVFTVTR", "FSALEVDETYVPK", "FSC(UniMod_4)FQEEAPQPHYQLR", "FSC(UniMod_4)MC(UniMod_4)PQGYQVVR", 
"FSGTWYAMAK", "FSPATHPSEGLEENYC(UniMod_4)RNPDNDPQGPWC(UniMod_4)YTTDPEKR", "FSVVYAK", "FTDSENVC(UniMod_4)QER", "FTEC(UniMod_4)C(UniMod_4)QAADK", "FTFEYSR", "FTNIGPDTMR", "FTQVTPTSLSAQWTPPNVQLTGYR", "FVVTDGGITR", "FYNQVSTPLLR", "GAAPPKQEFLDIEDP", "GAQTLYVPNC(UniMod_4)DHR", "GAQTQTEEEMTR", "GATLALTQVTPQDER", "GATTSPGVYELSSR", "GAYPLSIEPIGVR", "GC(UniMod_4)PTEEGC(UniMod_4)GER", "GC(UniMod_4)SFLPDPYQK", "GDFSSANNRDNTYNR", "GDKVWVYPPEKK", "GEAAGAVQELAR", "GEAGAPGEEDIQGPTK", "GEC(UniMod_4)WC(UniMod_4)VNPNTGK", "GEVQAMLGQSTEELR", "GEVQAMLGQSTEELRVR", "GGETSEMYLIQPDSSVKPYR", "GGSTSYGTGSETESPR", "GGTLGTPQTGSENDALYEYLR", "GINVALANGK", "GKRPYQEGTPC(UniMod_4)SQC(UniMod_4)PSGYHC(UniMod_4)K", "GLETSLAEC(UniMod_4)TFTK", "GLGEISAASEFK", "GLPAPIEK", "GLSAEPGWQAK", "GLVSWGNIPC(UniMod_4)GSK", "GLYDVVSVLR", "GMDSC(UniMod_4)KGDSGGAFAVQDPNDKTK", "GMFNIQHC(UniMod_4)K", "GNPEPTFSWTK", "GNQWVGYDDQESVK", "GNSYFMVEVK", "GPSVFPLAPSSK", "GQEDASPDRFSNTDYAVGYMLR", "GQSISVTSIRPC(UniMod_4)AAETQ", "GRPGPQPWC(UniMod_4)ATTPNFDQDQR", "GRTC(UniMod_4)PKPDDLPFSTVVPLK", "GSESGIFTNTK", "GSFAC(UniMod_4)QC(UniMod_4)PPGYQK", "GSPAINVAVHVFR", "GSPAINVAVHVFRK", "GSPSGEVSHPR", "GSQTQSHPDLGTEGC(UniMod_4)WDQLSAPR", "GVALADFNR", "GVASLFAGR", "GWVTDGFSSLK", "GYC(UniMod_4)APGMEC(UniMod_4)VK", "GYHLNEEGTR", "GYPGVQAPEDLEWER", "GYQLSDVDGVTC(UniMod_4)EDIDEC(UniMod_4)ALPTGGHIC(UniMod_4)SYR", "GYTQQLAFR", "HGGLYHENMR", "HGNVAEGETKPDPDVTER", "HGPGLIYR", "HGSPVDIC(UniMod_4)TAKPR", "HGTC(UniMod_4)AAQVDALNSQKK", "HKVYAC(UniMod_4)EVTHQGLSSPVTK", "HLDSVLQQLQTEVYR", "HLSLLTTLSNR", "HPDEAAFFDTASTGK", "HPNSPLDEENLTQENQDR", "HPNVFGFC(UniMod_4)R", "HQFLLTGDTQGR", "HQPQEFPTYVEPTNDEIC(UniMod_4)EAFR", "HQPQEFPTYVEPTNDEIC(UniMod_4)EAFRK", "HQPQEFPTYVEPTNDEIC(UniMod_4)EAFRKDPK", "HQTVPQNTGGKNPDPWAK", "HRLEDMEQALSPSVFK", "HSIFTPETNPR", "HSTIFENLANK", "HTFM(UniMod_35)GVVSLGSPSGEVSHPR", "HTFMGVVSLGSPSGEVSHPR", "HTSVQTTSSGSGPFTDVR", "HYDGSYSTFGER", "HYEGSTVPEK", "HYTNPSQDVTVPC(UniMod_4)PVPPPPPC(UniMod_4)C(UniMod_4)HPR", "IALVITDGR", 
"IAPQLSTEELVSLGEK", "IASFSQNC(UniMod_4)DIYPGKDFVQPPTK", "IC(UniMod_4)ANVFC(UniMod_4)GAGR", "IC(UniMod_4)LEDNVLM(UniMod_35)SGVK", "IC(UniMod_4)LEDNVLMSGVK", "IC(UniMod_4)VGC(UniMod_4)PR", "IDQNVEELKGR", "IDQTVEELRR", "IEC(UniMod_4)VSAETTEDC(UniMod_4)IAK", "IEIPSSVQQVPTIIK", "IEKVEHSDLSFSK", "IFFYDSENPPASEVLR", "IFSFDGKDVLR", "IGADFLAR", "IGDQWDKQHDMGHMMR", "IGDTWSK", "IHWESASLLR", "IKPVFIEDANFGR", "ILAGSADSEGVAAPR", "ILEVVNQIQDEER", "ILGPLSYSK", "ILGQQVPYATK", "ILLQGTPVAQMTEDAVDAER", "IMNGEADAMSLDGGFVYIAGK", "INENTGSVSVTR", "INHC(UniMod_4)RFDEFFSEGC(UniMod_4)APGSKK", "IPIEDGSGEVVLSR", "IPIEDGSGEVVLSRK", "IPTTFENGR", "IQPSGGTNINEALLR", "ISASAEELR", "ISC(UniMod_4)TIANR", "ISLPESLK", "ISLPESLKR", "ISYGNDALMPSLTETK", "ITGYIIK", "ITTTSPWMFPSR", "ITYGETGGNSPVQEFTVPGSK", "IVQIYKDLLR", "IVSSAM(UniMod_35)EPDREYHFGQAVR", "IVSSAMEPDREYHFGQAVR", "IWDVVEK", "IYISGMAPR", "IYISGMAPRPSLAK", "IYLYTLNDNAR", "KAADDTWEPFASGK", "KAEEEHLGILGPQLHADVGDKVK", "KASYLDC(UniMod_4)IR", "KATEDEGSEQKIPEATNR", "KC(UniMod_4)C(UniMod_4)VEC(UniMod_4)PPC(UniMod_4)PAPPVAGPSVFLFPPKPK", "KC(UniMod_4)STSSLLEAC(UniMod_4)TFR", "KC(UniMod_4)SYTEDAQC(UniMod_4)IDGTIEVPK", "KDSGFQM(UniMod_35)NQLR", "KDSGFQMNQLR", "KEC(UniMod_4)C(UniMod_4)EKPLLEK", "KFPSGTFEQVSQLVK", "KGGETSEMYLIQPDSSVKPYR", "KIYPTVNC(UniMod_4)QPLGMISLMK", "KLC(UniMod_4)MAALK", "KLGQSLDC(UniMod_4)NAEVYVVPWEK", "KLINDYVK", "KLSENTDFLAPGVSSFTDSNQQESITK", "KLSSWVLLM(UniMod_35)K", "KLSSWVLLMK", "KLVGYLDR", "KLVPFATELHER", "KLYDYC(UniMod_4)DVPQC(UniMod_4)AAPSFDC(UniMod_4)GKPQVEPK", "KMTVTDQVNC(UniMod_4)PK", "KPALEDLR", "KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK", "KPVDEYKDC(UniMod_4)HLAQVPSHTVVAR", "KPVEEYANC(UniMod_4)HLAR", "KQELSEAEQATR", "KQINDYVEK", "KQINDYVEKGTQGK", "KQTALVELVK", "KREEAPSLRPAPPPISGGGYR", "KSASDLTWDNLK", "KSPELQAEAK", "KSQPMGLWR", "KTLLSNLEEAK", "KTLLSNLEEAKK", "KTSLEDFYLDEER", "KVEQAVETEPEPELR", "KVESELIKPINPR", "KVLLDGVQNPR", "KVPQVSTPTLVEVSR", "KVTYTSQEDLVEKK", "KYFIDFVAR", "KYLYEIAR", "LAAAVSNFGYDLYR", 
"LAAC(UniMod_4)GPPPVAPPAAVAAVAGGAR", "LAARLEALKENGGAR", "LAC(UniMod_4)C(UniMod_4)VVGVC(UniMod_4)GPGLWER", "LADGGATNQGRVEIFYR", "LAPLAEDVR", "LATVGELQAAWR", "LAVTTHGLPC(UniMod_4)LAWASAQAK", "LAVYQAGAR", "LC(UniMod_4)DNLSTK", "LC(UniMod_4)MAALK", "LC(UniMod_4)MGSGLNLC(UniMod_4)EPNNK", "LC(UniMod_4)MGSGLNLC(UniMod_4)EPNNKEGYYGYTGAFR", "LC(UniMod_4)QDLGPGAFR", "LC(UniMod_4)TVATLR", "LDELRDEGK", "LDEVKEQVAEVR", "LDIDSPPITAR", "LEAGDHPVELLAR", "LEDMEQALSPSVFK", "LEEQAQQIR", "LEGEAC(UniMod_4)GVYTPR", "LEGQEEEEDNRDSSMK", "LEPGQQEEYYR", "LEPYADQLR", "LETPDFQLFK", "LFAC(UniMod_4)SNK", "LFDSDPITVTVPVEVSR", "LFGGNFAHQASVAR", "LGADM(UniMod_35)EDVC(UniMod_4)GR", "LGADMEDVC(UniMod_4)GR", "LGC(UniMod_4)SLNQNSVPDIHGVEAPAR", "LGEVNTYAGDLQK", "LGMFNIQHC(UniMod_4)K", "LGNQEPGGQTALK", "LGPGMADIC(UniMod_4)K", "LGPLVEQGR", "LGPLVEQGRVR", "LGQSLDC(UniMod_4)NAEVYVVPWEK", "LGVRPSQGGEAPR", "LHDRNTYEKYLGEEYVK", "LHLDYIGPC(UniMod_4)K", "LIADLGSTSITNLGFR", "LINDYVK", "LIVHNGYC(UniMod_4)DGR", "LKC(UniMod_4)DEWSVNSVGK", "LKC(UniMod_4)DEWSVNSVGKIEC(UniMod_4)VSAETTEDC(UniMod_4)IAK", "LKEDAVSAAFK", "LLAGDHPIDLLLR", "LLC(UniMod_4)QC(UniMod_4)LGFGSGHFR", "LLDNWDSVTSTFSK", "LLDSLPSDTR", "LLELTGPK", "LLEVPEGR", "LLIYDASNR", "LLPAQLPAEKEVGPPLPQEAVPLQK", "LLRDPADASEAHESSSR", "LMVELHNLYR", "LNMHMNVQNGKWDSDPSGTK", "LPPTSAHGNVAEGETKPDPDVTER", "LQAEAFQAR", "LQDLYSIVR", "LQPLDFKENAEQSR", "LQSIGTENTEENRR", "LQSLFDSPDFSK", "LRENELTYYC(UniMod_4)C(UniMod_4)K", "LRENELTYYC(UniMod_4)C(UniMod_4)KK", "LRTEGDGVYTLNNEK", "LRTEGDGVYTLNNEKQWINK", "LSELIQPLPLER", "LSINTHPSQKPLSITVR", "LSITGTYDLK", "LSKELQAAQAR", "LSNENHGIAQR", "LSPEDYTLK", "LSPIYNLVPVK", "LSPLGEEMR", "LSSWVLLM(UniMod_35)K", "LSSWVLLMK", "LSYEGEVTK", "LSYTC(UniMod_4)EGGFR", "LTASAPGYLAITK", "LTPLYELVK", "LVAYYTLIGASGQR", "LVDGKGVPIPNKVIFIR", "LVDKFLEDVKK", "LVFFAEDVGSNK", "LVGGPM(UniMod_35)DASVEEEGVRR", "LVGGPMDASVEEEGVR", "LVGGPMDASVEEEGVRR", "LVGYLDR", "LVMGIPTFGR", "LVNEVTEFAK", "LVPPMEEDYPQFGSPK", "LVWEEAMSR", "LYEQLSGK", "LYGSEAFATDFQDSAAAK", 
"LYGSEAFATDFQDSAAAKK", "LYHSEAFTVNFGDTEEAKK", "LYQQHGAGLFDVTR", "M(UniMod_35)ADEAGSEADHEGTHSTKR", "M(UniMod_35)ASGAANVVGPK", "M(UniMod_35)C(UniMod_4)PQLQQYEMHGPEGLR", "M(UniMod_35)ELERPGGNEITR", "M(UniMod_35)LTPEHVFIHPGWK", "M(UniMod_35)TVTDQVNC(UniMod_4)PK", "M(UniMod_35)VQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR", "M(UniMod_35)YLGYEYVTAIR", "MADEAGSEADHEGTHSTKR", "MASGAANVVGPK", "MATLYSR", "MC(UniMod_4)PQLQQYEMHGPEGLR", "MC(UniMod_4)VDVNEC(UniMod_4)QR", "MDASLGNLFAR", "MDYPKQTQVSVLPEGGETPLFK", "MELERPGGNEITR", "MFTTAPDQVDKEDEDFQESNK", "MGNFPWQVFTNIHGR", "MKYWGVASFLQK", "MMAVAADTLQR", "MNFRPGVLSSR", "MPC(UniMod_4)AELVREPGC(UniMod_4)GC(UniMod_4)C(UniMod_4)SVC(UniMod_4)AR", "MPYEC(UniMod_4)GPSLDVC(UniMod_4)AQDER", "MTVTDQVNC(UniMod_4)PK", "MVQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR", "MYLGYEYVTAIR", "MYYSAVDPTKDIFTGLIGPM(UniMod_35)K", "NANTFISPQQR", "NFGYTLR", "NFPPSQDASGDLYTTSSQLTLPATQC(UniMod_4)LAGK", "NFPPSQDASGDLYTTSSQLTLPATQC(UniMod_4)PDGK", "NFPSPVDAAFR", "NGFYPATR", "NIGEFC(UniMod_4)GK", "NIINSDGGPYVC(UniMod_4)R", "NILDRQDPPSVVVTSHQAPGEK", "NILTSNNIDVK", "NILTSNNIDVKK", "NKFDPSLTQR", "NLAVSQVVHK", "NLHGDGIALWYTR", "NLNEKDYELLC(UniMod_4)LDGTR", "NLQPASEYTVSLVAIKGNQESPK", "NLREGTC(UniMod_4)PEAPTDEC(UniMod_4)KPVK", "NNEGTYYSPNYNPQSR", "NPC(UniMod_4)QDPYILTPENR", "NPDGDVGGPWC(UniMod_4)YTTNPR", "NPDPWAK", "NPDSSTTGPWC(UniMod_4)YTTDPTVR", "NQEQVSPLTLLK", "NRDHDTFLAVR", "NSKFEDC(UniMod_4)C(UniMod_4)QEK", "NSLFEYQK", "NSPLDEENLTQENQDR", "NSVIKVPMMNSK", "NSWGEEWGMGGYVK", "NTEILTGSWSDQTYPEGTQAIYK", "NTFAEVTGLSPGVTYYFK", "NTGIIC(UniMod_4)TIGPASR", "NVVYTC(UniMod_4)NEGYSLIGNPVAR", "NWGLGGHAFC(UniMod_4)R", "PALEDLR", "PIVTSPYQIHFTK", "PNHAVVTR", "PPTSAHGNVAEGETKPDPDVTER", "PTLVEVSR", "QALNTDYLDSDYQR", "QC(UniMod_4)VPTEPC(UniMod_4)EDAEDDC(UniMod_4)GNDFQC(UniMod_4)STGR", "QDGSVDFGR", "QELSEAEQATR", "QEPERNEC(UniMod_4)FLQHKDDNPNLPR", "QFTSSTSYNR", "QGIPFFGQVR", "QGVNDNEEGFFSAR", "QHM(UniMod_35)DSDSSPSSSSTYC(UniMod_4)NQMMR", 
"QHMDSDSSPSSSSTYC(UniMod_4)NQMMR", "QHVVYGPWNLPQSSYSHLTR", "QIGSVYR", "QINDYVEK", "QINDYVEKGTQGK", "QKPDGVFQEDAPVIHQEMIGGLR", "QKVEPLRAELQEGAR", "QKWEAEPVYVQR", "QLKEHAVEGDC(UniMod_4)DFQLLKLDGK", "QLNEINYEDHK", "QNC(UniMod_4)ELFEQLGEYK", "QPSSAFAAFVK", "QQETAAAETETR", "QQLVETHMAR", "QQTEWQSGQR", "QQTHMLDVMQDHFSR", "QRQEELC(UniMod_4)LAR", "QSGLYFIKPLK", "QSTNAYPDLR", "QTALVELVK", "QTHQPPAPNSLIR", "QTQVSVLPEGGETPLFK", "QTVSWAVTPK", "QVVAGLNFR", "QWAGLVEK", "QYNVGPSVSKYPLR", "QYTDSTFRVPVER", "RELDESLQVAER", "RGEQC(UniMod_4)VDIDEC(UniMod_4)TIPPYC(UniMod_4)HQR", "RGYQLSDVDGVTC(UniMod_4)EDIDEC(UniMod_4)ALPTGGHIC(UniMod_4)SYR", "RIPIEDGSGEVVLSRK", "RLEAGDHPVELLAR", "RLEGQEEEEDNRDSSMK", "RLGMFNIQHC(UniMod_4)K", "RLPIGSYF", "RLYGSEAFATDFQDSAAAK", "RNTEILTGSWSDQTYPEGTQAIYK", "RPC(UniMod_4)FSALEVDETYVPK", "RPDSLQHVLLPVLDR", "RPGGEPSPEGTTGQSYNQYSQR", "RQEC(UniMod_4)SIPVC(UniMod_4)GQDQVTVAMTPR", "RTHLPEVFLSK", "RTPITVVK", "RVDTVDPPYPR", "RVWELSK", "RYIETDPANRDR", "SASDLTWDNLK", "SASDLTWDNLKGK", "SASLHLPK", "SC(UniMod_4)C(UniMod_4)EEQNKVNC(UniMod_4)LQTR", "SC(UniMod_4)DIPVFMNAR", "SC(UniMod_4)DKTHTC(UniMod_4)PPC(UniMod_4)PAPELLGGPSVFLFPPKPK", "SC(UniMod_4)DNPYIPNGDYSPLR", "SC(UniMod_4)DTPPPC(UniMod_4)PR", "SC(UniMod_4)ESNSPFPVHPGTAEC(UniMod_4)C(UniMod_4)TK", "SC(UniMod_4)VGETTESTQC(UniMod_4)EDEELEHLR", "SDDKVTLEER", "SDVMYTDWK", "SDVMYTDWKK", "SDVVYTDWK", "SDVVYTDWKK", "SEALAVDGAGKPGAEEAQDPEGK", "SEGSSVNLSPPLEQC(UniMod_4)VPDRGQQYQGR", "SELEEQLTPVAEETR", "SETKDLLFRDDTVC(UniMod_4)LAK", "SEYPSIK", "SFNRGEC(UniMod_4)", "SFPTYC(UniMod_4)QQK", "SFQTGLFTAAR", "SGEATDGARPQALPEPMQESK", "SGELEQEEER", "SGIEC(UniMod_4)QLWR", "SGINC(UniMod_4)PIQK", "SGIPIVTSPYQIHFTK", "SGLSTGWTQLSK", "SGSAHEYSSSPDDAIFQSLAR", "SGSDEVQVGQQR", "SIAQYWLGC(UniMod_4)PAPGHL", "SIDVAC(UniMod_4)HPGYALPK", "SILENLR", "SIVVSPILIPENQR", "SKEFQLFSSPHGK", "SKVPPPRDFHINLFR", "SLAELGGHLDQQVEEFRR", "SLAPYAQDTQEK", "SLDFTELDVAAEK", "SLEDLQLTHNKITK", "SLEDQVEMLR", "SLGEC(UniMod_4)C(UniMod_4)DVEDSTTC(UniMod_4)FNAK", 
"SLGNVIMVC(UniMod_4)R", "SLGSPSGEVSHPR", "SLHTLFGDK", "SLPSEASEQYLTK", "SMEQNGPGLEYR", "SNAQGIDLNR", "SNLDEDIIAEENIVSR", "SNSSM(UniMod_35)HITDC(UniMod_4)R", "SNSSMHITDC(UniMod_4)R", "SPAINVAVHVFR", "SPELQAEAK", "SPFEQHIK", "SPMYSIITPNILR", "SPVDIC(UniMod_4)TAKPR", "SPVGVQPILNEHTFC(UniMod_4)AGMSK", "SRYPVC(UniMod_4)GSDGTTYPSGC(UniMod_4)QLR", "SSALDMENFR", "SSFVAPLEK", "SSGLVSNAPGVQIR", "SSGQWQTPGATR", "SSNLIILEEHLK", "SSNTYTLTDVRR", "SSPVVIDASTAIDAPSNLR", "SSQGGSLPSEEK", "STGGISVPGPMGPSGPR", "STNLHDYGMLLPC(UniMod_4)GIDK", "STSSFPC(UniMod_4)PAGHFNGFR", "STTPDITGYR", "SVIPSDGPSVAC(UniMod_4)VK", "SVIPSDGPSVAC(UniMod_4)VKK", "SVLGQLGITK", "SVPMVPPGIK", "SVPPSASHVAPTETFTYEWTVPK", "SWFEPLVEDM(UniMod_35)QR", "SYLSMVGSC(UniMod_4)C(UniMod_4)TSASPTVC(UniMod_4)FLKER", "SYTITGLQPGTDYK", "TAKDALSSVQESQVAQQAR", "TASDFITK", "TATSEYQTFFNPR", "TC(UniMod_4)VADESAENC(UniMod_4)DK", "TC(UniMod_4)VADESAENC(UniMod_4)DKSLHTLFGDK", "TDASDVKPC(UniMod_4)", "TEGDGVYTLNNEK", "TEHLSTLSEK", "TEIDKPSQMQVTDVQDNSISVK", "TELLPGDRDNLAIQTR", "TFHEASEDC(UniMod_4)ISR", "TFISPIK", "TFTC(UniMod_4)TAAYPESK", "TFTLLDPK", "TFYEPGEEITYSC(UniMod_4)KPGYVSR", "TFYSC(UniMod_4)TTEGR", "TGAQELLR", "TGDEITYQC(UniMod_4)R", "TGYYFDGISR", "THLAPYSDELR", "THLAPYSDELRQR", "THLGEALAPLSK", "THLPEVFLSK", "THPHFVIPYR", "TINPAVDHC(UniMod_4)C(UniMod_4)K", "TKEEYGHSEVVEYYC(UniMod_4)NPR", "TKKQELSEAEQATR", "TLEAQLTPR", "TLKIENVSYQDKGNYR", "TLLSNLEEAK", "TLLSNLEEAKK", "TLVVHEKADDLGKGGNEESTK", "TLYSSSPR", "TM(UniMod_35)LLQPAGSLGSYSYR", "TM(UniMod_35)QENSTPRED", "TMLLQPAGSLGSYSYR", "TMQALPYSTVGNSNNYLHLSVLR", "TMQENSTPRED", "TMSEVGGSVEDLIAK", "TNFDNDIALVR", "TNTNVNC(UniMod_4)PIEC(UniMod_4)FMPLDVQADREDSRE", "TPC(UniMod_4)QQELDQVLER", "TPC(UniMod_4)TVSC(UniMod_4)NIPVVSGKEC(UniMod_4)EEIIR", "TPENYPNAGLTMNYC(UniMod_4)R", "TPEVTC(UniMod_4)VVVDVSHEDPEVK", "TPLGDTTHTC(UniMod_4)PR", "TPLPPTSAHGNVAEGETKPDPDVTER", "TPLTATLSK", "TPSAAYLWVGTGASEAEK", "TPSGLYLGTC(UniMod_4)ER", "TPVSDRVTK", "TQVNTQAEQLR", "TQVNTQAEQLRR", "TSAHGNVAEGETKPDPDVTER", 
"TSESGELHGLTTEEEFVEGIYK", "TSLEDFYLDEER", "TSPVDEKALQDQLVLVAAK", "TTPEPC(UniMod_4)ELDDEDFR", "TTPPVLDSDGSFFLYSK", "TVAAC(UniMod_4)NLPIVR", "TVAAPSVFIFPPSDEQLK", "TVFGTEPDMIR", "TVQAVLTVPK", "TVVQPSVGAAAGPVVPPC(UniMod_4)PGR", "TYETTLEK", "TYLGNALVC(UniMod_4)TC(UniMod_4)YGGSR", "VAAGAFQGLR", "VAHQLQALR", "VASYGVKPR", "VAVVQYSGTGQQRPER", "VC(UniMod_4)PFAGILENGAVR", "VC(UniMod_4)SQYAAYGEK", "VDGALC(UniMod_4)MEK", "VDNALQSGNSQESVTEQDSK", "VDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSK", "VDSGNDVTDIADDGC(UniMod_4)PKPPEIAHGYVEHSVR", "VEHSDLSFSK", "VEKPTADAEAYVFTPNMIC(UniMod_4)AGGEK", "VEPLRAELQEGAR", "VEPYGENFNK", "VFAVSHGR", "VFNTPEGVPSAPSSLK", "VFQEPLFYEAPR", "VFSNGADLSGVTEEAPLK", "VFSNGADLSGVTEEAPLKLSK", "VGFYESDVMGR", "VGGVQSLGGTGALR", "VGYVSGWGR", "VHTEC(UniMod_4)C(UniMod_4)HGDLLEC(UniMod_4)ADDR", "VHTEC(UniMod_4)C(UniMod_4)HGDLLEC(UniMod_4)ADDRADLAK", "VHVSEEGTEPEAMLQVLGPKPALPAGTEDTAKEDAANR", "VHVSEEGTEPEAMLQVLGPKPALPAGTEDTAKEDAANRK", "VIAVNEVGR", "VKDISEVVTPR", "VKDLATVYVDVLK", "VKSPELQAEAK", "VLDLSC(UniMod_4)NR", "VLEPTLK", "VLLDGVQNPR", "VLNQELR", "VLSIAQAHSPAFSC(UniMod_4)EQVR", "VLTPTQVK", "VMDKYTFELSR", "VMPIC(UniMod_4)LPSKDYAEVGR", "VMTPAVYAPYDVK", "VNGSPVDNHPFAGDVVFPR", "VNHVTLSQPK", "VPEARPNSMVVEHPEFLK", "VPFDAATLHTSTAMAAQHGMDDDGTGQK", "VPMMSDPK", "VPQVSTPTLVEVSR", "VPVAVQGEDTVQSLTQGDGVAK", "VQAAVGTSAAPVPSDNH", "VQPYLDDFQK", "VQPYLDDFQKK", "VRGGEGTGYFVDFSVR", "VRQGQGQSEPGEYEQR", "VSEADSSNADWVTK", "VSPTDC(UniMod_4)SAVEPEAEK", "VSTLPAITLK", "VTAAPQSVC(UniMod_4)ALR", "VTEIWQEVMQR", "VTGVVLFR", "VTIKPAPETEKRPQDAK", "VTIPTDLIASSGDIIK", "VTLTC(UniMod_4)VAPLSGVDFQLR", "VTSIQDWVQK", "VTTVASHTSDSDVPSGVTEVVVK", "VVEESELAR", "VYAC(UniMod_4)EVTHQGLSSPVTK", "VYC(UniMod_4)DMNTENGGWTVIQNR", "WC(UniMod_4)AVSEHEATK", "WEAEPVYVQR", "WELALGR", "WGYC(UniMod_4)LEPK", "WKNFPSPVDAAFR", "WLPSSSPVTGYR", "WQEEMELYR", "WSGQTAIC(UniMod_4)DNGAGYC(UniMod_4)SNPGIPIGTR", "WSRPQAPITGYR", "WSSTSPHRPR", "WYEIEKIPTTFENGR", "WYFDVTEGK", "YAMVYGYNAAYNR", "YANC(UniMod_4)HLAR", "YFIDFVAR", "YGFIEGHVVIPR", 
"YGLDSDLSC(UniMod_4)K", "YGLVTYATYPK", "YGQTIRPIC(UniMod_4)LPC(UniMod_4)TEGTTR", "YHDRDVWKPEPC(UniMod_4)R", "YIETDPANR", "YIFHNFMER", "YIVSGTPTFVPYLIK", "YKAAFTEC(UniMod_4)C(UniMod_4)QAADK", "YLFLNGNK", "YLGEEYVK", "YLQEIYNSNNQK", "YLYEIAR", "YNSQNQSNNQFVLYR", "YPGPQAEGDSEGLSQGLVDREK", "YPNC(UniMod_4)AYR", "YPSLSIHGIEGAFDEPGTK", "YQC(UniMod_4)YC(UniMod_4)YGR", "YSLTYIYTGLSK", "YTTEIIK", "YVGGQEHFAHLLILR", "YVM(UniMod_35)LPVADQDQC(UniMod_4)IR", "YVMLPVADQDQC(UniMod_4)IR", "YVNKEIQNAVNGVK", "YWGVASFLQK", "YYC(UniMod_4)FQGNQFLR", "YYTYLIMNK", "YYWGGQYTWDMAK"]

In [None]:
def prepare_dataset(train_proteins, train_peptides, train_clinical, feature_reduction_method='PCA', dataset='train', selected_features = features):
    """
    Turn long-format protein/peptide measurements plus clinical data into a modelling table.

    Pipeline: average duplicate measurements, pivot to one row per visit, fill
    gaps with the per-patient median, select features (correlation pruning on
    train, ``selected_features`` on test), attach the clinical rows, KNN-impute
    what is still missing and optionally reduce the features with PCA.

    Parameters:
        train_proteins (pd.DataFrame): Long-format protein table with columns
            'patient_id', 'visit_id', 'UniProt' and 'NPX'.
        train_peptides (pd.DataFrame): Long-format peptide table with columns
            'patient_id', 'visit_id', 'Peptide' and 'PeptideAbundance'.
        train_clinical (pd.DataFrame): Clinical table. For dataset='train' it must
            contain 'visit_id' and 'updrs_1'..'updrs_4'; for dataset='test' it must
            contain 'visit_id' and 'updrs_test'.
        feature_reduction_method (str | None): 'PCA' reduces the features to the
            components explaining 95% of the variance; None skips the reduction.
        dataset (str): 'train' or 'test'.
        selected_features (list[str]): Feature columns to keep when dataset='test'.
            The default is the value the module-level ``features`` list had when
            this function was defined; later rebinding of ``features`` does not
            change it, so pass the list explicitly.

    Returns:
        dataset='train': tuple ``(features_df, target_df, features_after_correlation)``
            where ``features_df`` holds the (reduced) features followed by
            'visit_id', 'patient_id', 'visit_month', 'updrs_test', ``target_df`` holds
            the single column 'rating', and ``features_after_correlation`` is the
            list of feature names kept after correlation pruning.
        dataset='test': ``features_df`` only.

    Raises:
        ValueError: If ``dataset`` or ``feature_reduction_method`` is not supported.
        KeyError: If dataset='test' and a name in ``selected_features`` is absent
            from the pivoted test data.

    Side effects:
        For dataset='train', writes the kept feature names to
        PATH + 'features_after_correlation.json' and prints them.
    """
    if dataset not in ('train', 'test'):
        raise ValueError(f"dataset must be 'train' or 'test', got {dataset!r}")
    if feature_reduction_method not in ('PCA', None):
        raise ValueError(f"feature_reduction_method must be 'PCA' or None, got {feature_reduction_method!r}")

    id_cols = ['visit_id', 'patient_id', 'visit_month']
    detail_cols = id_cols + ['updrs_test']
    updrs_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

    # Step 1: Grouping - average repeated measurements of the same protein/peptide within a visit
    protein_means = train_proteins.groupby(['patient_id', 'visit_id', 'UniProt'])['NPX'].mean().reset_index()
    peptide_means = train_peptides.groupby(['patient_id', 'visit_id', 'Peptide'])['PeptideAbundance'].mean().reset_index()

    # Step 2: Pivoting - one row per visit, one column per protein / peptide
    df_protein = protein_means.pivot(index='visit_id', columns='UniProt', values='NPX').rename_axis(columns=None).reset_index()
    df_peptide = peptide_means.pivot(index='visit_id', columns='Peptide', values='PeptideAbundance').rename_axis(columns=None).reset_index()

    # Step 3: Merging
    pro_pep_df = df_protein.merge(df_peptide, on=['visit_id'], how='left')
    measurement_cols = [col for col in pro_pep_df.columns if col != 'visit_id']

    # Step 4: Include Patient Id and Visit Month.
    # visit_id is split on '_' into patient and month; both parts stay strings.
    visit_parts = pro_pep_df['visit_id'].str.split('_')
    pro_pep_df['patient_id'] = visit_parts.str[0]
    pro_pep_df['visit_month'] = visit_parts.str[1]

    # Step 5: Handling missing values - fill with the median of the same patient.
    # One vectorised transform replaces a Python loop with one groupby per column.
    # catch_warnings restores the caller's warning filters on exit instead of
    # overwriting them globally.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning, message="Mean of empty slice")
        patient_medians = pro_pep_df.groupby('patient_id')[measurement_cols].transform('median')
    pro_pep_df[measurement_cols] = pro_pep_df[measurement_cols].fillna(patient_medians)

    # Step 6: Keep the identifier columns aside and work on the measurements only
    df_details = pro_pep_df[id_cols]
    features_only = pro_pep_df[measurement_cols]

    if dataset == 'train':
        # Step 7: For every pair of perfectly correlated features drop the one with
        # the higher share of missing values. A small tolerance is used because a
        # computed correlation of mathematically identical columns can round just
        # below 1.0. Pairs that involve an already dropped feature are skipped so
        # no column is dropped twice.
        missing_share = features_only.isnull().mean()
        abs_corr = features_only.corr().abs().to_numpy()
        perfect_pairs = np.triu(np.isclose(abs_corr, 1.0, rtol=0.0, atol=1e-9), k=1)
        to_drop = set()
        for i, j in zip(*np.nonzero(perfect_pairs)):
            first, second = features_only.columns[i], features_only.columns[j]
            if first in to_drop or second in to_drop:
                continue
            to_drop.add(first if missing_share[first] > missing_share[second] else second)
        features_only = features_only.drop(columns=sorted(to_drop))

        # Step 8: Save selected features in JSON
        features_after_correlation = features_only.columns.tolist()
        with open(PATH+'features_after_correlation.json', 'w') as file:
            json.dump(features_after_correlation, file)
        print('Features After Correlation: ', features_after_correlation)
    else:
        # Test data must expose exactly the features chosen on the training data.
        features_only = features_only[list(selected_features)]

    # Step 9: Attach the clinical rows to the *selected* features. The pruned frame
    # is merged here; merging the unpruned frame would discard the selection above.
    pro_pep_selected = pd.concat([df_details, features_only], axis=1)
    if dataset == 'train':
        clinical_long = pd.melt(train_clinical[['visit_id'] + updrs_cols], id_vars=['visit_id'], value_vars=updrs_cols, var_name='updrs_test', value_name='rating')
    else:
        clinical_long = train_clinical[['visit_id', 'updrs_test']]
    pro_pep_cli_df = pro_pep_selected.merge(clinical_long, on=['visit_id'], how='inner')

    # Step 10: Impute the remaining missing values.
    # NOTE(review): for dataset='train' the 'rating' target is part of the imputer
    # input, so missing ratings are imputed and the target takes part in the
    # neighbour search. Kept as in the original so the returned target has no NaN;
    # consider imputing the features separately.
    details_df = pro_pep_cli_df[detail_cols]
    imputed = data_imputer(pro_pep_cli_df.drop(columns=detail_cols))

    # Step 11: Feature Reduction
    feature_frame = imputed.drop(columns=['rating']) if dataset == 'train' else imputed
    if feature_reduction_method == 'PCA':
        # NOTE(review): the PCA is fitted on whatever frame is passed in, so the
        # test components come from a separate fit than the training components.
        target_variance = 0.95 # Set target variance
        feature_frame = feature_reduction_pca(feature_frame, target_variance=target_variance)
    features_df = pd.concat([feature_frame, details_df], axis=1)

    if dataset == 'train':
        target_df = imputed[['rating']]
        return features_df, target_df, features_after_correlation
    return features_df


# Change Imputer if necessary
def data_imputer(pro_pep_df_features_only, n_neighbors=5):
    """
    Fill missing values of a numeric dataframe with KNN imputation.

    Parameters:
        pro_pep_df_features_only (pd.DataFrame): Numeric dataframe to impute.
        n_neighbors (int): Number of neighbours used by KNNImputer (default 5).

    Returns:
        pd.DataFrame: Float dataframe with the same index and columns as the
        input and no missing values. Columns that contain no observed value at
        all cannot be imputed from neighbours and are filled with 0.0 (the value
        scikit-learn documents for ``keep_empty_features=True``).
    """
    # KNNImputer drops columns that are entirely NaN, which would make the
    # result narrower than the input and break the column labels. Impute only
    # the columns with at least one observed value and write them back in place.
    has_data = pro_pep_df_features_only.notna().any(axis=0).to_numpy()
    imputed_values = np.zeros(pro_pep_df_features_only.shape, dtype=float)

    # Nothing to fit on an empty frame or when every column is empty.
    if pro_pep_df_features_only.shape[0] > 0 and has_data.any():
        imputer = KNNImputer(n_neighbors=n_neighbors) # Create instance for Imputer
        imputed_values[:, has_data] = imputer.fit_transform(pro_pep_df_features_only.loc[:, has_data])

    # Keep the caller's index so the result can be aligned with other frames.
    pro_pep_df_imputed = pd.DataFrame(
        imputed_values,
        index=pro_pep_df_features_only.index,
        columns=pro_pep_df_features_only.columns,
    )

    return pro_pep_df_imputed


# Change feature reduction technique if necessary
def feature_reduction_pca(features_df, target_variance=0.95):
    """
    Project the features onto their leading principal components.

    Parameters:
        - features_df (DataFrame): DataFrame containing the features to be transformed.
        - target_variance (float): Target cumulative variance to be explained by the selected components.

    Returns:
        - selected_features_df (DataFrame): Columns 'PC1'..'PCk', where k is the smallest number of
          components whose cumulative explained-variance ratio reaches target_variance (all
          components if the target is never reached). Rows keep the index of features_df.
    """
    # Fit once with every component so the full explained-variance profile is available
    pca = PCA()
    pca.fit(features_df)

    # Cumulative variance explained by the first 1, 2, ... components
    cum_variance = np.cumsum(pca.explained_variance_ratio_)

    # cum_variance is non-decreasing, so searchsorted returns the first position whose
    # cumulative variance is >= the target. Clamp to the number of components: when the
    # target is never reached, keep everything (argmax over an all-False mask would
    # silently return 0 and select a single component).
    n_components = int(min(np.searchsorted(cum_variance, target_variance) + 1, len(cum_variance)))

    print(f'Number of selected PCA Components :{n_components}')

    # Components are ordered by explained variance, so the leading columns of the full
    # projection are the reduced representation; no second decomposition is needed.
    # NOTE: refitting with a small n_components can make scikit-learn choose its
    # randomized solver, which is not seeded here, so reusing this fit also keeps the
    # output reproducible.
    transformed_features = pca.transform(features_df)[:, :n_components]

    # Keep the caller's row index (plain arrays have none, so fall back to the default)
    selected_features_df = pd.DataFrame(
        transformed_features,
        columns=[f'PC{i+1}' for i in range(n_components)],
        index=getattr(features_df, 'index', None),
    )

    return selected_features_df

In [None]:
# Prepare train dataset: keep only the columns that exist in both the train and test files

# Walk the train columns in their original order instead of intersecting two sets:
# iteration order of a set of strings is not stable between runs, which made the
# column order of the filtered frames non-reproducible.
test_protein_columns = set(test_proteins.columns)
test_peptide_columns = set(test_peptides.columns)
protein_col = [col for col in train_proteins.columns if col in test_protein_columns]
peptide_col = [col for col in train_peptides.columns if col in test_peptide_columns]

train_proteins = train_proteins[protein_col]
train_peptides = train_peptides[peptide_col]

test_proteins = test_proteins[protein_col]
test_peptides = test_peptides[peptide_col]

In [None]:
# Build the training features/targets.
# `features` is passed in as selected_features and then rebound to the third return value.
features_df, target_df, features = prepare_dataset(
    train_proteins,
    train_peptides,
    train_clinical,
    feature_reduction_method='PCA',
    dataset='train',
    selected_features=features,
)
features_df.shape

Features After Correlation:  ['O00391', 'O00533', 'O00584', 'O14498', 'O14773', 'O14791', 'O15240', 'O15394', 'O43505', 'O60888', 'O75144', 'O75326', 'O94919', 'P00441', 'P00450', 'P00734', 'P00736', 'P00738', 'P00746', 'P00747', 'P00748', 'P00751', 'P01008', 'P01009', 'P01011', 'P01019', 'P01023', 'P01024', 'P01031', 'P01033', 'P01034', 'P01042', 'P01344', 'P01591', 'P01594', 'P01608', 'P01621', 'P01717', 'P01780', 'P01833', 'P01834', 'P01857', 'P01859', 'P01860', 'P01861', 'P01876', 'P01877', 'P02452', 'P02647', 'P02649', 'P02652', 'P02655', 'P02656', 'P02671', 'P02675', 'P02679', 'P02747', 'P02748', 'P02749', 'P02750', 'P02751', 'P02753', 'P02760', 'P02763', 'P02765', 'P02766', 'P02768', 'P02774', 'P02787', 'P02790', 'P02792', 'P04004', 'P04075', 'P04156', 'P04180', 'P04196', 'P04207', 'P04211', 'P04216', 'P04217', 'P04275', 'P04406', 'P04433', 'P05060', 'P05067', 'P05090', 'P05155', 'P05156', 'P05408', 'P05452', 'P05546', 'P06310', 'P06396', 'P06454', 'P06681', 'P06727', 'P07195', 

(4272, 10)

# Modelling

In [None]:

def smape(y_true, y_pred):
    """
    Calculate the Symmetric Mean Absolute Percentage Error (SMAPE).

    Each term is |y_pred - y_true| / ((|y_true| + |y_pred|) / 2). A pair in which
    both the true and the predicted value are 0 is a perfect prediction and
    contributes 0 to the mean instead of the undefined 0/0.

    Parameters:
    - y_true: numpy array or list of true values
    - y_pred: numpy array or list of predicted values

    Returns:
    - smape_score: SMAPE score as a float value (percentage, 0 to 200)
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0

    # Divide only where the denominator is non-zero; the remaining slots keep the
    # 0.0 they were initialised with. A plain division yields NaN for 0/0 and a
    # single NaN term would turn the whole score into NaN.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    smape_score = np.mean(ratio) * 100.0
    return smape_score

def train_and_evaluate_models(feature_df, target):
    """
    Train and evaluate different regression models using the SMAPE score.

    Rows whose target is missing are dropped first. The remaining rows are split
    80/20 (random_state=42); every candidate is fitted on the training part and
    scored with smape() on the held-out part.

    Parameters:
    - feature_df: pandas DataFrame containing the feature data
    - target: pandas Series containing the target values (row-aligned with feature_df by position)

    Returns:
    - best_model: the fitted estimator with the lowest SMAPE score on the held-out part
    """
    # scikit-learn regressors reject NaN in y, so filter those rows out up front.
    # The mask is positional, matching how train_test_split pairs the two inputs.
    has_target = ~pd.isna(np.asarray(target))
    if not has_target.all():
        print(f"Dropping {int((~has_target).sum())} rows with missing target values")
        feature_df = feature_df[has_target]
        target = target[has_target]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(feature_df, target, test_size=0.2, random_state=42)

    # Candidate models, keyed by the label used in the printed report
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'SVR': SVR(),
    }

    # Fit each candidate and score it on the held-out rows
    smape_scores = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        smape_scores[model_name] = smape(y_test, model.predict(X_test))

    # Lowest SMAPE wins; on a tie the first entry in `models` is kept
    best_model = min(smape_scores, key=smape_scores.get)

    print("SMAPE Scores:")
    for model_name, score in smape_scores.items():
        print(f"{model_name}: {score:.2f}")

    print("Best Model:", best_model)

    # Return the best trained model
    return models[best_model]

def find_best_models(feature_df, target_df):
    """
    Select the best regression model for every target column.

    Parameters:
    - feature_df: pandas DataFrame containing the feature data
    - target_df: pandas DataFrame with one column per target

    Returns:
    - trained_models: dict mapping each target column name (as str) to the model
      returned by train_and_evaluate_models for that column
    """
    # Train on the feature_df argument. The previous version read the notebook-level
    # `features_df` instead, silently ignoring whatever the caller passed in, and
    # returned nothing, so the trained models were lost after printing.
    trained_models = {
        str(name): train_and_evaluate_models(feature_df, target_df[name])
        for name in target_df.columns
    }
    return trained_models

In [None]:
# Fit and score every candidate model once per target column
find_best_models(feature_df=features_df, target_df=target_df)

SMAPE Scores:
Linear Regression: 75.71
Random Forest: 73.37
Gradient Boosting: 74.25
SVR: 72.92
Best Model: SVR
{'updrs_1': SVR()}
SMAPE Scores:
Linear Regression: 104.78
Random Forest: 105.35
Gradient Boosting: 105.98
SVR: 104.66
Best Model: SVR
{'updrs_1': SVR(), 'updrs_2': SVR()}


ValueError: ignored

## Strategies for feature reduction (exploratory; not part of the main code)

In [None]:
# Feature variance: even the smallest per-column variance is large, so a
# variance threshold cannot be used to remove features.

var_df = pro_pep_df.drop(columns='visit_id').var()
var_df.min()

  var_df = pro_pep_df.drop('visit_id', axis=1).var()


694.294585234452

In [None]:
# Percentage of missing values per column, most incomplete columns first
missing_values = pro_pep_df.isna().mean().mul(100)
print(missing_values.sort_values(ascending=False))

EPQVYTLPPSRDELTK                                       47.079964
ALVQQMEQLR                                             27.852650
AVGDKLPEC(UniMod_4)EADDGC(UniMod_4)PKPPEIAHGYVEHSVR    26.594789
HYEGSTVPEK                                             26.235400
DVQLVESGGGLVKPGGSLR                                    25.336927
                                                         ...    
FSGTWYAMAK                                              0.000000
FSVVYAK                                                 0.000000
FTNIGPDTMR                                              0.000000
FYNQVSTPLLR                                             0.000000
patient_id                                              0.000000
Length: 1154, dtype: float64


In [None]:
# pro_pep_df_month = pro_pep_df[pro_pep_df['visit_id'].str.split('_').str[1].astype(int).isin([0, 6, 12, 24])]

In [None]:

# Show the working frame; pandas elides the middle rows and columns in the rendered table
pro_pep_df

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK,patient_id,visit_month
0,10053_0,9104.27,402321.0,7126.96,24525.7,7150.57,2497.840,83002.9,15113.6,167327.0,...,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,50835.5,7207.30,10053,0
1,10053_12,10464.20,435586.0,7126.96,24525.7,7150.57,2435.275,197117.0,15099.1,164268.0,...,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80,10053,12
2,10053_18,13235.70,507386.0,7126.96,24525.7,7150.57,2372.710,126506.0,16289.6,168107.0,...,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70,10053,18
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.900,156313.0,54546.4,204013.0,...,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98,10138,12
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.150,151169.0,52338.1,240892.0,...,3521800.0,69984.6,496737.0,80919.3,111799.0,568923.5,56977.6,4903.09,10138,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.170,77493.6,46435.3,254247.0,...,8770410.0,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70,8699,24
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.770,86847.4,37741.3,212132.0,...,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,20700.30,942,12
1110,942_24,11218.70,352722.0,22834.90,23393.1,16693.50,1487.910,114772.0,36095.7,185836.0,...,374307.0,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60,942,24
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.430,82241.9,30146.6,167633.0,...,374307.0,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30,942,48


In [None]:
# Identifier columns are kept out of the imputation. Passing the whole frame to the
# imputer coerced visit_id strings such as '10053_0' to the float 100530.0 and let
# the identifiers take part in the neighbour search.
id_columns = ['visit_id', 'patient_id', 'visit_month']
df_details = pro_pep_df[id_columns]
feature_columns = [col for col in pro_pep_df.columns if col not in id_columns]

# Initialize KNNImputer with the desired number of neighbors
imputer = KNNImputer(n_neighbors=5)

# Perform KNN imputation on the feature columns only, keeping the original row index
imputed_features = pd.DataFrame(
    imputer.fit_transform(pro_pep_df[feature_columns]),
    columns=feature_columns,
    index=pro_pep_df.index,
)

# Re-attach the untouched identifiers and restore the original column order
pro_pep_df_imputed = pd.concat([df_details, imputed_features], axis=1)[pro_pep_df.columns]

pro_pep_df_imputed

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK,patient_id
0,100530.0,9104.27,402321.0,7126.96,24525.700000,7150.57,2497.840,83002.9,15113.6,167327.0,...,8663.170,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,50835.5,7207.30,10053.0
1,1005312.0,10464.20,435586.0,7126.96,24525.700000,7150.57,2435.275,197117.0,15099.1,164268.0,...,7657.062,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80,10053.0
2,1005318.0,13235.70,507386.0,7126.96,24525.700000,7150.57,2372.710,126506.0,16289.6,168107.0,...,8322.276,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70,10053.0
3,1013812.0,12600.20,494581.0,9165.06,27193.500000,22506.10,6015.900,156313.0,54546.4,204013.0,...,9433.710,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98,10138.0
4,1013824.0,12003.20,522138.0,4498.51,17189.800000,29112.40,2665.150,151169.0,52338.1,240892.0,...,6365.150,3521800.0,69984.6,496737.0,80919.3,111799.0,568923.5,56977.6,4903.09,10138.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,869924.0,9983.00,400290.0,24240.10,26127.814286,16943.50,6303.170,77493.6,46435.3,254247.0,...,8615.270,8770410.0,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70,8699.0
1109,94212.0,6757.32,360858.0,18367.60,14760.700000,18603.40,1722.770,86847.4,37741.3,212132.0,...,4767.630,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,20700.30,942.0
1110,94224.0,9867.94,352722.0,22834.90,23393.100000,16693.50,1487.910,114772.0,36095.7,185836.0,...,5554.530,374307.0,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60,942.0
1111,94248.0,11627.80,251820.0,22046.50,26360.500000,22440.20,2117.430,82241.9,30146.6,167633.0,...,6310.090,374307.0,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30,942.0


In [None]:
# Persist the feature names (all columns except the first and the last two) as JSON
# NOTE(review): the saved run of this cell ended in a TypeError; the cause is not visible here.
features_after_correlation = pro_pep_df.columns[1:-2].tolist()
with open(f'{PATH}features_after_correlation.json', mode='w') as out_file:
    json.dump(features_after_correlation, out_file)

TypeError: ignored

In [None]:
# Verify that KNN imputation left no gaps behind

# Count the remaining NaNs in every column
missing_values = pro_pep_df_imputed.isna().sum(axis=0)

# Print the per-column counts, smallest first
print(missing_values.sort_values(ascending=True))

visit_id                                                                 0
MASGAANVVGPK                                                             0
MADEAGSEADHEGTHSTKR                                                      0
M(UniMod_35)YLGYEYVTAIR                                                  0
M(UniMod_35)VQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR    0
                                                                        ..
DQPFTILYR                                                                0
DQGNQEQDPNISNGEEEEEKEPGEVGTHNDNQER                                       0
DPTFIPAPIQAK                                                             0
ARAEAQEAEDQQAR                                                           0
patient_id                                                               0
Length: 1154, dtype: int64


In [None]:
# Function for data preparation 



# Prep

In [None]:
# Re-read the raw test files and the sample submission.
# NOTE(review): this rebinds test_proteins/test_peptides, replacing the column-filtered
# versions created in the "Prepare train dataset" cell.
test_proteins = pd.read_csv(f'{PATH}test_proteins.csv')
test_peptides = pd.read_csv(f'{PATH}test_peptides.csv')
test_clinical = pd.read_csv(f'{PATH}test.csv')

submission = pd.read_csv(f'{PATH}sample_submission.csv')

In [None]:
# Preview the sample submission (columns: prediction_id, rating, group_key)
submission

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0
...,...,...,...
59,50423_6_updrs_3_plus_24_months,0,6
60,50423_6_updrs_4_plus_0_months,0,6
61,50423_6_updrs_4_plus_6_months,0,6
62,50423_6_updrs_4_plus_12_months,0,6
