In [1]:
import csv
import glob
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
from collections import defaultdict
from pyteomics import  fasta, parser, mass, achrom, electrochem, auxiliary, mzxml


In [2]:
def get_tsv_data(path):
    """Lazily yield every data row of a tab-separated file as (column name, value) pairs.

    The first row of the file is read as the header; each following row is
    zipped with it, so a row comes back as
    ``[(header[0], row[0]), (header[1], row[1]), ...]``. ``zip`` truncates to the
    shorter of header and row, so ragged rows silently lose the unmatched cells.

    Args:
        path: path of the TSV file to read.

    Yields:
        list[tuple[str, str]]: one list of (header, value) pairs per data row.
        Nothing is yielded when the file is empty or holds only a header.
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires (otherwise newlines inside quoted fields break).
    with open(path, newline="") as tsv:
        reader = csv.reader(tsv, dialect="excel-tab")
        # The header is consumed here but reused for every row below.
        # A default avoids StopIteration escaping the generator, which Python
        # would turn into a RuntimeError for an empty file.
        header = next(reader, None)
        if header is None:
            return
        for line in reader:
            yield list(zip(header, line))


            
def get_mzxml_filenames(dir_path):
    """Generate the path of every ``*.mzXML`` file found directly inside ``dir_path``.

    The paths are produced in whatever order ``glob.glob`` returns them.
    """
    pattern = f"{dir_path}/*.mzXML"
    yield from glob.glob(pattern)
    

In [3]:
# Peek at the data layout: show the first 50 (column, value) pairs of the
# first record only, one pair per line.
for data in get_tsv_data("./data/data.tsv"):
    print("\n".join(str(pair) for pair in data[:50]))
    break

('Peptidoform', '.SPLFM+15.995GK.')
('Peptidoform ID', 'SPLFM+15.995GK')
('Unmod peptidoform', '.SPLFMGK.')
('Total', '11679')
('Total- Unmodified sequence', '11681')
('Peptidoforms- Unmodified sequence', '2')
('Proteins', 'sp|P01009|A1AT_HUMAN;tr|A0A024R6I7|A0A024R6I7_HUMAN')
('Mass', '795.406')
('Charge', '2')
('Num Mods', '1')
('All Mods', ',16,')
('Is Decoy', 'False')
('Lorikeet input', 'SPLFM+15.995GK')
('Orig cluster FDR', '0.000380267')
('Pep Prefix', 'SP')
('Annotation', 'M+16,5[Oxidation]')
('Annotation without position', 'M+16[Oxidation]')
('Known', 'UNIMOD')
('Num mod frags', '9')
('PValue', '4.715')
('% Explained', '75.1')
('Rep cluster task', '4e3c96640e534ecdb7053896d6f56f67')
('Rep cluster user', 'batch')
('Rep cluster index', '205322')
('Num tasks', '55')
('Rep spectrum filename', 'MSV000080596/ccms_peak/RAW/20150708_QEp1_LC7_PhGe_SA_Plate1C_1_4.mzXML')
('Rep spectrum scan', '10244')
('Outlier groups', '.Patient-29.Timepoint-4.')
('Outlier group ratio', '0.17')
('Outlie

In [4]:
class Data:
    """Peptidoform-by-sample intensity table, usually loaded from a TSV file."""

    def __init__(self, patient, pepti, intensity):
        '''
        patient is a list of sample names (patient + timepoint), one per column of intensity
        pepti is a list of peptidoform ids, one per row of intensity
        intensity is a 2D numpy array of shape (len(pepti), len(patient)) that stores
        the intensity of every peptidoform in every sample
        '''
        self.patient = patient
        self.pepti = pepti
        self.intensity = intensity

    @staticmethod
    def intensity_string_to_int(intensity):
        """Convert one intensity cell to an int.

        "N/A" maps to 0; any other value is parsed as an integer after removing
        thousands separators (``"1,234"`` -> ``1234``).

        Raises:
            ValueError: if the cell is neither "N/A" nor an integer.
        """
        if intensity == "N/A":
            return 0
        return int(intensity.replace(',', ''))

    @classmethod
    def fromFilePath(cls, path, pepti_id_col = 1, start_col = 32):
        """Build a ``Data`` table from a tab-separated file.

        Args:
            path: path of the TSV file; its first row is the header.
            pepti_id_col: index of the column holding the peptidoform id.
            start_col: index of the first intensity column; the header cells from
                this column onwards are used as the sample names.

        Returns:
            Data: table with one row per non-blank data row of the file.

        Raises:
            ValueError: if the file has no header row, or a data row does not
                hold exactly one intensity per sample column.
        """
        # A single pass over the file: rows are collected as they are parsed, so
        # the table size always matches the number of parsed records (no separate
        # physical line count, and no file handle left open).
        with open(path, newline="") as tsv:
            reader = csv.reader(tsv, dialect="excel-tab")
            header = next(reader, None)
            if header is None:
                raise ValueError(f"{path} is empty: expected a header row")
            patient = header[start_col:]

            pepti = []
            rows = []
            for line in reader:
                if not line:
                    # csv yields [] for blank lines (e.g. a trailing newline).
                    continue
                values = [cls.intensity_string_to_int(cell) for cell in line[start_col:]]
                if len(values) != len(patient):
                    raise ValueError(
                        f"{path}, line {reader.line_num}: expected {len(patient)} "
                        f"intensity values but found {len(values)}"
                    )
                pepti.append(line[pepti_id_col])
                # Convert per row so only compact int arrays are kept in memory.
                rows.append(np.array(values, dtype=int))

        if rows:
            intensity_data = np.vstack(rows)
        else:
            intensity_data = np.empty((0, len(patient)), int)

        return cls(patient, pepti, intensity_data)

    def get_patient_from_index(self,p_index):
        """Return the intensity column (all peptidoforms) of the sample at ``p_index``."""
        return self.intensity[:,p_index]

    def get_patient(self, patient):
        """Return the intensity column of the sample named ``patient``.

        Raises:
            ValueError: if ``patient`` is not a known sample name.
        """
        p_index = self.patient.index(patient)
        return self.get_patient_from_index(p_index)

    def get_patient_list(self, patient_list):
        """Return a (len(pepti), len(patient_list)) array with one column per requested sample.

        NOTE: the result is a float array, unlike ``get_pepti_list`` which returns
        ints; the dtype is kept as is so existing callers are not affected.
        """
        re = np.zeros((len(self.pepti), len(patient_list)))
        for i, pat in enumerate(patient_list):
            re[:,i] = self.get_patient(pat)
        return re

    def get_pepti_from_index(self,p_index):
        """Return the intensity row (all samples) of the peptidoform at ``p_index``."""
        return self.intensity[p_index,:]

    def get_pepti(self, pepti):
        """Return the intensity row of the peptidoform with id ``pepti``.

        Raises:
            ValueError: if ``pepti`` is not a known peptidoform id.
        """
        p_index = self.pepti.index(pepti)
        return self.get_pepti_from_index(p_index)

    def get_pepti_list(self, pepti_list):
        """Return an int array of shape (len(pepti_list), len(patient)), one row per requested peptidoform."""
        re = np.zeros((len(pepti_list), len(self.patient)),int)
        for i, pep in enumerate(pepti_list):
            re[i,:] = self.get_pepti(pep)
        return re

In [5]:
# Load the full intensity table.
data = Data.fromFilePath("./data/data.tsv")

# Summarise what was loaded: the table shape, then for each axis the number
# of labels followed by the first ten of them.
print(data.intensity.shape)
print(len(data.patient), data.patient[:10], sep="\n")
print(len(data.pepti), data.pepti[:10], sep="\n")

(40921, 672)
672
['Patient_01.Timepoint_1', 'Patient_01.Timepoint_1_unmod', 'Patient_01.Timepoint_2', 'Patient_01.Timepoint_2_unmod', 'Patient_01.Timepoint_3', 'Patient_01.Timepoint_3_unmod', 'Patient_01.Timepoint_4', 'Patient_01.Timepoint_4_unmod', 'Patient_01.Timepoint_5', 'Patient_01.Timepoint_5_unmod']
40921
['SPLFM+15.995GK', 'EPQVYTLPPSREEM+15.995TK', 'AVM+15.995DDFAAFVEK', 'EFNAETFTFHADIC-33.988TLSEK', 'M+15.995ADEAGSEADHEGTHSTK', 'DVFLGM+15.995FLYEYAR', 'ETEGLRQEM+15.995SK', 'ALTDMPQM+15.995R', 'DTLM+15.995ISR', 'ALTDM+15.995PQM+15.995R']


In [6]:
# Smoke-test the accessors: single lookups first, then the list variants.
print(data.get_patient('Patient_01.Timepoint_1').shape[0])
print(data.get_pepti('SPLFM+15.995GK').shape[0])

temp_patient = data.get_patient_list([
    'Patient_01.Timepoint_1',
    'Patient_01.Timepoint_1_unmod',
    'Patient_01.Timepoint_2',
    'Patient_01.Timepoint_2_unmod',
    'Patient_01.Timepoint_3',
    'Patient_01.Timepoint_3_unmod',
    'Patient_01.Timepoint_4',
    'Patient_01.Timepoint_4_unmod',
    'Patient_01.Timepoint_5',
    'Patient_01.Timepoint_5_unmod',
])
print(temp_patient.shape)

temp_papti = data.get_pepti_list([
    'SPLFM+15.995GK',
    'EPQVYTLPPSREEM+15.995TK',
    'AVM+15.995DDFAAFVEK',
    'EFNAETFTFHADIC-33.988TLSEK',
    'M+15.995ADEAGSEADHEGTHSTK',
    'DVFLGM+15.995FLYEYAR',
    'ETEGLRQEM+15.995SK',
    'ALTDMPQM+15.995R',
    'DTLM+15.995ISR',
    'ALTDM+15.995PQM+15.995R',
])
print(temp_papti.shape)

40921
672
(40921, 10)
(10, 672)


In [7]:
class PatientDiffLoader(Dataset):
    """Dataset of sample pairs labelled by whether both come from the same patient.

    Item ``idx`` pairs the sample in column ``idx`` of the intensity table with a
    randomly drawn partner column: with probability 1/2 a column of the same
    patient (label 1.0), otherwise a column of a different patient (label 0.0).
    Partners are drawn only from columns that exist in the table, so no
    assumption is made about how many patients or timepoints the file holds.

    NOTE: still only lightly tested; items are random, so seed ``random`` for
    reproducible runs.
    """

    def __init__(self, path):
        """Load the intensity table at ``path`` and index its columns by patient."""
        self.data = Data.fromFilePath(path)
        self._build_patient_index()

    def __len__(self):
        """Number of samples, i.e. columns of the intensity table."""
        return len(self.data.patient)

    @staticmethod
    def _patient_key(column_name):
        """Return the patient part of a column name ('Patient_01.Timepoint_1_unmod' -> 'Patient_01')."""
        return column_name.split(".", 1)[0]

    def _build_patient_index(self):
        """Group the column indices of ``self.data.patient`` by patient key."""
        columns_by_patient = {}
        for column, name in enumerate(self.data.patient):
            columns_by_patient.setdefault(self._patient_key(name), []).append(column)
        self._columns_by_patient = columns_by_patient

    def __getitem__(self, idx, mode="same_patient"):
        """Return ``(sample, partner, label)`` for the sample in column ``idx``.

        Args:
            idx: column index of the first sample.
            mode: currently unused; kept for interface compatibility.

        Returns:
            tuple: the two intensity columns as returned by
            ``Data.get_patient_from_index`` and a float32 tensor of shape (1,)
            holding 1.0 for a same-patient pair and 0.0 otherwise. The partner
            of a same-patient pair may be the sample itself.

        Raises:
            ValueError: if a different-patient pair is drawn but the table
                contains a single patient only.
        """
        diff = bool(random.getrandbits(1))

        own_key = self._patient_key(self.data.patient[idx])
        data_1 = self.data.get_patient_from_index(idx)

        if diff:
            # different class: pick another patient, then one of its columns
            label = 0.0
            other_keys = [key for key in self._columns_by_patient if key != own_key]
            if not other_keys:
                raise ValueError(
                    "cannot draw a different-patient pair: the data holds a single patient"
                )
            partner_column = random.choice(self._columns_by_patient[random.choice(other_keys)])
        else:
            # same class: any column of the same patient
            label = 1.0
            partner_column = random.choice(self._columns_by_patient[own_key])

        data_2 = self.data.get_patient_from_index(partner_column)
        return data_1, data_2, torch.from_numpy(np.array([label], dtype=np.float32))

In [8]:
# Modified from https://www.kaggle.com/jiangstein/a-very-simple-siamese-network-in-pytorch; used for testing the data pipeline only.

class SiameseNetwork(nn.Module):
    """Minimal siamese classifier, meant only for smoke-testing the data pipeline.

    Both inputs go through the same embedding module ``cnn1`` (one linear
    layer, so the two branches share their weights). The two embeddings are
    concatenated and fed to three fully connected layers that output two logits.

    Args:
        input_dim: number of features of each input vector (default 40921, the
            number of peptidoforms in the data set used by this notebook).
        embedding_dim: size of the per-branch embedding.
        hidden_dim: width of the two hidden fully connected layers.
    """

    def __init__(self, input_dim=40921, embedding_dim=1000, hidden_dim=500):
        super().__init__()
        # The attribute keeps its historical name; it is a plain linear embedding.
        self.cnn1 = nn.Sequential(
            nn.Linear(input_dim, embedding_dim)
        )
        # fc1 sees BOTH embeddings side by side, hence twice the embedding size.
        self.fc1 = nn.Linear(2 * embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 2)

    def forward(self, input1, input2):
        """Return logits of shape (batch, 2) for a batch of input pairs.

        Args:
            input1: float tensor of shape (batch, input_dim).
            input2: float tensor of shape (batch, input_dim).
        """
        # Flatten each embedding to (batch, features) before concatenation.
        embedded_1 = self.cnn1(input1)
        embedded_1 = embedded_1.view(embedded_1.size()[0], -1)
        embedded_2 = self.cnn1(input2)
        embedded_2 = embedded_2.view(embedded_2.size()[0], -1)

        output = torch.cat((embedded_1, embedded_2), 1)
        output = F.relu(self.fc1(output))
        output = F.relu(self.fc2(output))
        output = self.fc3(output)
        return output

In [9]:
# Build the pair dataset from the TSV file and serve it in shuffled
# mini-batches of five pairs.
full_dataset = PatientDiffLoader("./data/data.tsv")
loader = DataLoader(dataset=full_dataset, batch_size=5, shuffle=True)

In [10]:
# Smoke-test training: one pass over the loader with plain SGD.
net = SiameseNetwork()
criterion = nn.CrossEntropyLoss() # use a Classification Cross-Entropy loss
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

loss_val = 0  # running sum of the batch losses
print("start")
for batch_id, (d1, d2, label) in enumerate(loader):
    optimizer.zero_grad()
    # The intensity table is integer valued, but nn.Linear needs floating
    # point input; call the module itself (not .forward) so hooks still run.
    output = net(d1.float(), d2.float())
    # CrossEntropyLoss takes class indices of shape (batch,) as int64, while
    # the loader delivers float labels of shape (batch, 1).
    target = label.view(-1).long()
    loss = criterion(output, target)
    loss_val += loss.item()
    loss.backward()
    optimizer.step()

    print(batch_id, loss_val, flush=True)