In [90]:
import datetime
import zipfile
import urllib.request
import shutil
import csv
import json

import pickle

In [91]:
import gzip
import shutil
# Decompress the gzipped dataset into a plain file in the working directory.
file_name = 'Kinase_Substrate_Dataset.gz'
with gzip.open(file_name, 'rb') as compressed, open('Kinase_Substrate_Dataset', 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

# Keep the first line of the extracted file (stripped, decoded as UTF-8)
# under the name `date_accessed`.
with open('Kinase_Substrate_Dataset', 'rb') as dataset:
    first_line = dataset.readline()
date_accessed = first_line.strip().decode('utf-8')

In [92]:
import pandas as pd

In [93]:
# Read the extracted tab-separated table (skipping its first 3 lines), then keep
# only the rows where both the substrate organism and the kinase organism are human.
df = pd.read_csv('Kinase_Substrate_Dataset', sep='\t', skiprows=3)
is_human_pair = (df['SUB_ORGANISM'] == 'human') & (df['KIN_ORGANISM'] == 'human')
df = df[is_human_pair]

In [94]:
# Load the HGNC results JSON. The context manager closes the file handle as soon
# as parsing finishes (a bare `json.load(open(...))` leaves it open).
with open('../HGNC/results.json') as hgnc_file:
    hgnc = json.load(hgnc_file)
# Lookup consulted by map_genes: keyed by gene symbol, each value indexed with [0].
k_alias_v_approvedSymbol = hgnc['data']
# Its 'date_accessed' entry is copied into the access log further down.
hgnc_readme = hgnc['access_log']

In [95]:

def map_genes(genes, alias_map=None):
    """Map gene symbols to their approved symbols and record what changed.

    Args:
        genes: Iterable of gene symbols to map.
        alias_map: Optional mapping from a symbol to a sequence whose first
            element is the approved symbol. When omitted, the notebook-level
            ``k_alias_v_approvedSymbol`` lookup is used.

    Returns:
        A ``(result, change_tracking)`` tuple. ``result`` has one entry per
        input gene, in input order: the approved symbol when the gene is in
        the lookup, otherwise the input gene unchanged. ``change_tracking``
        is a dict with the keys ``len_changed``, ``len_not_found``,
        ``not_found`` (list of unmatched inputs) and ``changed``
        (input symbol -> approved symbol, only where they differ).
    """
    if alias_map is None:
        alias_map = k_alias_v_approvedSymbol

    not_found = []
    result = []
    changed = {}
    for gene in genes:
        if gene not in alias_map:
            # Unknown symbols pass through untouched so the output stays
            # aligned one-to-one with the input.
            not_found.append(gene)
            result.append(gene)
            continue
        approved_symbol = alias_map[gene][0]
        # Emit the approved symbol; appending the input gene here would make
        # the "mapped" output identical to the input.
        result.append(approved_symbol)
        if approved_symbol != gene:
            changed[gene] = approved_symbol
    change_tracking = {
        'len_changed': len(changed),
        'len_not_found': len(not_found),
        'not_found': not_found,
        'changed': changed,
    }
    return result, change_tracking


In [96]:
# Pull the kinase and substrate gene columns out as plain lists for mapping.
kinase_genes = list(df['GENE'])
substrate_genes = list(df['SUB_GENE'])

kinase_genes_mapped, kinase_log = map_genes(kinase_genes)
substrate_genes_mapped, substrate_log = map_genes(substrate_genes)

# The mapped lists are written back positionally, so they must line up
# one-to-one with the original columns.
if len(kinase_genes) != len(kinase_genes_mapped):
    raise RuntimeError('The mapped kinase genes do not match original length')
if len(substrate_genes) != len(substrate_genes_mapped):
    raise RuntimeError('The mapped substrate genes do not match original length')

# Per-role change logs, later written out as the changelog table.
pathway_change_tracking = {
    'kinase': kinase_log,
    'substrate': substrate_log,
}

# Store the lists returned by map_genes; re-assigning the original
# `kinase_genes` / `substrate_genes` lists would leave the columns untouched.
df['GENE'] = kinase_genes_mapped
df['SUB_GENE'] = substrate_genes_mapped


In [97]:
# Label used both in the log entry and in every output path under ../gene_mapped/.
db = 'PhosphositePlus'
# Take a single timestamp so the two "now"-based entries always agree.
run_timestamp = datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
readme_logging = {
    'db': db,
    # NOTE(review): `date_accessed`, read from the first line of the dataset
    # earlier in the notebook, is not used here; confirm whether this entry
    # should come from it instead of the current time.
    'db_date_accessed': run_timestamp,
    'HGNC_date_accessed': hgnc_readme['date_accessed'],
    'Date_gene_mapped': run_timestamp,
}
# Build the path from `db`, like the other output paths, rather than repeating the literal.
with open(f'../gene_mapped/{db}/access_log.json', 'w') as fp:
    json.dump(readme_logging, fp)

In [98]:
# Build a table from the per-role change logs, transpose it so each role
# ('kinase', 'substrate') becomes a row, and save it as a tab-separated file.
changelog = pd.DataFrame.from_dict(pathway_change_tracking).T
changelog.to_csv(f'../gene_mapped/{db}/changelog.tsv', sep='\t')


In [99]:
# Save the table as a tab-separated file, leaving out the DataFrame index.
mapped_table_path = f'../gene_mapped/{db}/{db}.tsv'
df.to_csv(mapped_table_path, sep='\t', index=False)

In [100]:
# Echo the database label that the output paths are built from.
db

'PhosphositePlus'