In [None]:
import requests
import json
import time
from tqdm import tqdm
from balance_date import BalanceDate

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

import utils

# Protein-Sol query functions

In [None]:
def submit(gi, sequence, change=None, timeout=60):
    """Submit one sequence to the Protein-Sol web form and return its job id.

    Parameters
    ----------
    gi : str
        Identifier written into the FASTA-style header (``>gi<gi>``).
    sequence : str
        Amino-acid sequence. Every ``X`` is replaced by ``A`` before any
        other processing.
    change : str, optional
        Single substitution such as ``"M1A"``: the first character is the
        residue expected in the sequence, the last character is the
        replacement, and everything in between is the 1-based position.
        The expected residue is compared against the sequence *after* the
        ``X`` -> ``A`` replacement.
    timeout : float, optional
        Seconds handed to ``requests.post`` so a stalled server cannot hang
        the notebook forever.

    Returns
    -------
    str
        The job id parsed from the response page.

    Raises
    ------
    RuntimeError
        If the position in ``change`` lies outside the sequence, the residue
        at that position differs from the expected one, the server answers
        with a status other than 200, or no job id is found in the page.
    ValueError
        If the position part of ``change`` is not an integer.
    """
    url = "https://protein-sol.manchester.ac.uk/cgi-bin/solubility/sequenceprediction.php"
    # NOTE(review): presumably the server does not accept the unknown-residue
    # code X, hence the substitution by alanine -- confirm.
    sequence = sequence.replace("X", "A")
    if change is not None:
        fr, position, t = change[0], change[1:-1], change[-1]
        index = int(position) - 1
        # Without this check a position <= 0 becomes a negative index, which
        # Python resolves from the end of the string and silently yields a
        # corrupted sequence.
        if not 0 <= index < len(sequence):
            raise RuntimeError(
                "position %s of %r is outside the sequence (length %d)"
                % (position, change, len(sequence))
            )
        if sequence[index] != fr:
            raise RuntimeError("don't match")
        sequence = sequence[:index] + t + sequence[index + 1:]
    payload = (
        ("sequence-input", ">gi{}\n{}".format(gi, sequence)),
        # Value posted for the form's submit button (the literal is Chinese
        # for "submit"); it is sent to the server verbatim.
        ("singleprediction", "提交"),
    )
    response = requests.post(url, data=payload, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError("error %s" % response.status_code)
    # Raw string keeps \w a regex escape instead of an invalid str escape.
    match = re.search(r"<p>Job id.*?=.*?(\w+)</p>", response.text, flags=re.I)
    if match is None:
        raise RuntimeError("job id not found in server response")
    job_id = match.group(1)
    print("job id：", job_id)
    return job_id

In [None]:
def get_protein_sol_result(job_id, timeout=60):
    """Download the prediction file of a Protein-Sol job and extract one value.

    The function requests ``seq_prediction.txt`` for ``job_id``, looks for the
    line starting with ``SEQUENCE PREDICTIONS,`` and converts the third
    comma-separated field after that marker to ``float``.

    Parameters
    ----------
    job_id : str
        Job id as returned by ``submit``; sent as both ``timestamp`` and
        ``idname`` query parameters.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled server cannot hang
        the notebook forever.

    Returns
    -------
    float
        The parsed value, which the notebook treats as the predicted
        solubility.

    Raises
    ------
    RuntimeError
        If the server answers with a status other than 200, the marker line
        is missing, or the expected field is absent or not numeric.
    """
    url = "https://protein-sol.manchester.ac.uk/cgi-bin/utilities/download_file.php"
    payload = (
        ("app", "solubility"),
        ("dirname", "run"),
        ("timestamp", job_id),
        ("idname", job_id),
        ("file", "seq_prediction.txt"),
    )
    response = requests.get(url, params=payload, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError("error %s" % response.status_code)

    # NOTE(review): the marker may be missing if the job has not finished
    # yet -- confirm whether callers need to wait or retry.
    match = re.search("SEQUENCE PREDICTIONS,(.*)", response.text)
    if match is None:
        raise RuntimeError(
            "no 'SEQUENCE PREDICTIONS' line in the result of job %s" % job_id
        )
    fields = match.group(1).split(",")
    try:
        solubility = float(fields[2])
    except (IndexError, ValueError) as exc:
        raise RuntimeError(
            "cannot parse solubility from %r" % match.group(1)
        ) from exc
    print("solubility：", solubility)
    return solubility

In [None]:
def get_solubility(gi, sequence, change=None):
    """Submit a sequence (optionally mutated) and return its fetched result.

    Chains ``submit`` and ``get_protein_sol_result``: the job id produced by
    the former is passed straight to the latter, and that function's return
    value is handed back to the caller. ``gi``, ``sequence`` and ``change``
    are forwarded to ``submit`` unchanged.
    """
    return get_protein_sol_result(submit(gi, sequence, change))

In [None]:
# Manual check of the query helpers: one identifier, one substitution written
# as <expected residue><1-based position><replacement>, and the sequence the
# substitution is applied to. Running this cell performs live HTTP requests.
gi = "P00547"
change = "M1A"
sequence = "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN"
# Last expression of the cell, so the returned value is displayed.
get_solubility(gi, sequence, change)

# read data

In [None]:
# Output directory named after the date on which this cell runs: ./out/YYYYMMDD
OUT_PATH = os.path.join(".", "out", time.strftime("%Y%m%d"))
# exist_ok=True makes the call safe to repeat and avoids the gap between a
# separate existence check and the creation of the directory.
os.makedirs(OUT_PATH, exist_ok=True)
print("out path：", OUT_PATH)

In [None]:
# Notebook settings live in a JSON file in the working directory.
config_path = "./config.json"
with open(config_path) as f:
    # Parse straight from the file handle.
    config = json.load(f)
# Last expression of the cell, so the parsed settings are displayed.
config

In [None]:
# Load the gene table from the CSV path stored under config["genes_seq"].
# Later cells merge on its "gi" column and read its "seq" column.
genes = pd.read_csv(config["genes_seq"])
# Last expression of the cell, so the frame is displayed.
genes

In [None]:
# Fetch both test sets, then attach each row's sequence from `genes` via the
# shared "gi" key and keep only the columns used below.
X_test1, y_test1, X_test2, y_test2 = BalanceDate.get_test()
X_test1, X_test2 = (
    pd.merge(features, genes, left_on="gi", right_on="gi").loc[
        :,
        ["index", "gi", "solubility", "variation", "mut_residue", "mut_from", "mut_to", "seq"],
    ]
    for features in (X_test1, X_test2)
)

# Predict solubility with Protein-Sol

In [None]:
# Unmodified sequences: one get_solubility call (submit + fetch) per gene row.
genes["protein-sol"] = genes.apply(
    lambda row: get_solubility(row.gi, row.seq),
    axis=1,
)

# Test sets: the same query, with each row's `variation` applied as the change.
X_test1["protein-sol"] = X_test1.apply(
    lambda row: get_solubility(row.gi, row.seq, row.variation),
    axis=1,
)
X_test2["protein-sol"] = X_test2.apply(
    lambda row: get_solubility(row.gi, row.seq, row.variation),
    axis=1,
)

In [None]:
# Write each test set, including its "protein-sol" column, to an Excel file
# inside the dated output directory.
_test1_path = os.path.join(OUT_PATH, "protein_sol_test1_result.xlsx")
_test2_path = os.path.join(OUT_PATH, "protein_sol_test2_result.xlsx")

X_test1.to_excel(_test1_path)
X_test2.to_excel(_test2_path)

# One print call; sep="\n" puts the label and each path on its own line.
print("out path：", _test1_path, _test2_path, sep="\n")