In [1]:
% load_ext autoreload
% autoreload 2

In [2]:
import subprocess
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pnd
from tqdm.notebook import tqdm

In [3]:
data_folder = Path("data/cath_data")
data_folder.mkdir(exist_ok=True)

# Download CATH data

In [4]:
from ftplib import FTP

In [5]:
ftp = FTP("orengoftp.biochem.ucl.ac.uk")
ftp.login()

'230 Login successful.'

In [6]:
ftp.cwd("cath/releases/latest-release/")
ftp.nlst()

['cath-classification-data', 'non-redundant-data-sets', 'sequence-data']

In [7]:
ftp.cwd('non-redundant-data-sets')
ftp.nlst()

['cath-dataset-nonredundant-S20.atom.fa',
 'cath-dataset-nonredundant-S20.fa',
 'cath-dataset-nonredundant-S20.list',
 'cath-dataset-nonredundant-S20.pdb.tgz',
 'cath-dataset-nonredundant-S40.atom.fa',
 'cath-dataset-nonredundant-S40.fa',
 'cath-dataset-nonredundant-S40.list',
 'cath-dataset-nonredundant-S40.pdb.tgz']

In [14]:
filename = 'cath-dataset-nonredundant-S40.atom.fa'
with open(data_folder / filename, "wb") as f:
    ftp.retrbinary(f"RETR {filename}", f.write)

```sh
wget http://download.cathdb.info/cath/releases/all-releases/v4_2_0/sequence-data/funfam-hmm3-v4_2_0.lib.gz
hmmpress funfam-hmm3-v4_2_0.lib

git clone https://github.com/UCLOrengoGroup/cath-tools-genomescan.git
cath-tools-genomescan/apps/cath-genomescan.pl -i data/cath-dataset-nonredundant-S40.atom.fa -l funfam-hmm3-v4_2_0.lib -o data/cath_funfam_results/
```

# Get funfam clusters

In [5]:
def filter_full_domains(row, threshold=0.9):
    if "(" in row["query-id"]:
        return False
    boundaries = row["query-id"].split("/")[-1]
    length = 0
    if "_" in boundaries:
        return False
    start, end = map(int, boundaries.lstrip("-").split("-"))
    length += end - start
    if "," in row["boundaries"]:
        return False
    b_start, b_end = map(int, row["boundaries"].split("-"))
    return ((b_end - b_start) / length) > threshold

In [9]:
crh = pnd.read_csv("data/cath_funfam_results/cath-dataset-nonredundant-S40.atom.crh", sep=" ",
                   comment="#", names="query-id match-id score boundaries resolved cond-evalue indp-evalue".split())

In [10]:
crh = crh.groupby("query-id", as_index=False).agg({x: "min" if x == "indp-evalue" else "first" for x in crh.columns})
crh = crh[crh["indp-evalue"] < 10e-10]

In [11]:
crh.shape

(16382, 7)

In [12]:
crh = crh[crh.apply(filter_full_domains, axis=1)]
crh.shape

(12154, 7)

In [110]:
len(crh["match-id"]), len(set(crh["match-id"]))

(12154, 8333)

In [111]:
crh.head()

Unnamed: 0,query-id,match-id,score,boundaries,resolved,cond-evalue,indp-evalue
0,cath|current|12asA00/4-330,3.30.930.10/FF/31459,511.7,1-326,1-326,4.4e-158,4.4e-151
1,cath|current|132lA00/2-129,1.10.530.10/FF/12462,187.4,1-124,1-124,2.3000000000000002e-59,5.9e-53
2,cath|current|153lA00/1-185,1.10.530.10/FF/12422,321.3,2-185,2-185,1.1e-100,1.1e-93
4,cath|current|16pkA02/199-406,3.40.50.1260/FF/4342,273.9,1-208,1-208,1.3999999999999998e-85,4.7e-79
11,cath|current|1a02F00/140-192,1.20.5.170/FF/18272,76.3,1-53,1-53,9.5e-26,9.5e-19


In [42]:
with open(data_folder / "clusters.txt", "w") as f:
    def align_domains(row):
        if len(row["query-id"]) > 1:
            query_ids = [q.split("|")[-1].split("/")[0] for q in row["query-id"]]
            f.write(list(row["match-id"])[0] + ": " + ", ".join(query_ids) + "\n")


    crh.groupby("match-id").apply(align_domains);

In [4]:
funfam_clusters = {}
id_to_funfam_cluster = {}
superfamily_clusters = defaultdict(list)
id_to_superfamily_cluster = {}
with open(data_folder / "clusters.txt") as f:
    for line in tqdm(f):
        match_id, query_ids = line.strip().split(": ")
        query_ids = query_ids.split(", ")
        funfam_clusters[match_id] = query_ids
        superfamily_id = match_id.split("/FF")[0]
        superfamily_clusters[superfamily_id] += query_ids
        for qid in query_ids:
            id_to_funfam_cluster[qid] = match_id
            id_to_superfamily_cluster[qid] = superfamily_id

0it [00:00, ?it/s]

In [5]:
superfamily_ids = list(superfamily_clusters.keys())

In [6]:
len(funfam_clusters), len(superfamily_clusters)

(1567, 721)

In [7]:
def choose_negatives(query_id, num_negatives, num_difficult=1):
    negatives = []
    superfamily_id = id_to_superfamily_cluster[query_id]
    for n in range(min(num_difficult, len(superfamily_clusters[superfamily_id]) - 1)):
        nid = superfamily_clusters[superfamily_id][n]
        if id_to_funfam_cluster[nid] != id_to_funfam_cluster[query_id]:
            negatives.append(nid)
    num_left = num_negatives - len(negatives)
    indices = np.random.choice([i for i in range(len(superfamily_ids)) if superfamily_ids[i] != superfamily_id],
                               num_left)
    for n in range(num_left):
        ids = superfamily_clusters[superfamily_ids[indices[n]]]
        negatives.append(ids[np.random.randint(len(ids))])
    return negatives

In [8]:
pdb_folder = data_folder / "dompdb"
matrices_folder = data_folder / "rotation_matrices"

In [84]:
if not matrices_folder.exists():
    matrices_folder.mkdir()
with open(data_folder / "clusters.txt") as f:
    for line in tqdm(f):
        match_id, query_ids = line.strip().split(": ")
        query_ids = query_ids.split(", ")
        for q in query_ids:
            negatives = choose_negatives(q, 3)
            for n in negatives:
                fasta_file = matrices_folder / f"{q}_{n}.fasta"
                if fasta_file.exists():
                    continue
                with open(fasta_file, "w") as outfile:
                    subprocess.run(["USAlign",
                                    str(pdb_folder / q),
                                    str(pdb_folder / n),
                                    "-mm", "1", "-ter", "1",
                                    "-m", matrices_folder / f"{q}_{n}",
                                    "-outfmt", "1"],
                                   stdout=outfile)

        for q1 in range(len(query_ids) - 1):
            for q2 in range(q1 + 1, len(query_ids)):
                with open(matrices_folder / f"{query_ids[q1]}_{query_ids[q2]}.fasta", "w") as outfile:
                    subprocess.run(["USAlign",
                                    str(pdb_folder / query_ids[q1]),
                                    str(pdb_folder / query_ids[q2]),
                                    "-mm", "1", "-ter", "1",
                                    "-m", matrices_folder / f"{query_ids[q1]}_{query_ids[q2]}",
                                    "-outfmt", "1"],
                                   stdout=outfile)

0it [00:00, ?it/s]

