In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (such as the project's `src` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from src import make_data
from pathlib import Path

In [3]:
# Folder to store all the data.
# Every path used in this notebook is built relative to it; the value below is
# a placeholder and has to be replaced with a real location before running.
DATA_FOLDER = Path("/path/to/data/folder")

Download AlphaFold (AF) database proteins as tar files and extract

In [None]:
# Step 1: fetch the AlphaFold archives, step 2: unpack them.
# The same folder is passed twice to extract_data — presumably as source and
# destination, i.e. the archives are unpacked next to themselves (confirm
# against make_data.extract_data).
make_data.download_data(DATA_FOLDER)
make_data.extract_data(DATA_FOLDER, DATA_FOLDER)

Download UniProt annotations for all AF proteins, split by species

In [None]:
# Folder that receives the UniProt annotation tables.
# It has to be "uniprot_go": that is the folder the later cells glob for
# "UP*_uniprot.txt" and use for "casp12_uniprot.txt". The former name
# "uniprot_files" meant the tables (presumably written into the folder passed
# to get_uniprot_info) never reached the place they were read from.
uniprot_folder = DATA_FOLDER / "uniprot_go"
# parents=True: works even if DATA_FOLDER has not been created yet.
# exist_ok=True: the cell can be re-run without a separate exists() check.
uniprot_folder.mkdir(parents=True, exist_ok=True)
make_data.get_uniprot_info(DATA_FOLDER, uniprot_folder)

Get average pLDDT scores, number of high confidence residues, and total number of residues for each AF protein

In [7]:
# Produces three lookups that the dataframe cell further down queries with
# keys of the form "AF-<UniProt entry>-F1-model_v1.pdb":
#   avg_scores              -> "Avg. score"
#   lengths_high_confidence -> "High confidence length"
#   lengths_full            -> "Length"
# This is a long-running step (see the progress output recorded below).
avg_scores, lengths_high_confidence, lengths_full = make_data.get_AF_protein_information(DATA_FOLDER)

3988it [01:33, 42.74it/s]
19694it [10:20, 31.73it/s]
12622it [09:22, 22.44it/s]
23391it [21:38, 18.01it/s]
27434it [16:18, 28.05it/s]
39299it [22:24, 29.22it/s]
4363it [02:29, 29.27it/s]
13458it [09:59, 22.45it/s]
1773it [00:51, 34.39it/s]
5187it [05:02, 17.14it/s]
19036it [13:44, 23.09it/s]
6040it [04:20, 23.15it/s]
5128it [03:46, 22.66it/s]
21272it [16:47, 21.10it/s]
7924it [07:17, 18.10it/s]
2888it [01:25, 33.86it/s]
55799it [32:34, 28.55it/s]
5974it [04:12, 23.64it/s]]
21615it [15:28, 23.29it/s]


In [8]:
# Folder that the dataframe cell below globs for the "UP*_uniprot.txt" tables.
uniprot_folder = DATA_FOLDER / "uniprot_go"

Combine all UniProt data and scores into a dataframe

In [9]:
import pandas as pnd

# Average score given to proteins that have no entry in avg_scores.
DEFAULT_AVG_SCORE = 40

# sorted() pins the row order of the combined table; glob() on its own yields
# the files in whatever order the filesystem lists them.
uniprot_tables = sorted(uniprot_folder.glob("UP*_uniprot.txt"))
if not uniprot_tables:
    # Without this guard pnd.concat([]) fails with an error that does not say
    # which folder turned out to be empty.
    raise FileNotFoundError(f"No 'UP*_uniprot.txt' files found in {uniprot_folder}")

AF_dataframe = pnd.concat([pnd.read_csv(filename, sep="\t") for filename in uniprot_tables])
# First comma-separated item of "Protein families" (superfamily).
AF_dataframe["Protein family"] = [str(val).split(",")[0] for val in AF_dataframe["Protein families"]]
# Cut the name at the first " (" and keep at most its first two words.
AF_dataframe["Organism"] = [" ".join(str(val).split(" (")[0].split(" ")[:2]) for val in AF_dataframe["Organism"]]
# File-name style key; the same format is used to look up the score/length dictionaries.
AF_dataframe["ID"] = [f"AF-{k}-F1-model_v1.pdb" for k in AF_dataframe["Entry"]]
# Proteins missing from a lookup fall back to DEFAULT_AVG_SCORE or a length of 0.
AF_dataframe["Avg. score"] = [avg_scores[key] if key in avg_scores else DEFAULT_AVG_SCORE for key in AF_dataframe["ID"]]
AF_dataframe["Length"] = [lengths_full[key] if key in lengths_full else 0 for key in AF_dataframe["ID"]]
AF_dataframe["High confidence length"] = [lengths_high_confidence[key] if key in lengths_high_confidence else 0 for key in AF_dataframe["ID"]]

# Drop the "yourlist..." helper columns before saving.
AF_dataframe = AF_dataframe[[c for c in AF_dataframe.columns if not c.startswith("yourlist")]]
# Saved before set_index, so "ID" is stored as an ordinary column of the file.
AF_dataframe.to_csv(DATA_FOLDER / "AF_dataframe.txt", sep="\t")
AF_dataframe = AF_dataframe.set_index("ID")

  if (await self.run_code(code, result,  async_=asy)):


Calculate shapemers for each AF protein

In [None]:
# Only DATA_FOLDER is passed and nothing is returned to the notebook, so
# results are presumably written to disk by make_data (confirm against
# make_data.get_AF_shapemers for the output location).
make_data.get_AF_shapemers(DATA_FOLDER)

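Manual step: download
`https://sharehost.hms.harvard.edu/sysbio/alquraishi/proteinnet/human_readable/casp12.tar.gz`
and extract it into `DATA_FOLDER / "casp12"`, so that `training_100`, `validation` and `testing` (used by the next cell) sit directly inside that folder.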

Calculate shapemers for all CASP12 proteins

In [None]:
# Same call for each CASP12 split, in the order training -> validation -> testing.
for casp12_split in ("training_100", "validation", "testing"):
    make_data.get_PDB_shapemers(DATA_FOLDER / "casp12" / casp12_split, DATA_FOLDER)

Get UniProt annotations for all PDB proteins

In [16]:
import itertools
from src import uniprot_parser
import pickle


def _read_corpus_keys(corpus_path):
    """Return the first tab-separated field of every line of one corpus file.

    The file is read inside a `with` block so its handle is closed right away;
    previously every corpus file was opened in a generator and never closed.
    """
    with open(corpus_path) as corpus_file:
        return [line.strip().split("\t")[0] for line in corpus_file]


# sorted() makes the order of the collected IDs independent of filesystem listing order.
corpus_files = sorted(DATA_FOLDER.glob("*_ids_corpus_res4_6*.txt"))
keys = itertools.chain.from_iterable(_read_corpus_keys(file) for file in corpus_files)
pdb_ids = []
for k in keys:
    # Blank lines produce an empty key, which is not an ID worth querying.
    if not k:
        continue
    # Keys ending in ".pdb" are skipped (the AlphaFold IDs built in this
    # notebook have that form).
    if k.endswith(".pdb"):
        continue
    if "#" in k:
        fields = k.split("#")
        label, entry = fields[0], fields[1]
        # NOTE(review): assumes ProteinNet-style keys, where CASP targets look
        # like "TBM#T0872" / "FM#T0869" and other entries like "30#1ABC_1_A" —
        # confirm. Only the label in front of "#" is tested: testing the whole
        # key also discarded entries whose PDB ID merely contains "FM" or "TBM"
        # (e.g. "30#1FMK_1_A").
        if "TBM" in label or "FM" in label:
            continue
        k = entry[:4]
    else:
        k = k[:4]
    pdb_ids.append(k)
uniprot_parser.get_uniprot_info_from_ids(pdb_ids,
                                         DATA_FOLDER / "uniprot_go" / "casp12_uniprot.txt",
                                         identifier="PDB_ID",
                                         columns=make_data.UNIPROT_COLUMNS,
                                         chunk=True)

1043it [16:36,  1.05it/s]


In [11]:
import pickle

In [None]:
# One path per CASP12 split, handed over together as a single list.
coords = make_data.get_PDB_protein_information(
    [DATA_FOLDER / "casp12" / split_name
     for split_name in ("training_100", "validation", "testing")]
)

In [12]:
# Persist the result of get_PDB_protein_information next to the other data files.
coords_path = DATA_FOLDER / "PDB_coords.pkl"
with coords_path.open("wb") as pickle_file:
    pickle.dump(coords, pickle_file)

In [17]:
# Load the UniProt annotations fetched for the CASP12 PDB IDs.
casp12_annotation_file = DATA_FOLDER / "uniprot_go" / "casp12_uniprot.txt"
PDB_dataframe = pnd.read_csv(casp12_annotation_file, sep="\t")

# The first column whose name starts with "yourlist" carries the queried IDs;
# copy it to "PDB_ID" (the original column is removed at the end of the cell).
yourlist_columns = [name for name in PDB_dataframe.columns if name.startswith("yourlist")]
mapping_column = yourlist_columns[0]
PDB_dataframe["PDB_ID"] = PDB_dataframe[mapping_column]

# First comma-separated item of "Protein families" (superfamily).
PDB_dataframe["Protein family"] = [
    str(families).split(",")[0] for families in PDB_dataframe["Protein families"]
]
# Cut the organism name at the first " (" and keep at most its first two words.
PDB_dataframe["Organism"] = [
    " ".join(str(organism).split(" (")[0].split(" ")[:2])
    for organism in PDB_dataframe["Organism"]
]

kept_columns = [name for name in PDB_dataframe.columns if name != mapping_column]
PDB_dataframe = PDB_dataframe[kept_columns]

Match AF proteins with previously determined PDB proteins

In [18]:
import numpy as np
from collections import defaultdict


def _split_pdb_ids(cross_reference):
    """Return the individual PDB IDs contained in one 'Cross-reference (PDB)' value.

    The value is a ';'-separated list (the previous code assumed a trailing
    ';', e.g. "1ABC;2XYZ;"). Empty pieces are dropped instead of blindly
    cutting off the last character, so a value without the trailing ';' keeps
    its final ID intact.
    """
    pieces = (piece.strip() for piece in str(cross_reference).split(";"))
    return [piece for piece in pieces if piece]


AF_PDB_cross_references = AF_dataframe['Cross-reference (PDB)'][AF_dataframe['Cross-reference (PDB)'].notna()]
AF_PDB_mapping = {key: AF_PDB_cross_references[key] for key in AF_PDB_cross_references.keys()}
PDB_AF_mapping = defaultdict(list)
for af_id, cross_reference in AF_PDB_mapping.items():
    # An ID that occurs once in the index gives a plain string; an ID that
    # occurs on several rows gives a Series with one string per row.
    if hasattr(cross_reference, "values"):
        entries = cross_reference.values
    else:
        entries = [cross_reference]
    for entry in entries:
        # Both cases are split on ";". The multi-row case used to register the
        # whole string (e.g. "1ABC;2XYZ") as one key, which never matches a PDB ID.
        for pdb_id in _split_pdb_ids(entry):
            # Repeated rows must not list the same AlphaFold ID more than once.
            if af_id not in PDB_AF_mapping[pdb_id]:
                PDB_AF_mapping[pdb_id].append(af_id)

# ";"-joined AlphaFold IDs for each PDB entry, NaN where no AlphaFold protein references it.
PDB_dataframe["AF"] = [";".join(PDB_AF_mapping[p]) if p in PDB_AF_mapping else np.nan for p in PDB_dataframe["PDB_ID"]]
PDB_dataframe.to_csv(DATA_FOLDER / "PDB_dataframe.txt", sep="\t")