In [1]:
# make sure to:
# 1) install `fastparquet` in your Python environment
# 2) use `pandas` version >= 1.0

import pandas as pd  
import numpy as np

from pathlib import Path

# Globals

In [2]:
# Directory that holds the parquet tables used below; point it at your local copy.
DATA_PATH = Path("../data")

In [3]:
# Show which files are present in the data directory. This is an IPython
# `ls` automagic line (not plain Python); `{DATA_PATH}` is interpolated from
# the Python variable of the same name before the command runs.
ls {DATA_PATH}

coordinates.parquet    heterogeneity_hq.parquet
heterogeneity.parquet  supercogs.parquet


# Load data

Below, we use the following naming convention:

- *cluster*: group of at least 2 proteins
- *singleton*: group of 1 protein
- *structure representative*: name of the protein whose structure is the most representative of its group (according to Foldseek)

There are three main pandas dataframes available in the `data` folder:

`coordinates.parquet` - describes features of structure representatives in the structure space
- `x`: first coordinate in the protein structure space (for normalized Geometricus representations) (`float32`)
- `y`: second coordinate in the protein structure space (for normalized Geometricus representations) (`float32`)
- `x_un`: first coordinate in the protein structure space (for unnormalized Geometricus representations) (`float32`)
- `y_un`: second coordinate in the protein structure space (for unnormalized Geometricus representations) (`float32`)
- `origin`: database to which a protein belongs (`category`)
- `is_cluster`: whether the protein represents a cluster (`True`) or is a singleton (`False`) (`bool`)
- `length`: protein length (`int16`)
- `afdb_pLDDT`: AFDB protein pLDDT (`float32`)

`supercogs.parquet` - contains functional annotations of all structures in the final database
- `cluster_or_singleton`: name of cluster or singleton to which a protein belongs (`category`)
- `afdb_hq`: whether an AFDB protein is of high quality (pLDDT > 70) (`boolean`)
- `superCOG_v10`: deepFRI v1.0 superCOG prediction (`category`)
- `superCOG_v11`: deepFRI v1.1 superCOG prediction (`category`)

`heterogeneity.parquet` - contains the number of structures from each database in a given cluster / singleton
- `counts_afdb_light`: number of structures from AFDB light clusters (`int16`)
- `counts_afdb_dark`: number of structures from AFDB dark clusters (`int16`)
- `counts_esmatlas`: number of structures from ESMAtlas (`int16`)
- `counts_mip_clusters`: number of structures from MIP clusters (`int16`)
- `counts_mip_singletons`: number of structures from MIP singletons (`int16`)

`heterogeneity_hq.parquet` - the same as `heterogeneity.parquet`, but counting only high-quality AFDB structures

In [4]:
# Read each parquet table from DATA_PATH into its own DataFrame.
coordinates = pd.read_parquet(DATA_PATH.joinpath('coordinates.parquet'))
supercogs = pd.read_parquet(DATA_PATH.joinpath('supercogs.parquet'))
heterogeneity = pd.read_parquet(DATA_PATH.joinpath('heterogeneity.parquet'))
heterogeneity_hq = pd.read_parquet(DATA_PATH.joinpath('heterogeneity_hq.parquet'))

# Explore data

## Basic statistics

#### Number of representative structures (clusters + singletons)

In [5]:
# The coordinates and heterogeneity tables are expected to have the same
# number of rows; check that before reporting the count.
assert coordinates.shape[0] == heterogeneity.shape[0]
coordinates.shape[0]

1505141

#### Number of representative structures (clusters + singletons) including only high quality AFDB models

In [6]:
# Row count of the heterogeneity table restricted to high-quality AFDB models.
heterogeneity_hq.shape[0]

1070401

#### Number of all structures in the final clustered database

In [7]:
# Total number of structures: the per-column sums of `heterogeneity`, summed
# again across columns. The total is computed once and reused for both the
# consistency check and the printed output.
n_structures = heterogeneity.sum().sum()

# The total must equal the number of rows in `supercogs`.
assert n_structures == len(supercogs), (
    f"heterogeneity counts add up to {n_structures}, "
    f"but supercogs has {len(supercogs)} rows"
)
print(n_structures)

4035121


#### Number of all structures in the final clustered database including only high quality AFDB models

In [8]:
# A row of `supercogs` is kept when `afdb_hq` is True or when `afdb_hq` is
# missing (presumably proteins that do not come from AFDB -- TODO confirm).
keep_mask = supercogs.afdb_hq | supercogs.afdb_hq.isna()

# Counting the True entries of the mask gives the number of kept rows without
# materialising a filtered copy of the whole `supercogs` frame.
n_kept = keep_mask.sum()

# Total over all count columns, computed once and reused for check and output.
n_structures_hq = heterogeneity_hq.sum().sum()

assert n_structures_hq == n_kept, (
    f"heterogeneity_hq counts add up to {n_structures_hq}, "
    f"but supercogs has {n_kept} high-quality or non-flagged rows"
)
print(n_structures_hq)

3060808
