In [1]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy

# 1. Analyse the darkness content of UniRef50 and AlphaFold DB (v4)

### 1.1. Load data
First, run:

`python3 scripts/AFDBv4_pLDDT_analysis.py UniRef50`

This will generate the file `data_generated/AFDBv4_pLDDT_diggestion.csv`. The corresponding file used for the AFDB90v4 paper is `data_generated/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv`, which we load in the next cell.

In [3]:
# Read the precomputed pLDDT digest, order the clusters by their UniRef ID and
# use that ID as the index. The last row of the sorted table is discarded.
plddt_csv = 'data_generated/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv'
indata = (
    pd.read_csv(plddt_csv)
    .sort_values(by='unirefID')
    .set_index("unirefID")
    .iloc[:-1]
)
indata

In [None]:
# Assign each cluster to a 5-unit-wide interval of FULL_noDUF (edges 0, 5, ..., 100);
# include_lowest=True makes the first interval also contain the value 0.
brightness_edges = list(range(0, 105, 5))
indata['darkness_bins'] = pd.cut(
    indata['FULL_noDUF'].astype(float),
    bins=brightness_edges,
    include_lowest=True,
)

# Missing median_Evidence values are replaced by 0.
indata['median_Evidence'] = indata['median_Evidence'].fillna(0)
indata

To add DUF counts to the dataframe, run `python3 scripts/AFDBv4_DUF_analysis_dark.py UniRef50`, which will generate the file `data_generated/AFDBv4_DUF_dark_diggestion_UniRef50.csv`.

For the AFDB90v4 paper, the precomputed file is `data_generated/AFDBv4_DUF_dark_diggestion_UniRef50_2023-02-06.csv`.

In [None]:
# Load the DUF digest and prepare it the same way as the pLDDT table:
# sorted by UniRef ID, indexed by it, last row of the sorted table discarded.
duf_csv = 'data_generated/AFDBv4_DUF_dark_diggestion_UniRef50_2023-02-06.csv'
duf_dark_data = (
    pd.read_csv(duf_csv)
    .sort_values(by='unirefID')
    .set_index("unirefID")
    .iloc[:-1]
)

# Append the DUF columns next to the existing ones, aligning rows on the index.
indata = pd.concat([indata, duf_dark_data], axis=1)
indata

## 1.2. Make histogram at different pLDDT cutoffs

In [None]:
# Five-panel figure. Panels A-D: histogram of FULL_noDUF (functional brightness)
# in 5%-wide bins for each subset listed in `modes`. Panel E: percentage of the
# dark clusters (FULL_noDUF <= 5) of each subset that are flagged with Has_duf == 1.
modes = ['Full', 'AFDB', 'AFDB70', 'AFDB90']
panel = ['A', 'B', 'C', 'D', 'E']

fig, ax = plt.subplots(1, len(panel), figsize=(2.5*len(panel), 3))
percentage_dufs = []

for j, mode in enumerate(modes):
    # Subset selection: 'Full' keeps everything; any 'AFDB*' mode keeps clusters
    # with nAF2 > 0, and a numeric suffix adds a minimum on AF2_longest_best70_pLDDT.
    if mode == 'Full':
        tmp = indata
    if 'AFDB' in mode:
        tmp = indata.loc[indata.nAF2.astype(float) > 0]
        if len(mode.split('AFDB')[-1]) > 0:
            cut = int(mode.split('AFDB')[-1])
            tmp = tmp.loc[tmp.AF2_longest_best70_pLDDT.astype(float) >= cut]

    brightness = tmp.FULL_noDUF.astype(float)

    h,_ = np.histogram(brightness, bins=[i for i in range(0, 105, 5)])
    n_dark = h[0]
    h = h*100/sum(h)

    # First bar highlighted, last bar white, everything in between silver.
    colors = ['#57257F']
    for i in range(len(h)-2):
        colors.append('silver')
    colors.append('white')

    x = list(range(len(h)))
    y = list(h)

    ax[j].bar(x,y,1, align='edge', color=colors, edgecolor='k')
    ax[j].set_facecolor('#F2F2F2')
    # Bars are one unit wide, so tick k corresponds to a brightness of 5*k %.
    ax[j].set_xticks(range(0,21,5))
    ax[j].set_xticklabels(range(0,101,25))
    ax[j].set_ylabel('% of UniRef50 clusters')
    ax[j].set_xlabel('Functional Brightness (%)')

    ax[j].title.set_text('{} {}'.format(panel[j], mode))

    ax[j].set_ylim(0,100)

    percentage_dark = round(h[0])
    ax[j].text(-0.1, percentage_dark+1, '{}%'.format(percentage_dark),
               verticalalignment='bottom', horizontalalignment='left',
               color='#57257F', fontsize=9)

    # Dark clusters of this subset, selected once and reused below.
    # NOTE(review): n_dark (= first histogram bin) covers [0, 5) because
    # np.histogram bins are half-open, whereas this mask also includes
    # FULL_noDUF == 5 -- confirm which definition of "dark" is intended.
    dark = tmp.loc[brightness <= 5]

    # skipna=False keeps the NaN propagation of the builtin sum() used before.
    uniprot_n_dark = dark.nACCs.astype(float).sum(skipna=False)
    uniprot_total = tmp.nACCs.astype(float).sum(skipna=False)
    print(mode, 'n =', len(tmp), 'n_dark =', n_dark, 'uniprot_n_dark =', uniprot_n_dark, '% uniprot =', uniprot_n_dark*100/uniprot_total)

    uniref_n_dark = dark.nUniRef100.astype(float).sum(skipna=False)
    uniref_total = tmp.nUniRef100.astype(float).sum(skipna=False)
    print(mode, 'n =', len(tmp), 'n_dark =', n_dark, 'uniref100_n_dark =', uniref_n_dark, '% uniref100 =', uniref_n_dark*100/uniref_total)

    # Numerator and denominator are both taken from the dark clusters, so the
    # value really is "% of dark clusters with a DUF" as the panel is labelled.
    percentage_duf = len(dark.loc[dark.Has_duf == 1])*100/len(dark)
    print('% UniRef50 dark with dufs =', percentage_duf)
    print()

    percentage_dufs.append(percentage_duf)

# The DUF summary goes in the panel right after the per-mode histograms;
# address it explicitly instead of through the leaked loop counter.
duf_panel = len(modes)
ax[duf_panel].bar(panel[:-1],percentage_dufs,1, align='center', color=['#57257F' for i in modes], edgecolor='k')
ax[duf_panel].set_facecolor('#F2F2F2')
ax[duf_panel].set_ylabel('% of dark clusters with DUF')
ax[duf_panel].set_xlabel('Set')
ax[duf_panel].title.set_text('({}) DUF content'.format(panel[duf_panel]))
ax[duf_panel].set_ylim(0,0.2)

plt.tight_layout()
# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/AFDBv4_uniref50_histogram_dark_content.pdf')
plt.savefig('plots/AFDBv4_uniref50_histogram_dark_content.png', dpi=2000)

In [None]:
# Pearson correlation between functional brightness and cluster size, computed
# over every cluster in `indata` (hence the 'Full' label). pearsonr returns the
# correlation coefficient first and the p-value second; both are reported.
brightness_size_r, brightness_size_p = scipy.stats.pearsonr(
    indata['FULL_noDUF'].astype(float),
    indata['nUniRef100'].astype(float),
)
print('brightness vs size Correlation Full: r =', brightness_size_r, 'p-value =', brightness_size_p)

In [None]:
# Mean, standard deviation and median of nUniRef100 per brightness bin.
# String aggregator names select pandas' own groupby reductions, which is what
# the numpy callables were silently mapped to (that mapping is deprecated).
# observed=False keeps every bin of the categorical key in the output, empty or not.
indata.groupby(['darkness_bins'], observed=False)['nUniRef100'].agg(['mean', 'std', 'median'])

# 2. Define AFDB90 set and collect all associated sequences from previously constructed mongoDB

In [None]:
# AFDB90: clusters whose AF2_longest_best70_pLDDT value is at least 90.
# The selection is also written to disk as CSV.
plddt_at_least_90 = indata.AF2_longest_best70_pLDDT.astype(float) >= 90
AFDB90 = indata.loc[plddt_at_least_90]
AFDB90.to_csv('data_generated/AFDB90v4_data.csv')

In [None]:
# Connection setup for the sequence database used in the next cell.
# dbuilder_path is a placeholder and must be set by the user before running;
# presumably it is the directory that contains `extract_uniprot` -- TODO confirm.
dbuilder_path = None # set this before running the cell

import sys
import os
# Must happen before the import below so that modules in dbuilder_path can be found.
sys.path.append(dbuilder_path)

import extract_uniprot     as uniprot

# Host and port passed to the extractor as mongo_host / mongo_port.
# NOTE(review): hard-coded address -- adjust to your own deployment.
MONGO_HOST = "10.1.0.202"
MONGO_PORT = 30077

# Handle whose `.col.find(...)` is queried in the next cell.
uniprot_db   = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)

In [None]:
# Write one FASTA record per database document found for the AFDB90 clusters.
outfasta = 'data_generated/AFDBv4_90.fasta'

count = 0
step = 100000

# The lookup key is the second '_'-separated field of each index label.
target_ids = [label.split('_')[1] for label in AFDB90.index]
n_entries = len(target_ids)

# Query in batches of `step` IDs; a slice that runs past the end of the list
# simply stops at the last element, so the final batch needs no special case.
chuncks = [target_ids[start:start+step] for start in range(0, n_entries, step)]
collected_ids = []

print('Getting sequences for {} chuncks'.format(len(chuncks)))

with open(outfasta, 'w') as out:
    for i, chunck in enumerate(chuncks):
        # Header line is the document's _id, followed by its data.SEQ field.
        for doc in uniprot_db.col.find({'_id': {'$in': chunck}}):
            out.write('>{}\n{}\n'.format(doc['_id'], doc['data']['SEQ']))

    