## Libraries

In [7]:
import matplotlib.pyplot as plt
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# Notebook-wide matplotlib defaults: bold 28 pt text, 24 pt tick labels,
# bold axis labels, 2 pt axes frame and 100 dpi figures.
plt.rc('font', weight='bold', size=28)
plt.rc('xtick', labelsize=24)
plt.rc('ytick', labelsize=24)
plt.rc('axes', labelweight='bold', linewidth=2.0)
plt.rc('figure', dpi=100.0)

# Today's date as MM-DD-YYYY; later cells put it in the names of saved figures.
xdate = format(datetime.datetime.now(), "%m-%d-%Y")

## Segregated by number of residues
Proteins are grouped into classes by length (number of residues).
Each plot shows the PLDDT distribution of every residue type for one length class.

### Minimum and maximum values

In [None]:
# Box plots of per-residue PLDDT: one figure per "*-res.boxplot*" file found in
# ../residue. According to the notes in this notebook the files are split by
# number of amino acids. Whiskers are the "min" and "max" values of each file.

pathway = Path()
residues = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
            'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']


def read_boxplot_columns(path, known_labels):
    """Collect the statistics listed in one ``*.boxplot*`` text file.

    Each line is stripped and then handled as follows:

    * a line equal to one of ``known_labels`` is recorded as a section header;
    * a line starting with ``median``, ``min``, ``max``, ``75 percentile`` or
      ``25 percentile`` contributes the number after its last ``:``;
    * every other line is ignored.

    Returns ``(headers, columns)``: the headers in file order, and a dict that
    maps the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and
    ``q1`` to the values found, also in file order.

    Raises ``ValueError`` if a matched line does not end in a number.
    """
    prefix_to_key = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    headers = []
    columns = {key: [] for _, key in prefix_to_key}
    with open(path, 'r') as handle:
        for raw_line in handle:
            item = raw_line.strip()
            if item in known_labels:
                headers.append(item)
                continue
            for prefix, key in prefix_to_key:
                if item.startswith(prefix):
                    columns[key].append(float(item.split(':')[-1]))
                    break
    return headers, columns


def boxes_from_columns(headers, columns, expected_labels, source):
    """Build the list of per-box stat dicts that ``Axes.bxp`` draws.

    ``columns`` must hold exactly one value per expected label for every
    statistic; otherwise ``ValueError`` is raised and names ``source``.
    Boxes keep the order of the file. Labels are the headers read from the
    file when there is one per box, so a box is never given the wrong name if
    the file lists its sections in another order than ``expected_labels``;
    without usable headers the labels fall back to ``expected_labels``.
    """
    expected = len(expected_labels)
    counts = {key: len(values) for key, values in columns.items()}
    if any(count != expected for count in counts.values()):
        raise ValueError(
            f"{source}: expected {expected} values per statistic, found {counts}"
        )
    labels = headers if len(headers) == expected else list(expected_labels)
    return [
        {
            'label': label,                    # name shown under the box
            'whislo': columns['whislo'][i],    # bottom whisker
            'q1': columns['q1'][i],            # 25th percentile
            'med': columns['med'][i],          # median
            'q3': columns['q3'][i],            # 75th percentile
            'whishi': columns['whishi'][i],    # top whisker
            'fliers': [],                      # no outliers are drawn
        }
        for i, label in enumerate(labels)
    ]


# Empty, typed table so that ``res_plddt`` exists even when no file matches.
res_plddt = pd.DataFrame(
    {'Residue': pd.Series(dtype='str'),
     'min': pd.Series(dtype='float'),
     '25th_Percentile': pd.Series(dtype='float'),
     'median': pd.Series(dtype='float'),
     '75th_Percentile': pd.Series(dtype='float'),
     'max': pd.Series(dtype='float')}
)

# sorted(): glob order is filesystem dependent; sorting keeps the figure order stable.
for file in sorted(pathway.glob("../residue/*-res.boxplot*")):
    print(file.name)
    headers, columns = read_boxplot_columns(file, residues)
    boxes_list = boxes_from_columns(headers, columns, residues, file.name)

    # The same numbers as a table (rows numbered from 1), kept for inspection.
    res_plddt = pd.DataFrame(
        {'Residue': [box['label'] for box in boxes_list],
         'min': [box['whislo'] for box in boxes_list],
         '25th_Percentile': [box['q1'] for box in boxes_list],
         'median': [box['med'] for box in boxes_list],
         '75th_Percentile': [box['q3'] for box in boxes_list],
         'max': [box['whishi'] for box in boxes_list]},
        index=range(1, len(boxes_list) + 1),
    )

    font = {
        'family': 'monospace',
        'weight': 'bold',
        'size': 18
    }
    plt.rcParams["figure.autolayout"] = True
    plt.rc('lines', linewidth=5)
    plt.rc('font', **font)

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 10, forward=True)
    line_style = dict(linestyle='-', linewidth=2.5)
    ax.bxp(boxes_list, showfliers=False,
           boxprops=dict(line_style),
           flierprops=dict(line_style),
           medianprops=dict(line_style),
           whiskerprops=dict(line_style),
           capprops=dict(line_style))
    title = file.name.split('.')[-1]  # text after the last dot of the file name
    ax.set_title(f'Residues {title}')
    # plt.savefig(f"plots/{title}_residue", facecolor="white", bbox_inches="tight", dpi=800)
    plt.show()
    plt.close(fig)  # release the figure; one is created per file

### 10th and 90th percentile
1 plot per length classification

In [None]:
# Box plots of per-residue PLDDT for batches 1-5: one figure per
# "*-res_10_90.boxplot*" file found in ../residue/batch<n>.
# The "total-res" box is, per the original notes, the PLDDT over all residues.
# NOTE(review): going by the file names, the "min"/"max" entries of these files
# presumably hold the 10th/90th percentiles - confirm against the producer.

pathway = Path()
residues = ['total-res', 'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
            'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']
xdate = datetime.datetime.now().strftime("%m-%d-%Y")

# Height of the green horizontal cut-off line.
# Original note: "use total-res median as cut-off 90.63".
plddt_cutoff = 80


def read_boxplot_columns(path, known_labels):
    """Collect the statistics listed in one ``*.boxplot*`` text file.

    Each line is stripped and then handled as follows:

    * a line equal to one of ``known_labels`` is recorded as a section header;
    * a line starting with ``median``, ``min``, ``max``, ``75 percentile`` or
      ``25 percentile`` contributes the number after its last ``:``;
    * every other line is ignored.

    Returns ``(headers, columns)``: the headers in file order, and a dict that
    maps the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and
    ``q1`` to the values found, also in file order.

    Raises ``ValueError`` if a matched line does not end in a number.
    """
    prefix_to_key = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    headers = []
    columns = {key: [] for _, key in prefix_to_key}
    with open(path, 'r') as handle:
        for raw_line in handle:
            item = raw_line.strip()
            if item in known_labels:
                headers.append(item)
                continue
            for prefix, key in prefix_to_key:
                if item.startswith(prefix):
                    columns[key].append(float(item.split(':')[-1]))
                    break
    return headers, columns


def boxes_from_columns(headers, columns, expected_labels, source):
    """Build the list of per-box stat dicts that ``Axes.bxp`` draws.

    ``columns`` must hold exactly one value per expected label for every
    statistic; otherwise ``ValueError`` is raised and names ``source``.
    Boxes keep the order of the file. Labels are the headers read from the
    file when there is one per box, so a box is never given the wrong name if
    the file lists its sections in another order than ``expected_labels``;
    without usable headers the labels fall back to ``expected_labels``.
    """
    expected = len(expected_labels)
    counts = {key: len(values) for key, values in columns.items()}
    if any(count != expected for count in counts.values()):
        raise ValueError(
            f"{source}: expected {expected} values per statistic, found {counts}"
        )
    labels = headers if len(headers) == expected else list(expected_labels)
    return [
        {
            'label': label,                    # name shown under the box
            'whislo': columns['whislo'][i],    # bottom whisker
            'q1': columns['q1'][i],            # 25th percentile
            'med': columns['med'][i],          # median
            'q3': columns['q3'][i],            # 75th percentile
            'whishi': columns['whishi'][i],    # top whisker
            'fliers': [],                      # no outliers are drawn
        }
        for i, label in enumerate(labels)
    ]


# Empty, typed table so that ``res_plddt`` exists even when no file matches.
res_plddt = pd.DataFrame(
    {'Residue': pd.Series(dtype='str'),
     'min': pd.Series(dtype='float'),
     '25th_Percentile': pd.Series(dtype='float'),
     'median': pd.Series(dtype='float'),
     '75th_Percentile': pd.Series(dtype='float'),
     'max': pd.Series(dtype='float')}
)

for n in range(1, 6):
    print(f'Batch{n}')
    # sorted(): glob order is filesystem dependent; sorting keeps the figure order stable.
    for file in sorted(pathway.glob(f"../residue/batch{n}/*-res_10_90.boxplot*")):
        print(f'batch{n}: ', file.name)
        headers, columns = read_boxplot_columns(file, residues)
        boxes_list = boxes_from_columns(headers, columns, residues, file.name)

        # The same numbers as a table (rows numbered from 1), kept for inspection.
        res_plddt = pd.DataFrame(
            {'Residue': [box['label'] for box in boxes_list],
             'min': [box['whislo'] for box in boxes_list],
             '25th_Percentile': [box['q1'] for box in boxes_list],
             'median': [box['med'] for box in boxes_list],
             '75th_Percentile': [box['q3'] for box in boxes_list],
             'max': [box['whishi'] for box in boxes_list]},
            index=range(1, len(boxes_list) + 1),
        )

        # Only the tick label of the total box is rewritten (split over two lines).
        plot_boxes = [
            dict(box, label='   total \n res') if box['label'] == 'total-res' else box
            for box in boxes_list
        ]

        font = {
            'family': 'monospace',
            'weight': 'bold',
            # 'size': 18
        }
        plt.rcParams["figure.autolayout"] = True
        plt.rc('lines', linewidth=5)
        plt.rc('font', **font)

        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(26, 10, forward=True)
        line_style = dict(linestyle='-', linewidth=2.5)
        ax.bxp(plot_boxes, showfliers=False,
               boxprops=dict(line_style),
               flierprops=dict(line_style),
               medianprops=dict(line_style),
               whiskerprops=dict(line_style),
               capprops=dict(line_style))
        ax.set_ylim(bottom=0)
        ax.axhline(y=plddt_cutoff, color='g', linestyle='-', lw=3)
        title = file.name.split('.')[-1]  # text after the last dot of the file name
        ax.set_title(f'Residues {title}')
        # plt.savefig(f"../plots/batch{n}/{title}_res{n}length_total_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=150)
        plt.show()
        plt.close(fig)  # release the figure; one is created per file

## Classified by residue type
This shows variation of PLDDT as a function of number of residues. Each plot shows only one residue.

### single residue plots
1 residue per plot

In [None]:
# Plots are classified by residue types. Each residue type's variation with number of
# amino acids is shown.
# Horizontal PLDDT cutoff line is added
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# One figure per residue type and batch: each box is that residue's section
# in one of the "stat-res_10_90.boxplot*" files of ../residue/batch<n>.
# ``xdate`` (set in the Libraries cell) is used in the name of the saved figure.

pathway = Path()
res_names = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
             'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']

# Uncomment for the old batches where pathway.glob(f"../residue/*-res_10_90.boxplot*"):
# box_names = ['stat-res.boxplot', 'stat-res.boxplot-0', 'stat-res.boxplot-1',
# 'stat-res.boxplot-2', 'stat-res.boxplot-3', 'stat-res.boxplot-4', 'stat-res.boxplot-5',
# 'stat-res.boxplot-6', 'stat-res.boxplot-7', 'stat-res.boxplot-8', 'stat-res.boxplot-9']

box_names = ['stat-res_10_90.boxplot', 'stat-res_10_90.boxplot-0', 'stat-res_10_90.boxplot-1',
             'stat-res_10_90.boxplot-2', 'stat-res_10_90.boxplot-3', 'stat-res_10_90.boxplot-4',
             'stat-res_10_90.boxplot-5', 'stat-res_10_90.boxplot-6', 'stat-res_10_90.boxplot-7',
             'stat-res_10_90.boxplot-8', 'stat-res_10_90.boxplot-9', 'stat-res_10_90.boxplot-10']

# tags[i] is the label suffix for the box read from box_names[i].
tags = ['-total', '-0', '-1', '-2', '-3', '-4', '-5', '-6', '-7', '-8', '-9', '-10']

# Height of the green horizontal cut-off line.
plddt_cutoff = 80


def read_boxplot_sections(path, wanted_labels):
    """Return ``{label: stats}`` for the wanted sections found in ``path``.

    Lines are compared after stripping whitespace. A section is a line equal
    to one of ``wanted_labels`` followed by five lines that start, in this
    order, with ``median``, ``min``, ``max``, ``75 percentile`` and
    ``25 percentile``; the number after the last ``:`` of each is stored under
    the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and ``q1``.

    Sections that are cut short or list their lines differently are left out,
    and if a label has several complete sections the first one is kept.
    Raises ``ValueError`` if a matched line does not end in a number.
    """
    layout = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    with open(path, 'r') as handle:
        lines = [raw_line.strip() for raw_line in handle]

    sections = {}
    for position, item in enumerate(lines):
        if item not in wanted_labels or item in sections:
            continue
        body = lines[position + 1:position + 1 + len(layout)]
        complete = len(body) == len(layout) and all(
            text.startswith(prefix) for text, (prefix, _) in zip(body, layout)
        )
        if complete:
            sections[item] = {
                key: float(text.split(':')[-1])
                for text, (_, key) in zip(body, layout)
            }
    return sections


# Empty, typed table so that ``res_plddt`` exists even when nothing is found.
res_plddt = pd.DataFrame(
    {'res': pd.Series(dtype='str'),
     'min': pd.Series(dtype='float'),
     '25th_Percentile': pd.Series(dtype='float'),
     'median': pd.Series(dtype='float'),
     '75th_Percentile': pd.Series(dtype='float'),
     'max': pd.Series(dtype='float')}
)

for n in range(2, 6):
    print(f'Batch{n}')
    batch_dir = pathway / ".." / "residue" / f"batch{n}"

    # Parse every file once per batch instead of once per residue. Each tag
    # stays attached to its own file, so a missing file cannot shift labels.
    parsed = [
        (tag, read_boxplot_sections(batch_dir / name, res_names))
        for tag, name in zip(tags, box_names)
        if (batch_dir / name).is_file()
    ]

    for res in res_names:
        print(f'{n}', res)
        boxes_list = [
            dict(sections[res], label=res + tag, fliers=[])
            for tag, sections in parsed
            if res in sections
        ]
        if not boxes_list:
            print(f'  no statistics found for {res} in {batch_dir}; skipping')
            continue

        # The same numbers as a table (rows numbered from 1), kept for inspection.
        res_plddt = pd.DataFrame(
            {'res': [box['label'] for box in boxes_list],
             'min': [box['whislo'] for box in boxes_list],
             '25th_Percentile': [box['q1'] for box in boxes_list],
             'median': [box['med'] for box in boxes_list],
             '75th_Percentile': [box['q3'] for box in boxes_list],
             'max': [box['whishi'] for box in boxes_list]},
            index=range(1, len(boxes_list) + 1),
        )

        font = {
            'family': 'monospace',
            'weight': 'bold',
            'size': 19
        }
        # plt.rcParams["figure.autolayout"] = True
        plt.rc('lines', linewidth=30)
        plt.rc('font', **font)

        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(26, 10, forward=True)
        line_style = dict(linestyle='-', linewidth=2.5)
        ax.bxp(boxes_list, showfliers=False,
               boxprops=dict(line_style),
               flierprops=dict(line_style),
               medianprops=dict(line_style),
               whiskerprops=dict(line_style),
               capprops=dict(line_style))
        ax.axhline(y=plddt_cutoff, color='g', linestyle='-', lw=3)
        ax.set_ylim(bottom=0)
        title = res
        ax.set_title(f'Residue: {title}')

        out_path = Path(f"../plots/batch{n}/res/{title}_res{n}_{xdate}.tiff")
        # savefig does not create folders; make sure the target exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(out_path, facecolor="white", bbox_inches="tight", dpi=150)
        plt.show()
        plt.close(fig)  # release the figure; one is created per residue and batch

### minimum and maximum
All residues

In [None]:
# This cell gets the PLDDT from res *.boxplot files in a directory and produces total boxplots only
# This draws a horizontal PLDDT cutoff line

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# One figure with one box per residue type, read from ../residue/stat-res.boxplot.
# A green horizontal line marks the PLDDT cut-off.

pathway = Path()
res_names = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
             'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']
box_names = ['stat-res.boxplot']

# Height of the green horizontal cut-off line.
plddt_cutoff = 80


def read_boxplot_sections(path, wanted_labels):
    """Return ``{label: stats}`` for the wanted sections found in ``path``.

    Lines are compared after stripping whitespace. A section is a line equal
    to one of ``wanted_labels`` followed by five lines that start, in this
    order, with ``median``, ``min``, ``max``, ``75 percentile`` and
    ``25 percentile``; the number after the last ``:`` of each is stored under
    the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and ``q1``.

    Sections that are cut short or list their lines differently are left out,
    and if a label has several complete sections the first one is kept.
    Raises ``ValueError`` if a matched line does not end in a number.
    """
    layout = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    with open(path, 'r') as handle:
        lines = [raw_line.strip() for raw_line in handle]

    sections = {}
    for position, item in enumerate(lines):
        if item not in wanted_labels or item in sections:
            continue
        body = lines[position + 1:position + 1 + len(layout)]
        complete = len(body) == len(layout) and all(
            text.startswith(prefix) for text, (prefix, _) in zip(body, layout)
        )
        if complete:
            sections[item] = {
                key: float(text.split(':')[-1])
                for text, (_, key) in zip(body, layout)
            }
    return sections


# Read each file once; if several files carry the same label the first wins.
residue_dir = pathway / ".." / "residue"
sections = {}
for name in box_names:
    stats_path = residue_dir / name
    if stats_path.is_file():
        for label, stats in read_boxplot_sections(stats_path, res_names).items():
            sections.setdefault(label, stats)

# Every box carries the label of the section it was read from, so a residue
# that is absent from the file cannot shift the labels of the others.
missing = [res for res in res_names if res not in sections]
if missing:
    print('no statistics found for: ' + ', '.join(missing))
boxes_list = [
    dict(sections[res], label=res, fliers=[])
    for res in res_names
    if res in sections
]

if boxes_list:
    font = {
        'family': 'monospace',
        'weight': 'bold',
        # 'size': 19
    }
    # plt.rcParams["figure.autolayout"] = True
    plt.rc('lines', linewidth=30)
    plt.rc('font', **font)

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(32, 10, forward=True)
    line_style = dict(linestyle='-', linewidth=2.5)
    ax.bxp(boxes_list, showfliers=False,
           boxprops=dict(line_style),
           flierprops=dict(line_style),
           medianprops=dict(line_style),
           whiskerprops=dict(line_style),
           capprops=dict(line_style))
    ax.axhline(y=plddt_cutoff, color='g', linestyle='-', lw=3)
    ax.set_title('Total PLDDT for all residues', weight='bold')
    # plt.savefig(f"../plots/residues/residues_total_plddt_cutoff", facecolor="white", bbox_inches="tight", dpi=fig.dpi)
    plt.show()
    plt.close(fig)
else:
    print(f'nothing to plot: no residue statistics found in {residue_dir}')

### 10th and 90th percentiles
All residues in one boxplot

In [None]:
# This cell gets the PLDDT from res *.boxplot files in a directory and produces total boxplots only
# This draws a horizontal PLDDT cutoff line
# Min and Max are replaced with 10th and 90th percentiles

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# For each of batches 1-5: one figure with a box for "total-res" and for every
# residue type, read from ../residue/batch<n>/stat-res_10_90.boxplot.
# The figure is saved to ../plots/batch<n>/batch<n>_residues_total.tiff.

pathway = Path()
res_names = ['total-res', 'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
             'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']
box_names = ['stat-res_10_90.boxplot']

# Height of the green horizontal cut-off line.
# Original note: "use the total-res median as cut-off".
plddt_cutoff = 80


def read_boxplot_sections(path, wanted_labels):
    """Return ``{label: stats}`` for the wanted sections found in ``path``.

    Lines are compared after stripping whitespace. A section is a line equal
    to one of ``wanted_labels`` followed by five lines that start, in this
    order, with ``median``, ``min``, ``max``, ``75 percentile`` and
    ``25 percentile``; the number after the last ``:`` of each is stored under
    the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and ``q1``.

    Sections that are cut short or list their lines differently are left out,
    and if a label has several complete sections the first one is kept.
    Raises ``ValueError`` if a matched line does not end in a number.
    """
    layout = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    with open(path, 'r') as handle:
        lines = [raw_line.strip() for raw_line in handle]

    sections = {}
    for position, item in enumerate(lines):
        if item not in wanted_labels or item in sections:
            continue
        body = lines[position + 1:position + 1 + len(layout)]
        complete = len(body) == len(layout) and all(
            text.startswith(prefix) for text, (prefix, _) in zip(body, layout)
        )
        if complete:
            sections[item] = {
                key: float(text.split(':')[-1])
                for text, (_, key) in zip(body, layout)
            }
    return sections


for n in range(1, 6):
    print(f'batch{n} total residue plots')
    batch_dir = pathway / ".." / "residue" / f"batch{n}"

    # Read each file once; if several files carry the same label the first wins.
    sections = {}
    for name in box_names:
        stats_path = batch_dir / name
        if stats_path.is_file():
            for label, stats in read_boxplot_sections(stats_path, res_names).items():
                sections.setdefault(label, stats)

    # Every box carries the label of the section it was read from, so a label
    # that is absent from the file cannot shift the labels of the others.
    boxes_list = [
        dict(sections[res], label=res, fliers=[])
        for res in res_names
        if res in sections
    ]
    if not boxes_list:
        print(f'  no residue statistics found in {batch_dir}; skipping')
        continue
    missing = [res for res in res_names if res not in sections]
    if missing:
        print('  no statistics found for: ' + ', '.join(missing))

    font = {
        'family': 'monospace',
        'weight': 'bold',
        # 'size': 19
    }
    # plt.rcParams["figure.autolayout"] = True
    plt.rc('lines', linewidth=30)
    plt.rc('font', **font)

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(32, 10, forward=True)
    line_style = dict(linestyle='-', linewidth=2.5)
    ax.bxp(boxes_list, showfliers=False,
           boxprops=dict(line_style),
           flierprops=dict(line_style),
           medianprops=dict(line_style),
           whiskerprops=dict(line_style),
           capprops=dict(line_style))
    ax.set_ylim(bottom=0)
    ax.axhline(y=plddt_cutoff, color='g', linestyle='-', lw=3)
    ax.set_title('Total PLDDT for all residues', weight='bold')

    out_path = Path(f"../plots/batch{n}/batch{n}_residues_total.tiff")
    # savefig does not create folders; make sure the target exists first.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, facecolor="white", bbox_inches="tight", dpi=150)
    plt.show()
    plt.close(fig)  # release the figure; one is created per batch

## SS
PLDDT of AlphaFold predictions according to the 7 secondary structures.

### Proteins classified according to number of residues in each protein
Each plot has all the secondary structures within the same range of number of residues

### minimum and maximum

In [None]:
# This cell gets the PLDDT from *.boxplot files in a directory and produces boxplots

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# Box plots of PLDDT per secondary-structure type: one figure per
# "*-ss.boxplot*" file found in ../ss. Whiskers are the "min" and "max"
# values of each file.

pathway = Path()
ss_names = ['coil', 'beta-sheet', 'beta-bridge', 'turn', 'bend', 'ahelix', 'three-helix']


def read_boxplot_columns(path, known_labels):
    """Collect the statistics listed in one ``*.boxplot*`` text file.

    Each line is stripped and then handled as follows:

    * a line equal to one of ``known_labels`` is recorded as a section header;
    * a line starting with ``median``, ``min``, ``max``, ``75 percentile`` or
      ``25 percentile`` contributes the number after its last ``:``;
    * every other line is ignored.

    Returns ``(headers, columns)``: the headers in file order, and a dict that
    maps the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and
    ``q1`` to the values found, also in file order.

    Raises ``ValueError`` if a matched line does not end in a number.
    """
    prefix_to_key = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    headers = []
    columns = {key: [] for _, key in prefix_to_key}
    with open(path, 'r') as handle:
        for raw_line in handle:
            item = raw_line.strip()
            if item in known_labels:
                headers.append(item)
                continue
            for prefix, key in prefix_to_key:
                if item.startswith(prefix):
                    columns[key].append(float(item.split(':')[-1]))
                    break
    return headers, columns


def boxes_from_columns(headers, columns, expected_labels, source):
    """Build the list of per-box stat dicts that ``Axes.bxp`` draws.

    ``columns`` must hold exactly one value per expected label for every
    statistic; otherwise ``ValueError`` is raised and names ``source``.
    Boxes keep the order of the file. Labels are the headers read from the
    file when there is one per box, so a box is never given the wrong name if
    the file lists its sections in another order than ``expected_labels``;
    without usable headers the labels fall back to ``expected_labels``.
    """
    expected = len(expected_labels)
    counts = {key: len(values) for key, values in columns.items()}
    if any(count != expected for count in counts.values()):
        raise ValueError(
            f"{source}: expected {expected} values per statistic, found {counts}"
        )
    labels = headers if len(headers) == expected else list(expected_labels)
    return [
        {
            'label': label,                    # name shown under the box
            'whislo': columns['whislo'][i],    # bottom whisker
            'q1': columns['q1'][i],            # 25th percentile
            'med': columns['med'][i],          # median
            'q3': columns['q3'][i],            # 75th percentile
            'whishi': columns['whishi'][i],    # top whisker
            'fliers': [],                      # no outliers are drawn
        }
        for i, label in enumerate(labels)
    ]


# Empty, typed table so that ``ss_plddt`` exists even when no file matches.
ss_plddt = pd.DataFrame(
    {'ss': pd.Series(dtype='str'),
     'min': pd.Series(dtype='float'),
     '25th_Percentile': pd.Series(dtype='float'),
     'median': pd.Series(dtype='float'),
     '75th_Percentile': pd.Series(dtype='float'),
     'max': pd.Series(dtype='float')}
)

# sorted(): glob order is filesystem dependent; sorting keeps the figure order stable.
for file in sorted(pathway.glob("../ss/*-ss.boxplot*")):
    headers, columns = read_boxplot_columns(file, ss_names)
    boxes_list = boxes_from_columns(headers, columns, ss_names, file.name)

    # The same numbers as a table (rows numbered from 1), kept for inspection.
    ss_plddt = pd.DataFrame(
        {'ss': [box['label'] for box in boxes_list],
         'min': [box['whislo'] for box in boxes_list],
         '25th_Percentile': [box['q1'] for box in boxes_list],
         'median': [box['med'] for box in boxes_list],
         '75th_Percentile': [box['q3'] for box in boxes_list],
         'max': [box['whishi'] for box in boxes_list]},
        index=range(1, len(boxes_list) + 1),
    )

    font = {
        'family': 'monospace',
        'weight': 'bold',
        'size': 18
    }
    plt.rcParams["figure.autolayout"] = True
    plt.rc('lines', linewidth=5)
    plt.rc('font', **font)

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 10, forward=True)
    line_style = dict(linestyle='-', linewidth=2.5)
    ax.bxp(boxes_list, showfliers=False,
           boxprops=dict(line_style),
           flierprops=dict(line_style),
           medianprops=dict(line_style),
           whiskerprops=dict(line_style),
           capprops=dict(line_style))
    title = file.name.split('.')[-1]  # text after the last dot of the file name
    ax.set_title(f'SS {title}')
    # plt.savefig(f"plots/{title}_ss", facecolor="white", bbox_inches="tight", dpi=800)
    plt.show()
    plt.close(fig)  # release the figure; one is created per file

### 10th and 90th percentiles

In [None]:
# This cell gets the PLDDT from *.boxplot files in a directory and produces boxplots
# Total PLDDT across all structures are added. There is a cutoff line too.

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# Box plots of PLDDT per secondary-structure type for batches 1-5: one figure
# per "*-ss_10_90.boxplot*" file found in ../ss/batch<n>, with a "total-ss"
# box and a horizontal cut-off line. Each figure is saved under
# ../plots/batch<n>; ``xdate`` (set in the Libraries cell) is part of the name.

pathway = Path()
# ss_names = ['total-ss','coil','beta-sheet','beta-bridge','turn','bend','ahelix','three-helix']
ss_names = ['total-ss', 'ahelix', 'bend', 'beta-bridge', 'beta-sheet', 'coil',
            'five-helix', 'three-helix', 'turn']

# Height of the green horizontal cut-off line.
plddt_cutoff = 90.63


def read_boxplot_columns(path, known_labels):
    """Collect the statistics listed in one ``*.boxplot*`` text file.

    Each line is stripped and then handled as follows:

    * a line equal to one of ``known_labels`` is recorded as a section header;
    * a line starting with ``median``, ``min``, ``max``, ``75 percentile`` or
      ``25 percentile`` contributes the number after its last ``:``;
    * every other line is ignored.

    Returns ``(headers, columns)``: the headers in file order, and a dict that
    maps the ``Axes.bxp`` keys ``med``, ``whislo``, ``whishi``, ``q3`` and
    ``q1`` to the values found, also in file order.

    Raises ``ValueError`` if a matched line does not end in a number.
    """
    prefix_to_key = (
        ('median', 'med'),
        ('min', 'whislo'),
        ('max', 'whishi'),
        ('75 percentile', 'q3'),
        ('25 percentile', 'q1'),
    )
    headers = []
    columns = {key: [] for _, key in prefix_to_key}
    with open(path, 'r') as handle:
        for raw_line in handle:
            item = raw_line.strip()
            if item in known_labels:
                headers.append(item)
                continue
            for prefix, key in prefix_to_key:
                if item.startswith(prefix):
                    columns[key].append(float(item.split(':')[-1]))
                    break
    return headers, columns


def boxes_from_columns(headers, columns, expected_labels, source):
    """Build the list of per-box stat dicts that ``Axes.bxp`` draws.

    ``columns`` must hold exactly one value per expected label for every
    statistic; otherwise ``ValueError`` is raised and names ``source``.
    Boxes keep the order of the file. Labels are the headers read from the
    file when there is one per box, so a box is never given the wrong name if
    the file lists its sections in another order than ``expected_labels``;
    without usable headers the labels fall back to ``expected_labels``.
    """
    expected = len(expected_labels)
    counts = {key: len(values) for key, values in columns.items()}
    if any(count != expected for count in counts.values()):
        raise ValueError(
            f"{source}: expected {expected} values per statistic, found {counts}"
        )
    labels = headers if len(headers) == expected else list(expected_labels)
    return [
        {
            'label': label,                    # name shown under the box
            'whislo': columns['whislo'][i],    # bottom whisker
            'q1': columns['q1'][i],            # 25th percentile
            'med': columns['med'][i],          # median
            'q3': columns['q3'][i],            # 75th percentile
            'whishi': columns['whishi'][i],    # top whisker
            'fliers': [],                      # no outliers are drawn
        }
        for i, label in enumerate(labels)
    ]


# Empty, typed table so that ``ss_plddt`` exists even when no file matches.
ss_plddt = pd.DataFrame(
    {'ss': pd.Series(dtype='str'),
     'min': pd.Series(dtype='float'),
     '25th_Percentile': pd.Series(dtype='float'),
     'median': pd.Series(dtype='float'),
     '75th_Percentile': pd.Series(dtype='float'),
     'max': pd.Series(dtype='float')}
)

for n in range(1, 6):
    print(f'Batch{n}')
    # sorted(): glob order is filesystem dependent; sorting keeps the figure order stable.
    for file in sorted(pathway.glob(f"../ss/batch{n}/*-ss_10_90.boxplot*")):
        headers, columns = read_boxplot_columns(file, ss_names)
        boxes_list = boxes_from_columns(headers, columns, ss_names, file.name)

        # The same numbers as a table (rows numbered from 1), kept for inspection.
        ss_plddt = pd.DataFrame(
            {'ss': [box['label'] for box in boxes_list],
             'min': [box['whislo'] for box in boxes_list],
             '25th_Percentile': [box['q1'] for box in boxes_list],
             'median': [box['med'] for box in boxes_list],
             '75th_Percentile': [box['q3'] for box in boxes_list],
             'max': [box['whishi'] for box in boxes_list]},
            index=range(1, len(boxes_list) + 1),
        )

        font = {
            'family': 'monospace',
            'weight': 'bold',
            # 'size': 18
        }
        plt.rcParams["figure.autolayout"] = True
        plt.rc('lines', linewidth=5)
        plt.rc('font', **font)

        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(26, 10, forward=True)
        line_style = dict(linestyle='-', linewidth=2.5)
        ax.bxp(boxes_list, showfliers=False,
               boxprops=dict(line_style),
               flierprops=dict(line_style),
               medianprops=dict(line_style),
               whiskerprops=dict(line_style),
               capprops=dict(line_style))
        ax.set_ylim(bottom=0)
        ax.axhline(y=plddt_cutoff, color='g', linestyle='-', lw=3)
        title = file.name.split('.')[-1]  # text after the last dot of the file name
        ax.set_title(f'SS {title}', weight='bold')

        out_path = Path(f"../plots/batch{n}/{title}_ss{n}_reslength_total_{xdate}.tiff")
        # savefig does not create folders; make sure the target exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(out_path, facecolor="white", bbox_inches="tight", dpi=150)
        plt.show()
        plt.close(fig)  # release the figure; one is created per file

### Plots classified according to secondary structure type
Each plot shows a secondary structure's PLDDT variation with protein length.

### single ss plots

In [None]:
# This cell gets the PLDDT from SS *.boxplot files in a directory and produces boxplots
# This draws a horizontal PLDDT cutoff line

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# Inputs : ../ss/batch<n>/stat-ss_10_90.boxplot[-<k>]   (one file per box in a figure)
# Outputs: ../plots/batch<n>/ss/<ss>_ss<n>_reslength_<date>.tiff   (one figure per secondary structure)
# Uses `xdate` (the date stamp defined in the "Libraries" cell) in the output file names.
pathway = Path()
ss_names = ["total-ss", "ahelix", "bend", "beta-bridge", "beta-sheet", "coil", "five-helix", "three-helix", "turn"]

# One input file per box. Older batches used the prefix "stat-ss.boxplot" instead.
box_prefix = "stat-ss_10_90.boxplot"
box_names = [box_prefix] + [f"{box_prefix}-{k}" for k in range(11)]
# Label suffix for each file, in the same order as box_names.
# NOTE(review): 0..10 are presumably protein-length bins - confirm against the file producer.
tags = ["\ntotal"] + [f"\n{k}" for k in range(11)]

# The five "<key>: <value>" lines expected directly after a secondary-structure
# name line, in file order, paired with the Axes.bxp statistic each one feeds.
stat_lines = (
    ("median", "med"),
    ("min", "whislo"),
    ("max", "whishi"),
    ("75 percentile", "q3"),
    ("25 percentile", "q1"),
)

plddt_cutoff = 80  # y position of the horizontal green cut-off line
line_style = {"linestyle": "-", "linewidth": 2.5}

# These rc settings are global and loop-invariant, so apply them once up front.
font = {
    "family": "monospace",
    "weight": "bold",
}
plt.rc("lines", linewidth=30)
plt.rc("font", **font)

for n in range(1, 6):
    if n == 3:
        # Batch 3 is skipped; the reason is not recorded in this notebook.
        continue
    print(f"Batch{n}")

    # Read every input file once per batch (not once per secondary structure).
    batch_dir = pathway / ".." / "ss" / f"batch{n}"
    batch_files = []  # (file name, label suffix, file lines)
    for name, tag in zip(box_names, tags):
        path = batch_dir / name
        if not path.is_file():
            print(f"  missing {path}; that box is left out")
            continue
        with open(path, "r") as note:
            batch_files.append((name, tag, note.readlines()))

    out_dir = Path(f"../plots/batch{n}/ss")
    out_dir.mkdir(parents=True, exist_ok=True)

    for ss in ss_names:
        # Each box carries its own label, so a missing file or an incomplete
        # block can never shift the labels relative to the values.
        boxes_list = []
        for name, tag, lines in batch_files:
            stats = None
            for idx, line in enumerate(lines):
                if line.strip() != ss:
                    continue
                block = lines[idx + 1: idx + 1 + len(stat_lines)]
                complete = len(block) == len(stat_lines) and all(
                    row.startswith(prefix) for row, (prefix, _) in zip(block, stat_lines)
                )
                if complete:
                    stats = {
                        key: float(row.split(":")[-1].strip())
                        for row, (_, key) in zip(block, stat_lines)
                    }
                break  # only the first occurrence of the name is used
            if stats is None:
                print(f"  {ss}: no complete statistics block in {name}; that box is left out")
                continue
            boxes_list.append({"label": ss + tag, **stats, "fliers": []})

        if not boxes_list:
            print(f"  {ss}: nothing to plot for batch {n}")
            continue

        # Tabular copy of what is plotted, kept for inspection after the cell runs.
        ss_plddt = pd.DataFrame({
            "ss": [box["label"] for box in boxes_list],
            "min": [box["whislo"] for box in boxes_list],
            "25th_Percentile": [box["q1"] for box in boxes_list],
            "median": [box["med"] for box in boxes_list],
            "75th_Percentile": [box["q3"] for box in boxes_list],
            "max": [box["whishi"] for box in boxes_list],
        })
        # 1-based row labels (the original assigned to a non-existent `start_index`
        # attribute, which left the index untouched).
        ss_plddt.index = range(1, ss_plddt.shape[0] + 1)

        fig_1, ax_1 = plt.subplots(1, 1)
        fig_1.set_size_inches(38, 10, forward=True)
        ax_1.bxp(
            boxes_list,
            showfliers=False,
            boxprops=dict(line_style),
            flierprops=dict(line_style),
            medianprops=dict(line_style),
            whiskerprops=dict(line_style),
            capprops=dict(line_style),
        )
        ax_1.axhline(y=plddt_cutoff, color="g", linestyle="-", lw=3)
        ax_1.set_ylim(bottom=0)
        ax_1.set_title(f"Secondary Structure: {ss}", weight='bold')
        fig_1.savefig(out_dir / f"{ss}_ss{n}_reslength_{xdate}.tiff", facecolor="white", bbox_inches="tight", dpi=150)
        plt.show()
        plt.close(fig_1)  # release the figure; this loop creates dozens of them

### All ss in one plot

In [None]:
# This cell gets the PLDDT from SS *.boxplot files in a directory and produces total boxplots only
# This draws a horizontal PLDDT cutoff line
# Replaces Min and Max with 10th and 90th percentiles

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# Input : ../ss/batch<n>/stat-ss_10_90.boxplot   (statistics over the whole batch)
# Output: ../plots/batch<n>/batch<n>_ss_total.tiff  (one figure per batch, one box per secondary structure)
# NOTE(review): the stat-ss_10_90 files presumably store the 10th/90th percentiles
# on their "min"/"max" lines - confirm against the script that writes them.
pathway = Path()
ss_names = ['total-ss', 'ahelix', 'bend', 'beta-bridge', 'beta-sheet', 'coil', "five-helix", 'three-helix', 'turn']
box_name = 'stat-ss_10_90.boxplot'

# The five "<key>: <value>" lines expected directly after a secondary-structure
# name line, in file order, paired with the Axes.bxp statistic each one feeds.
stat_lines = (
    ('median', 'med'),
    ('min', 'whislo'),
    ('max', 'whishi'),
    ('75 percentile', 'q3'),
    ('25 percentile', 'q1'),
)

plddt_cutoff = 90.6  # use total-ss median as cut-off

for n in range(1, 6):
    if n == 3:
        # Batch 3 is skipped; the reason is not recorded in this notebook.
        continue
    print(f"Batch{n}")

    path = pathway / ".." / "ss" / f"batch{n}" / box_name
    if not path.is_file():
        print(f"  missing {path}; no figure for this batch")
        continue
    # The file is read once and searched for every secondary structure.
    with open(path, 'r') as note:
        lines = note.readlines()

    # A box is created only when its statistics were actually found, so the
    # label always belongs to the values plotted under it.
    boxes_list = []
    for ss in ss_names:
        stats = None
        for idx, line in enumerate(lines):
            if line.strip() != ss:
                continue
            block = lines[idx + 1: idx + 1 + len(stat_lines)]
            complete = len(block) == len(stat_lines) and all(
                row.startswith(prefix) for row, (prefix, _) in zip(block, stat_lines)
            )
            if complete:
                stats = {
                    key: float(row.split(':')[-1].strip())
                    for row, (_, key) in zip(block, stat_lines)
                }
            break  # only the first occurrence of the name is used
        if stats is None:
            print(f"  {ss}: no complete statistics block in {path.name}; that box is left out")
            continue
        boxes_list.append({'label': ss, **stats, 'fliers': []})

    if not boxes_list:
        print(f"  nothing to plot for batch {n}")
        continue

    out_dir = Path(f"../plots/batch{n}")
    out_dir.mkdir(parents=True, exist_ok=True)

    fig_1, ax_1 = plt.subplots(1, 1)
    fig_1.set_size_inches(32, 10, forward=True)
    ax_1.bxp(
        boxes_list,
        showfliers=False,
        boxprops=dict(linestyle='-', linewidth=2.5),
        flierprops=dict(linestyle='-', linewidth=3.5),
        medianprops=dict(linestyle='-', linewidth=3.5),
        whiskerprops=dict(linestyle='-', linewidth=3.5),
        capprops=dict(linestyle='-', linewidth=3.5),
    )
    ax_1.set_ylim(bottom=0)
    ax_1.axhline(y=plddt_cutoff, color='g', linestyle='-', lw=3)
    ax_1.set_title('Total PLDDT for all Secondary Structures', weight='bold')
    fig_1.savefig(out_dir / f"batch{n}_ss_total.tiff", facecolor="white", bbox_inches="tight", dpi=150)
    plt.show()
    plt.close(fig_1)  # release the figure before the next batch

## Generating a list of 1M proteins

In [None]:
# This cell grabs uniprot IDs and alphafold2 IDs from file names and creates a list

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt


# Input : ../data/uniprot-id/first_1m/filelist*   (one model file name per line)
# Output: first_1m_version.csv in the current working directory
#
# Each line is split on "-" and must yield at least four parts:
#   parts 0-2 joined with "-" -> AlphaFoldDB ID
#   part 1                    -> Uniprot ID
#   part 3 (up to first ".")  -> trailing digits are the model version
pathway = Path()

# sorted() makes the row order independent of the file system's listing order.
list_files = sorted(pathway.glob("../data/uniprot-id/first_1m/filelist*"))

records = []   # (Uniprot ID, AlphaFoldDB ID, Version), one tuple per parsed line
skipped = 0    # blank or malformed lines

for file in list_files:
    with open(file, 'r') as note:
        for line in note:
            item = line.strip().split('-')
            if len(item) < 4:
                # Blank line or unexpected name format.
                skipped += 1
                continue

            alphadb_id = '-'.join(item[:3])
            uniprot_id = item[1]

            # Take ALL trailing digits so multi-digit versions (v10, v11, ...) survive;
            # fall back to the last character when the stem does not end in a digit.
            stem = item[3].split('.')[0]
            version = stem[len(stem.rstrip('0123456789')):] or stem[-1:]

            records.append((uniprot_id, alphadb_id, version))

if skipped:
    print(f"Skipped {skipped} blank/malformed line(s)")

if not records:
    # Do not overwrite an existing CSV with an empty table.
    raise FileNotFoundError(
        "No entries parsed from ../data/uniprot-id/first_1m/filelist* "
        f"({len(list_files)} file(s) found)"
    )

df_1m = pd.DataFrame(records, columns=['Uniprot ID', 'AlphaFoldDB ID', 'Version'])

# Python's built-in hash() of a str is salted per process, so it yields different
# numbers after every kernel restart. pandas' hashing uses a fixed key, so the
# column is reproducible (values are unsigned 64-bit integers).
df_1m.insert(1, 'Uniprot hash', pd.util.hash_pandas_object(df_1m['Uniprot ID'], index=False))

df_1m.to_csv('first_1m_version.csv', index=False)