# Predicting accessibility from AlphaFold structures - human proteome

### Scripts are adapted from example scripts in the StructureMap GitHub: https://github.com/MannLabs/structuremap/tree/main/nbs
### In accordance with the original Apache license, the original scripts have been adapted to run over a list of UniProt accessions taken from the human proteome FASTA.

## Import libraries

In [None]:
# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization


In [None]:
# Import general-purpose libraries
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
import tqdm
import tempfile
import csv

## Set input/output directories

In [None]:
# File output
# Base directory for this analysis; the 'cif' and 'pae' folder paths used for
# the AlphaFold downloads are derived from it further down.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_dir = "/Users/mew21/Documents/GitHub/structuremap/analyses/hsapiens/"

# Uniprot accession input list
# Text file that is read below and split into the list of protein accessions.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
input_list = "/Users/mew21/Downloads/hsapiens_headers.txt"


## Import human proteome headers for AlphaFold download

In [None]:
# Read the accession list file and turn it into a list of UniProt accessions.
#
# The file is opened with a context manager so the handle is always closed,
# even if reading fails (the previous version never closed it).
with open(input_list, "r") as txt_file:
    file_content = txt_file.read()

# Entries are tab-separated. Line breaks are treated as separators as well and
# every field is stripped, so a trailing newline or a trailing tab no longer
# produces a bogus last entry (e.g. "P12345\n" or "") that would be passed on
# as a protein accession to the AlphaFold download functions.
human_fasta_list = [
    field.strip()
    for line in file_content.splitlines()
    for field in line.split("\t")
    if field.strip()
]

In [None]:
# Sanity check: number of entries parsed from the input list.
len(human_fasta_list)

In [None]:
# Show the last parsed entry; useful for spotting stray whitespace or an
# empty trailing field left over from splitting the file content.
human_fasta_list[-1]

## Download AlphaFold data

In [None]:
# Folder paths under output_dir that are passed as out_folder to the
# AlphaFold CIF and PAE download calls below.
cif_dir = os.path.join(output_dir, 'cif')
pae_dir = os.path.join(output_dir, 'pae')

In [None]:
# Display the resolved CIF folder path before downloading into it.
print(cif_dir)

In [None]:
# Download the AlphaFold CIF files for every entry of human_fasta_list into
# cif_dir. The call returns three values, unpacked here as the valid,
# invalid and already-existing proteins (names as chosen in this notebook;
# the exact semantics are defined by structuremap - confirm in its docs).
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=human_fasta_list,
    out_folder=cif_dir)

In [None]:
# Download the AlphaFold PAE files for the same list of proteins into
# pae_dir. As with the CIF download, three values are returned and unpacked
# as valid, invalid and already-existing proteins (semantics defined by
# structuremap - confirm in its docs).
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=human_fasta_list,
    out_folder=pae_dir, 
    )

## Format AlphaFold data input

In [None]:
# Build the annotation table from the CIF files in cif_dir, restricted to the
# proteins in human_fasta_list. The result is used as a DataFrame below and is
# merged on the columns 'protein_id', 'AA' and 'position'
# (presumably one row per residue - TODO confirm).
alphafold_annotation = format_alphafold_data(
    directory=cif_dir, 
    protein_ids=human_fasta_list)

In [None]:
#alphafold_annotation[0:3]

## Annotate pPSE values

In [None]:
# Accessibility annotation with max_dist=24 and max_angle=180 (the
# "full sphere" setting), using the PAE files in pae_dir as error_dir.
# The column 'nAA_24_180_pae' read later in this notebook presumably
# originates from this call - TODO confirm.
full_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=24, 
    max_angle=180, 
    error_dir=pae_dir)

In [None]:
# Attach the full-sphere exposure values to the annotation table. A left join
# keeps every row of alphafold_annotation; rows without a match on
# (protein_id, AA, position) get NaN in the added columns.
alphafold_accessibility = alphafold_annotation.merge(
    full_sphere_exposure, how='left', on=['protein_id','AA','position'])

In [None]:
# Accessibility annotation with max_dist=12 and max_angle=70 (the
# "part sphere" setting), again using the PAE files in pae_dir as error_dir.
# The column 'nAA_12_70_pae' thresholded below presumably originates from
# this call - TODO confirm.
part_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation, 
    max_dist=12, 
    max_angle=70, 
    error_dir=pae_dir)

In [None]:
# Add the part-sphere exposure values to the table that already holds the
# full-sphere values, again as a left join on (protein_id, AA, position).
alphafold_accessibility = alphafold_accessibility.merge(
    part_sphere_exposure, how='left', on=['protein_id','AA','position'])

In [None]:
# Binary accessibility flags derived from the nAA_12_70_pae column with a
# cut-off of 5: high_acc_5 is 1 where the value is <= 5, low_acc_5 is 1 where
# it is > 5, and 0 otherwise.
# Note: a NaN value fails both comparisons, so such rows get 0 in both columns.
alphafold_accessibility['high_acc_5'] = np.where(alphafold_accessibility.nAA_12_70_pae <= 5, 1, 0)
alphafold_accessibility['low_acc_5'] = np.where(alphafold_accessibility.nAA_12_70_pae > 5, 1, 0)

## Annotate IDRs

In [None]:
# Smooth the 'nAA_24_180_pae' column. The [10] argument is presumably the
# smoothing window setting (TODO confirm against structuremap); the resulting
# column 'nAA_24_180_pae_smooth10' is thresholded in the next cell.
alphafold_accessibility_smooth = get_smooth_score(
    alphafold_accessibility, 
    np.array(['nAA_24_180_pae']), 
    [10])

In [None]:
# Flag rows as IDR (1) where the smoothed full-sphere value is <= 34.27,
# otherwise 0. NaN values fail the comparison and therefore get 0.
# NOTE(review): 34.27 is a hard-coded cut-off, presumably taken from the
# StructureMap example notebooks - confirm its origin.
alphafold_accessibility_smooth['IDR'] = np.where(
    alphafold_accessibility_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)

## Annotate short IDRs

In [None]:
# Annotate short IDR patterns on the IDR-flagged table, using a minimum
# structured length of 80 and a maximum unstructured length of 20.
# The 'flexible_pattern' column used in the next cell presumably comes from
# this call - TODO confirm.
alphafold_accessibility_smooth_pattern = annotate_proteins_with_idr_pattern(
    alphafold_accessibility_smooth,
    min_structured_length = 80, 
    max_unstructured_length = 20)

In [None]:
# Extend the 'flexible_pattern' annotation; the [5] argument is presumably
# the extension size applied around each pattern - TODO confirm against
# structuremap.
alphafold_accessibility_smooth_pattern_ext = get_extended_flexible_pattern(
    alphafold_accessibility_smooth_pattern, 
    ['flexible_pattern'], [5])

In [None]:
# Preview the first three rows of the final annotated table.
alphafold_accessibility_smooth_pattern_ext[0:3]

In [None]:
# List the columns available in the final annotated table.
alphafold_accessibility_smooth_pattern_ext.columns


In [None]:
# Export the accessibility table as CSV. With the default to_csv settings the
# DataFrame index is written as an extra, unnamed first column.
# NOTE(review): the path is relative, so the file lands in the current working
# directory rather than in output_dir - confirm this is intended.
# NOTE(review): this writes alphafold_accessibility, not the later
# alphafold_accessibility_smooth_pattern_ext table built in the IDR cells -
# confirm which table should be saved.
alphafold_accessibility.to_csv('AlphaFoldPredicted_Accessibility_hsapiens.csv')