# Generation of Negative Control Protein List for AlphaFold Multimer

This notebook creates a curated list of human proteins to serve as negative controls 
for AlphaFold Multimer (AFM) interaction prediction experiments.

These proteins are selected based on the following criteria:
- Swiss-Prot reviewed (high-quality UniProt entries)
- Sequence length ≤ 500 amino acids (to limit computational burden when used for interaction prediction with POI)
- High-confidence AlphaFold structure (at least 60% of residues with pLDDT > 85)
- Not present in any of our own experimental interactor datasets (e.g., LMCD1, TESTIN)

The goal is to obtain a set of proteins that are unlikely to interact with the bait proteins,
providing a robust baseline for:
- Estimating false positive interaction rates
- Computing Z-score normalized interaction confidence
- Benchmarking AlphaFold Multimer prediction specificity

The output is a biologically meaningful negative set, suitable for large-scale 
in silico screening workflows.


In [56]:
import pandas as pd
import os
import sys

sys.path.append('G:\\My Drive\\Uni\\Thesis\\Data\\python_functions')

from cutoff_method_final import cutoff_analysis, process_results_dict, plot_updated_venn_cutoff, combined_function

In [None]:
# Read in all the data from excel files.
filenames1 = ["ΔPET-GFP_processed_results", "FL-GFP_processed_results", "PET-GFP_processed_results", "NT-GFP_processed_results", "LIM1-3-GFP_processed_results"]
filenames2 = ["Stefano_EXT983_LMCD1_FL_processed_results", "Stefano_EXT983_LMCD1_CR_processed_results", "Stefano_EXT983_LMCD1_LIM1-2_processed_results", "Stefano_EXT983_LMCD1_PET_processed_results", "Stefano_EXT983_TES_CRC_processed_results" ]

# Read each file into a dictionary
data_dict_tes = {
    filename: pd.read_excel(
        rf"G:\My Drive\Uni\Thesis\Data\datasets_cutoff\tes\final_results\{filename}.xlsx",
        sheet_name="below_not_in_above"
    )
    for filename in filenames1
}

data_dict_lmcd1 = {
    filename: pd.read_excel(
        rf"G:\My Drive\Uni\Thesis\Data\datasets_cutoff\final_results\{filename}.xlsx",
        sheet_name="below_not_in_above"
    )
    for filename in filenames2
}

full_data_dict = {**data_dict_tes, **data_dict_lmcd1}
# Now, data_dict will have the file names as keys and the corresponding DataFrames as values

In [58]:
def get_all_existing_accessions(data_dict):
    """Extract all unique UniProt accessions from all datasets inside full_data_dict."""
    all_accessions = set()
    for main_key, sub_dict in data_dict.items():
        if isinstance(sub_dict, dict):
            for sub_key, df in sub_dict.items():
                if isinstance(df, pd.DataFrame) and 'accession' in df.columns:
                    all_accessions.update(df['accession'].dropna().unique())
    return all_accessions

# Call this at the top before running get_random_high_quality_proteins()
excluded_accessions = get_all_existing_accessions(full_data_dict)

In [None]:
import requests
import pandas as pd
import random
import time

# ---------------------- CONFIG ----------------------
n_proteins = 50
plddt_threshold = 85
min_fraction = 0.6
max_residues = 500
fetch_batch_size = 500
max_attempts = 10
sleep_time = 0.2

# ----------------- EXCLUSION HANDLER ----------------
def get_all_existing_accessions(data_dict):
    all_accessions = set()
    for main_key, sub_dict in data_dict.items():
        if isinstance(sub_dict, dict):
            for sub_key, df in sub_dict.items():
                if isinstance(df, pd.DataFrame) and 'accession' in df.columns:
                    all_accessions.update(df['accession'].dropna().unique())
    return all_accessions

# Replace with: excluded_accessions = get_all_existing_accessions(full_data_dict)
excluded_accessions = set()

# ------------------ UNIPROT FETCH -------------------
def get_uniprot_accessions(limit=500):
    url = "https://rest.uniprot.org/uniprotkb/search"
    query = "organism_id:9606 AND reviewed:true AND database:pdb AND existence:1"
    params = {"query": query, "format": "json", "size": limit}
    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    return [entry["primaryAccession"] for entry in data.get("results", [])]

# ---------------- ALPHAFOLD STRUCTURE ANALYSIS ----------------
def fetch_alphafold_pdb(accession):
    url = f"https://alphafold.ebi.ac.uk/files/AF-{accession}-F1-model_v4.pdb"
    response = requests.get(url)
    return response.text if response.status_code == 200 else None

def extract_plddt_and_residue_count(pdb_text):
    plddt_scores = []
    residues = set()
    for line in pdb_text.splitlines():
        if line.startswith("ATOM"):
            try:
                b_factor = float(line[60:66].strip())
                residue_index = int(line[22:26].strip())
                plddt_scores.append(b_factor)
                residues.add(residue_index)
            except ValueError:
                continue
    return plddt_scores, len(residues)

def evaluate_structure(scores, threshold=plddt_threshold, min_fraction=min_fraction):
    if not scores:
        return False, 0.0
    high = sum(score >= threshold for score in scores)
    return (high / len(scores)) >= min_fraction, high / len(scores)

# ------------------- MAIN LOOP ----------------------
def get_filtered_uniprots(n=50, exclude=None):
    exclude = exclude or set()
    final = []
    candidates = list(set(get_uniprot_accessions(fetch_batch_size)) - exclude)
    random.shuffle(candidates)
    attempts = 0

    for accession in candidates:
        if len(final) >= n or attempts > max_attempts * n:
            break

        pdb_text = fetch_alphafold_pdb(accession)
        if pdb_text:
            scores, res_count = extract_plddt_and_residue_count(pdb_text)
            if res_count > max_residues:
                continue
            avg_plddt = sum(scores)/len(scores) if scores else None
            is_good, fraction_high = evaluate_structure(scores)
            if is_good:
                final.append({
                    "accession": accession,
                    "avg_plddt": round(avg_plddt, 2),
                    "pct_above_threshold": round(fraction_high * 100, 2),
                    "residue_count": res_count
                })
        attempts += 1
        time.sleep(sleep_time)

    df = pd.DataFrame(final)
    if len(df) < n:
        print(f"⚠️ Only {len(df)} proteins found that match all criteria.")
    return df



In [68]:
# ------------------ EXECUTE -------------------------
final_df = get_filtered_uniprots(n=n_proteins, exclude=excluded_accessions)
print(final_df)


   accession  avg_plddt  pct_above_threshold  residue_count
0     Q86V25      81.14                70.47            355
1     P23435      81.03                62.06            193
2     Q86Y78      86.44                64.74            171
3     Q99616      85.95                61.28             98
4     Q8N8Q3      90.83                87.44            282
5     Q9NWW9      78.99                60.47            162
6     P0C7P0      87.00                72.72            127
7     Q8WXC3      89.11                91.38             89
8     Q8IVL8      88.38                82.19            374
9     Q6UXV0      76.84                61.15            394
10    Q8N300      85.88                68.44             66
11    P62945      94.81                92.08             25
12    Q9BXJ8      89.65                79.45            343
13    P01850      94.02                90.75            176
14    P0DN84      85.41                64.68             35
15    Q9BW66      87.89                7

In [69]:
save_path = r"G:\My Drive\Uni\Thesis\Data"
final_df.to_excel(os.path.join(save_path, "control_proteins.xlsx"), index=False)