#The purpose of this file is to preprocess/clean the dataset and fetch the full-length wild-type and mutated protein sequences

In [None]:
# pandas is imported here because this is the first cell that uses `pd`; without it a
# fresh kernel (Restart & Run All) fails on pd.read_csv with a NameError.
import pandas as pd

# Load the variant table (CSV) into a DataFrame.
file_path = '/content/mutated_domainome_merged_filtered_all_VEPs_final.csv'
df = pd.read_csv(file_path)

In [None]:
# Display the column names of the loaded table.
df.columns

Index(['variant_ID', 'uniprotID_mutation', 'uniprot_ID', 'aa_seq', 'wt_seq',
       'dom_ID', 'WT', 'wt_aa', 'mut_aa', 'indel', 'STOP', 'STOP_readthrough',
       'count_e1_s0', 'count_e2_s0', 'count_e3_s0', 'count_e1_s1',
       'count_e2_s1', 'count_e3_s1', 'mean_count', 'fitness1_uncorr',
       'fitness2_uncorr', 'fitness3_uncorr', 'sigma1_uncorr', 'sigma2_uncorr',
       'sigma3_uncorr', 'fitness', 'sigma', 'growthrate1', 'growthrate1_sigma',
       'growthrate2', 'growthrate2_sigma', 'growthrate3', 'growthrate3_sigma',
       'growthrate', 'growthrate_sigma', 'library', 'missing', 'dead',
       'PFAM_ID', 'core', 'wt_gr', 'delta_gr', 'dead_gr', 'scaled_gr',
       'scaled_gr_sigma', 'proline', 'mean_esm1v_prediction', 'RaSP_score',
       'ddmut_ddG', 'rsasa_all', 'thermoMPNN_ddG', 'mean_esm1v_prediction_fl',
       'pos_in_uniprot', 'AlphaMissense_fitness', 'AlphaMissense_class',
       'EVE_new', 'popEVE_new', 'Tranception_new', 'column_coverage_new',
       'Entry', 'Entry Na

In [None]:
# Group the variants by domain ID and count the rows in each group,
# ordered from the largest domain to the smallest.
domain_groups = df.groupby('dom_ID')
domain_sizes = (
    domain_groups
    .size()
    .sort_values(ascending=False)
)

print("Domain sizes:", domain_sizes, sep="\n")

Domain sizes:
dom_ID
O95714_PF11515_2555    2205
Q14160_PF00595_857     1941
O75970_PF00595_693     1921
Q9UDY2_PF00595_26      1921
Q13163_PF00564_15      1901
                       ... 
Q8WU90_PF18044_104      421
Q9H9D4_PF00096_383      421
Q9H000_PF00642_37       381
O60293_PF10650_1186     381
Q9H0A6_PF00612_189      381
Length: 522, dtype: int64


#Fetch Full Protein Sequences from the Uniprot API. Without Isoform

In [None]:
import pandas as pd
import requests
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Read the table that carries the UniProt IDs.
file_path1 = '/content/mutated_domainome_merged_filtered_all_VEPs_final.csv'  # Replace with your actual dataset file path
df1 = pd.read_csv(file_path1)

# Columns carried forward into the sequence-annotated dataset.
columns_to_extract = [
    'variant_ID',
    'uniprotID_mutation',
    'uniprot_ID',
    'aa_seq',
    'wt_seq',
    'wt_aa',
    'mut_aa',
    'STOP',
    'PFAM_ID',
    'pos_in_uniprot',
    'Gene Names (primary)',
    'Gene Names (synonym)',
    'dom_ID',
    'growthrate',
]
df_selected1 = df1[columns_to_extract]

# One request per ID: keep each non-null UniProt ID once.
unique_uniprot_ids = df1['uniprot_ID'].dropna().drop_duplicates()

# Session that retries (up to 5 times, with back-off) on the listed HTTP status codes.
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
for scheme in ("https://", "http://"):
    http.mount(scheme, adapter)

@lru_cache(maxsize=5000)
def fetch_protein_sequence(uniprot_id):
    """Download the FASTA record for ``uniprot_id`` and return its sequence as one string.

    Args:
        uniprot_id: Identifier inserted into the UniProt FASTA URL.

    Returns:
        On HTTP 200, the record with its first (header) line removed and the remaining
        lines joined without line breaks. On failure a sentinel string is returned
        instead of a sequence: ``"Not Found"`` for HTTP 404, ``"Error <status>"`` for
        any other status, and ``"Request Failed: <reason>"`` when the request raises.

    Note:
        Results, including the failure sentinels, are memoised by ``lru_cache``.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    try:
        response = http.get(url, timeout=10)
        if response.status_code == 200:
            # splitlines() handles both "\n" and "\r\n" endings; stripping each line keeps
            # stray carriage returns/whitespace out of the sequence, where they would
            # shift every downstream residue position.
            fasta_lines = response.text.splitlines()
            return ''.join(line.strip() for line in fasta_lines[1:])
        elif response.status_code == 404:
            print(f"UniProt ID {uniprot_id} not found (404).")
            return "Not Found"
        else:
            print(f"UniProt ID {uniprot_id} returned error {response.status_code}.")
            return f"Error {response.status_code}"
    except requests.exceptions.RequestException as e:
        print(f"Request failed for UniProt ID {uniprot_id}: {str(e)}")
        return f"Request Failed: {str(e)}"

def process_uniprot_id(uniprot_id, results):
    """Look up the sequence for ``uniprot_id`` and record it in ``results`` under that ID.

    Whatever ``fetch_protein_sequence`` returns (a sequence or one of its failure
    strings) is stored as the value.
    """
    results[uniprot_id] = fetch_protein_sequence(uniprot_id)

# Fetch sequences using multithreading; each worker writes its result into `results`.
results = {}
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_uniprot_id, uniprot_id, results): uniprot_id for uniprot_id in unique_uniprot_ids}
    for future in as_completed(futures):
        future.result()  # re-raises any exception that occurred inside the worker

# Create a DataFrame from the results (one row per UniProt ID)
df_sequences1 = pd.DataFrame(list(results.items()), columns=['uniprot_ID', 'Protein Sequence'])

# Merge with selected dataset based on 'uniprot_ID'.
# validate='many_to_one' makes the merge fail loudly if an ID ever appears twice in
# df_sequences1, which would otherwise silently duplicate variant rows.
df_final1 = df_selected1.merge(df_sequences1, on='uniprot_ID', how='left', validate='many_to_one')

# Save UniProt IDs with missing sequences for debugging.
# fetch_protein_sequence reports failures as strings ("Not Found", "Error <status>",
# "Request Failed: ..."), so every value that is not a plain sequence counts as missing,
# not only "Not Found"/empty values.
# Assumes real sequences consist solely of uppercase letters - TODO confirm.
is_valid_sequence = df_sequences1['Protein Sequence'].str.fullmatch(r'[A-Z]+', na=False)
df_missing1 = df_sequences1[~is_valid_sequence]
df_missing1.to_csv('missing_uniprot_ids.csv', index=False)
print("Saved missing UniProt IDs to missing_uniprot_ids.csv")

# Save the final merged dataset
df_final1.to_csv('final_dataset_with_sequences.csv', index=False)
print("Final dataset with protein sequences saved successfully.")

UniProt ID EHEE-rd1-0882 returned error 400.
UniProt ID HHH-rd1-0142 returned error 400.
UniProt ID EEHEE-rd3-0037 returned error 400.
Saved missing UniProt IDs to missing_uniprot_ids.csv
Final dataset with protein sequences saved successfully.


#Fetch Full Protein Sequences from the Uniprot API. With Isoform. We run this one to fetch correct isoforms for P57071 and P20929.

In [None]:
import pandas as pd
import requests
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Read the table that carries the UniProt IDs.
file_path1 = '/content/mutated_domainome_merged_filtered_all_VEPs_final.csv'
df1 = pd.read_csv(file_path1)

# Columns carried forward into the sequence-annotated dataset.
columns_to_extract = [
    'variant_ID',
    'uniprotID_mutation',
    'uniprot_ID',
    'aa_seq',
    'wt_seq',
    'wt_aa',
    'mut_aa',
    'STOP',
    'PFAM_ID',
    'pos_in_uniprot',
    'Gene Names (primary)',
    'Gene Names (synonym)',
    'dom_ID',
    'growthrate',
    'core',
    'proline',
    'Organism',
]
# Explicit copy so the ID rewrite below modifies this frame only, not df1.
df_selected1 = df1[columns_to_extract].copy()

# Rewrite the two IDs to their "-1" isoform form BEFORE the unique IDs are collected,
# so the isoform ID is the one that gets requested and merged on.
isoform_mapping = {
    'P57071': 'P57071-1',
    'P20929': 'P20929-1',
}
df_selected1['uniprot_ID'] = df_selected1['uniprot_ID'].replace(isoform_mapping)

# One request per ID: keep each non-null (already remapped) UniProt ID once.
unique_uniprot_ids = df_selected1['uniprot_ID'].dropna().drop_duplicates()

# Session that retries (up to 5 times, with back-off) on the listed HTTP status codes.
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
for scheme in ("https://", "http://"):
    http.mount(scheme, adapter)

@lru_cache(maxsize=5000)
def fetch_protein_sequence(uniprot_id):
    """Download the FASTA record for ``uniprot_id`` and return its sequence as one string.

    IDs that are values of ``isoform_mapping`` (the explicitly requested isoforms) are
    announced with a log line before the request is made.

    Args:
        uniprot_id: Identifier inserted into the UniProt FASTA URL.

    Returns:
        On HTTP 200, the record with its first (header) line removed and the remaining
        lines joined without line breaks. On failure a sentinel string is returned
        instead of a sequence: ``"Not Found"`` for HTTP 404, ``"Error <status>"`` for
        any other status, and ``"Request Failed: <reason>"`` when the request raises.

    Note:
        Results, including the failure sentinels, are memoised by ``lru_cache``.
    """
    # Log when one of the explicitly mapped isoform IDs is being requested.
    if uniprot_id in isoform_mapping.values():
        print(f"🔍 Fetching correct isoform: {uniprot_id}")

    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"

    try:
        response = http.get(url, timeout=10)

        if response.status_code == 200:
            # splitlines() handles both "\n" and "\r\n" endings; stripping each line keeps
            # stray carriage returns/whitespace out of the sequence, where they would
            # shift every downstream residue position.
            fasta_lines = response.text.splitlines()
            return ''.join(line.strip() for line in fasta_lines[1:])  # Remove header and return sequence
        elif response.status_code == 404:
            print(f"UniProt ID {uniprot_id} not found (404).")
            return "Not Found"
        else:
            print(f"UniProt ID {uniprot_id} returned error {response.status_code}.")
            return f"Error {response.status_code}"

    except requests.exceptions.RequestException as e:
        print(f"Request failed for UniProt ID {uniprot_id}: {str(e)}")
        return f"Request Failed: {str(e)}"

def process_uniprot_id(uniprot_id, results):
    """Look up the sequence for ``uniprot_id`` and record it in ``results`` under that ID.

    Whatever ``fetch_protein_sequence`` returns (a sequence or one of its failure
    strings) is stored as the value.
    """
    results[uniprot_id] = fetch_protein_sequence(uniprot_id)

# Fetch sequences using multithreading; each worker writes its result into `results`.
results = {}
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_uniprot_id, uniprot_id, results): uniprot_id for uniprot_id in unique_uniprot_ids}
    for future in as_completed(futures):
        future.result()  # re-raises any exception that occurred inside the worker

# Create DataFrame from results (one row per UniProt ID)
df_sequences1 = pd.DataFrame(list(results.items()), columns=['uniprot_ID', 'Protein Sequence'])

# Merge with selected dataset based on 'uniprot_ID'.
# validate='many_to_one' makes the merge fail loudly if an ID ever appears twice in
# df_sequences1, which would otherwise silently duplicate variant rows.
df_final1 = df_selected1.merge(df_sequences1, on='uniprot_ID', how='left', validate='many_to_one')

# Save UniProt IDs with missing sequences for debugging.
# fetch_protein_sequence reports failures as strings ("Not Found", "Error <status>",
# "Request Failed: ..."), so every value that is not a plain sequence counts as missing,
# not only "Not Found"/empty values.
# Assumes real sequences consist solely of uppercase letters - TODO confirm.
is_valid_sequence = df_sequences1['Protein Sequence'].str.fullmatch(r'[A-Z]+', na=False)
df_missing1 = df_sequences1[~is_valid_sequence]
df_missing1.to_csv('missing_uniprot_ids.csv', index=False)
print("Saved missing UniProt IDs to missing_uniprot_ids.csv")

# Save the final merged dataset
df_final1.to_csv('final_dataset_with_sequences.csv', index=False)
print("Final dataset with protein sequences saved successfully.")

# Verify correct sequences were fetched: pull out the rows of the two remapped isoform IDs
df_check = df_final1[df_final1['uniprot_ID'].isin(['P57071-1', 'P20929-1'])]
print("\n🔍 Checking fetched sequences for P57071-1 and P20929-1:")
print(df_check[['uniprot_ID', 'Protein Sequence']].head(10))  # Display results

# Save verification results
df_check.to_csv("isoform_verification.csv", index=False)
print("\nSaved filtered results to 'isoform_verification.csv'. You can download and check manually.")

⚠️ UniProt ID HHH-rd1-0142 returned error 400.
⚠️ UniProt ID EHEE-rd1-0882 returned error 400.
⚠️ UniProt ID EEHEE-rd3-0037 returned error 400.
🔍 Fetching correct isoform: P20929-1
🔍 Fetching correct isoform: P57071-1
Saved missing UniProt IDs to missing_uniprot_ids.csv
✅ Final dataset with protein sequences saved successfully.

🔍 Checking fetched sequences for P57071-1 and P20929-1:
       uniprot_ID                                   Protein Sequence
183152   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183153   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183154   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183155   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183156   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183157   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183158   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183159   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDY

In [None]:
# Select every row of the two proteins, whether stored under the plain ID or the "-1" isoform ID.
ids_to_check = ['P57071', 'P57071-1', 'P20929', 'P20929-1']
is_checked_id = df_final1['uniprot_ID'].isin(ids_to_check)
isoform_check = df_final1.loc[is_checked_id]

# Show the ID and sequence columns for verification
print("\nChecking sequences for P57071-1 and P20929-1:\n")
print(isoform_check.loc[:, ['uniprot_ID', 'Protein Sequence']])


🔍 Checking sequences for P57071-1 and P20929-1:

       uniprot_ID                                   Protein Sequence
183152   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183153   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183154   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183155   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
183156   P20929-1  MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...
...           ...                                                ...
255119   P57071-1  MPRRRPPASGAAQFPERIATRSPDPIPLCTFQRQPRAAPVQPPCRL...
255120   P57071-1  MPRRRPPASGAAQFPERIATRSPDPIPLCTFQRQPRAAPVQPPCRL...
255121   P57071-1  MPRRRPPASGAAQFPERIATRSPDPIPLCTFQRQPRAAPVQPPCRL...
255122   P57071-1  MPRRRPPASGAAQFPERIATRSPDPIPLCTFQRQPRAAPVQPPCRL...
255123   P57071-1  MPRRRPPASGAAQFPERIATRSPDPIPLCTFQRQPRAAPVQPPCRL...

[1662 rows x 2 columns]




#Remove invalid entries, null entries and STOP variants

In [None]:
import pandas as pd

# Load the final dataset
#file_path = '/content/final_dataset_with_sequences.csv'
#df = pd.read_csv(file_path)

# Remove rows where STOP is True or missing
df_non_stop = df_final1.query('STOP != True and STOP.notna()')

# Keep only rows whose 'Protein Sequence' is an actual sequence.
# fetch_protein_sequence stores failure strings in this column ("Not Found",
# "Error <status>" for any non-200/404 status, "Request Failed: ..."), so comparing
# against 'Not Found' and 'Error 400' alone would let e.g. "Error 500" or
# "Request Failed: ..." through as if they were sequences. Validating positively also
# drops NaN and empty values.
# Assumes real sequences consist solely of uppercase letters - TODO confirm.
has_valid_sequence = df_non_stop['Protein Sequence'].str.fullmatch(r'[A-Z]+', na=False)
df_filtered = df_non_stop[has_valid_sequence]

print(f"Dropped {len(df_non_stop) - len(df_filtered)} non-STOP rows without a valid protein sequence.")

# Save the cleaned dataset
df_filtered.to_csv('filtered_final_dataset.csv', index=False)

print("Filtered dataset saved successfully as 'filtered_final_dataset.csv'.")



Filtered dataset saved successfully as 'filtered_final_dataset.csv'.


In [None]:
# Preview the first rows of the filtered dataset.
df_filtered.head()

Unnamed: 0,variant_ID,uniprotID_mutation,uniprot_ID,aa_seq,wt_seq,wt_aa,mut_aa,STOP,PFAM_ID,pos_in_uniprot,Gene Names (primary),Gene Names (synonym),dom_ID,growthrate,Protein Sequence
1402,A0PJY2_PF00096_289_A12C,A0PJY2_A300C,A0PJY2,VCKVCGKGFRQCSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,C,False,PF00096,300.0,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.085555,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1403,A0PJY2_PF00096_289_A12D,A0PJY2_A300D,A0PJY2,VCKVCGKGFRQDSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,D,False,PF00096,300.0,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.092028,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1404,A0PJY2_PF00096_289_A12E,A0PJY2_A300E,A0PJY2,VCKVCGKGFRQESTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,E,False,PF00096,300.0,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.10999,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1405,A0PJY2_PF00096_289_A12F,A0PJY2_A300F,A0PJY2,VCKVCGKGFRQFSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,F,False,PF00096,300.0,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.064676,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1406,A0PJY2_PF00096_289_A12G,A0PJY2_A300G,A0PJY2,VCKVCGKGFRQGSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,G,False,PF00096,300.0,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.076879,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...


In [None]:
# Sanity check: show any remaining rows whose 'Protein Sequence' contains the text 'Error'.
df_filtered[df_filtered['Protein Sequence'].str.contains('Error')]

Unnamed: 0,variant_ID,uniprotID_mutation,uniprot_ID,aa_seq,wt_seq,wt_aa,mut_aa,STOP,PFAM_ID,pos_in_uniprot,Gene Names (primary),Gene Names (synonym),dom_ID,growthrate,Protein Sequence


#Remove one UniProt ID (Q13472): after manually checking in UniProt, its mutation positions do not align with the UniProt protein sequence or with any of its isoforms

In [None]:
# Keep every row except those belonging to UniProt ID Q13472.
df_filtered = df_filtered.loc[df_filtered['uniprot_ID'].ne('Q13472')]

#Mutate the wild-type sequence of each entry using its mutant amino acid (mut_aa) and its position in the UniProt sequence (pos_in_uniprot)

In [None]:
import pandas as pd

# Load the dataset
#file_path = '/content/filtered_final_dataset.csv'
#df = pd.read_csv(file_path)


def mutate_protein_sequence(protein_seq, position, wt_aa, mut_aa, uniprot_id):
    """
    Apply a single amino-acid substitution to a protein sequence.

    ``position`` is 1-based. The substitution is made only when the residue found at
    that position equals ``wt_aa``; the sequence with ``mut_aa`` in its place is then
    returned. In every other case (a missing input, a position outside the sequence,
    a wild-type mismatch, or any exception while processing) the original
    ``protein_seq`` is returned unchanged, and all but the missing-input case print
    a message that includes ``uniprot_id``.
    """
    # A missing value in any required field means there is nothing to mutate.
    required_fields = (protein_seq, position, wt_aa, mut_aa)
    if any(pd.isna(field) for field in required_fields):
        return protein_seq

    try:
        # Accepts values such as 300, 300.0 or "300"; convert to a 0-based index.
        idx = int(float(position)) - 1

        # Guard: the index has to fall inside the sequence.
        if not (0 <= idx < len(protein_seq)):
            print(f"UniProt ID {uniprot_id}: Position {position} is out of bounds for sequence of length {len(protein_seq)}")
            return protein_seq

        # Guard: the residue actually present must be the expected wild-type one.
        found_aa = protein_seq[idx]
        if found_aa != wt_aa:
            print(f"UniProt ID {uniprot_id}: Mismatch at position {position}: Expected '{wt_aa}', found '{found_aa}'")
            return protein_seq

        prefix, suffix = protein_seq[:idx], protein_seq[idx + 1:]
        return prefix + mut_aa + suffix

    except Exception as e:
        print(f"Error processing UniProt ID {uniprot_id}, Position {position}: {str(e)}")
        return protein_seq

# Work on an explicit copy: df_filtered was produced by boolean indexing, so assigning
# columns to it directly can raise SettingWithCopyWarning.
df_filtered = df_filtered.copy()

# Convert 'pos_in_uniprot' to a nullable integer column (e.g. 300.0 -> 300)
df_filtered['pos_in_uniprot'] = df_filtered['pos_in_uniprot'].astype(float).astype('Int64')

# Apply mutation to each row
df_filtered['Mutated_Protein_Sequence'] = df_filtered.apply(
    lambda row: mutate_protein_sequence(row['Protein Sequence'], row['pos_in_uniprot'], row['wt_aa'], row['mut_aa'], row['uniprot_ID']),
    axis=1
)

# mutate_protein_sequence returns the input sequence unchanged when it cannot apply the
# substitution, so such rows are indistinguishable from the wild type in the saved file.
# Report how many rows ended up identical to the wild-type sequence so they are not
# overlooked (rows with wt_aa == mut_aa, if any, are counted here too).
unmutated_count = int((df_filtered['Mutated_Protein_Sequence'] == df_filtered['Protein Sequence']).sum())
print(f"Rows whose mutated sequence is identical to the wild-type sequence: {unmutated_count}")

# Save the updated dataset
df_filtered.to_csv('mutated_final_dataset.csv', index=False)

print("Mutated dataset saved successfully as 'mutated_final_dataset.csv'.")

Mutated dataset saved successfully as 'mutated_final_dataset.csv'.


In [None]:
# Preview the first rows, now including the 'Mutated_Protein_Sequence' column.
df_filtered.head()

Unnamed: 0,variant_ID,uniprotID_mutation,uniprot_ID,aa_seq,wt_seq,wt_aa,mut_aa,STOP,PFAM_ID,pos_in_uniprot,Gene Names (primary),Gene Names (synonym),dom_ID,growthrate,Protein Sequence,Mutated_Protein_Sequence
1402,A0PJY2_PF00096_289_A12C,A0PJY2_A300C,A0PJY2,VCKVCGKGFRQCSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,C,False,PF00096,300,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.085555,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1403,A0PJY2_PF00096_289_A12D,A0PJY2_A300D,A0PJY2,VCKVCGKGFRQDSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,D,False,PF00096,300,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.092028,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1404,A0PJY2_PF00096_289_A12E,A0PJY2_A300E,A0PJY2,VCKVCGKGFRQESTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,E,False,PF00096,300,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.10999,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1405,A0PJY2_PF00096_289_A12F,A0PJY2_A300F,A0PJY2,VCKVCGKGFRQFSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,F,False,PF00096,300,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.064676,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
1406,A0PJY2_PF00096_289_A12G,A0PJY2_A300G,A0PJY2,VCKVCGKGFRQGSTLCRHKIIH,VCKVCGKGFRQASTLCRHKIIH,A,G,False,PF00096,300,FEZF1,FEZ ZNF312B,A0PJY2_PF00096_289,0.076879,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...,MDSSCHNATTKMLATAPARGNMMSTSKPLAFSIERIMARTPEPKAL...
