In [1]:
import pandas as pd
import glob
import os
# Specify the file path
path = './data/hmmvsuniprot/expanded_240904/'
# Get a list of all .txt files in the folder
txt_files = glob.glob(os.path.join(path, '*.txt'))

# Define column names and their widths
col_names = ['target_name', 'accession_1', 'query_name', 'accession_2', 
             'E-value_1', 'score_1', 'bias_1', 'E-value_2', 'score_2', 
             'bias_2', 'exp', 'reg', 'clu', 'ov', 'env', 'dom', 'rep', 
             'inc', 'description']

col_widths = [31, 11, 21, 11, 9, 7, 7, 9, 7, 7, 5, 5, 4, 4, 4, 4, 4, 4, 100]

concatenated_df = pd.DataFrame()
for file_path in txt_files:
# Read the file into a DataFrame
    df = pd.read_fwf(file_path, widths=col_widths, names=col_names, skiprows=5,skipfooter=10)
    df = df[df['E-value_1'] < 10e-10]
    df = df[df['E-value_2'] < 10e-10]
    concatenated_df = pd.concat([df, concatenated_df], ignore_index=True)


concatenated_df['TC'] = 'TC' + concatenated_df['query_name'].str.split('_').str[1]

# Sort the DataFrame based on 'E-value_1' column in ascending order
df_sorted = concatenated_df.sort_values(by='E-value_1')

# Keep the row with the smallest 'E-value_1'
df_unique = df_sorted.drop_duplicates(subset='target_name', keep='first')

sum(df_unique['query_name'].value_counts()/16345 )

0.23738146222086268

In [3]:
#df_sorted = df_sorted.groupby('target_name').apply(sum_encoded_TC)
df_unique['accession_code'] = df_unique['target_name'].str.split('|').str.get(1)
df_sorted['accession_code'] = df_sorted['target_name'].str.split('|').str.get(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['accession_code'] = df_unique['target_name'].str.split('|').str.get(1)


In [4]:
import requests
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Setup a retry strategy to avoid rate limiting and network issues
retry_strategy = Retry(
    total=5,  # Total retries before giving up
    backoff_factor=1,  # Wait 1, 2, 4, 8, etc. seconds between retries
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP statuses
    method_whitelist=["HEAD", "GET", "OPTIONS"]  # Retry on these HTTP methods
)

# Mount retry strategy to requests session
adapter = HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
session.mount("https://", adapter)

# Function to find subfamily (with retry handling)
def find_subfamily(uniprot_id):
    fasta_url = f'https://rest.uniprot.org/uniprotkb/{uniprot_id}.txt'
    
    try:
        response = session.get(fasta_url, timeout=1000)  # Use session with retry
        response.raise_for_status()  # Raise an exception for bad status codes
        text = response.text  # Get the text content from the response
    except requests.exceptions.Timeout:
        print(f"Timeout error: Unable to retrieve information from {fasta_url}")
        text = None
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        text = None
    except requests.exceptions.ConnectionError as err:
        print(f"Connection error: {err}")
        text = None
    
    if text:
        # Define the pattern to extract the taxonomy information
        taxonomy_pattern = r'OC\s+([\w\s;]+)\.'
    
        # Find all matches of the pattern in the text
        taxonomy_matches = re.findall(taxonomy_pattern, text)
        
        # Check if matches were found
        if taxonomy_matches:
            # Combine multiple lines into a single string and split by semicolon
            taxonomy_string = ' '.join(taxonomy_matches)
            taxonomy_levels = [level.strip() for level in taxonomy_string.split(';')]
    
            # Iterate through the list and find the string containing 'virinae'
            desired_string = None
            for level in taxonomy_levels:
                if 'viridae' in level:
                    desired_string = level.replace('OC', '').replace(' ', '')
                    break  # Stop searching once found
                elif 'virinae' in level:
                    desired_string = level.replace('OC', '').replace(' ', '')
                    break  # Stop searching once found
                    
            return desired_string if desired_string else "unclassified"
    
    return "unclassified"
    

# Function to process each accession code in parallel
def process_accession_code(accession_code):
    return accession_code, find_subfamily(accession_code)

# List to store subfamilies
subfamilies_sorted = {}

# Number of workers (threads)
num_workers = 12  # You can adjust this based on the number of CPUs/threads available

# Counter to track the progress
counter = 0

# Use ThreadPoolExecutor for parallel execution
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Submit tasks for each accession code
    futures = {executor.submit(process_accession_code, acc): acc for acc in df_sorted['accession_code']}
    
    # Process completed futures as they finish
    for future in as_completed(futures):
        accession_code, subfam = future.result()
        subfamilies_sorted[accession_code] = subfam
        
        # Increment the counter
        counter += 1
        
        # Print a progress message every 100 processed accession codes
        if counter % 100 == 0:
            print(f"Processed {counter} accession codes so far...")





  retry_strategy = Retry(


Processed 100 accession codes so far...
Processed 200 accession codes so far...
Processed 300 accession codes so far...
Processed 400 accession codes so far...
Processed 500 accession codes so far...
Processed 600 accession codes so far...
Processed 700 accession codes so far...
Processed 800 accession codes so far...
Processed 900 accession codes so far...
Processed 1000 accession codes so far...
Processed 1100 accession codes so far...
Processed 1200 accession codes so far...
Processed 1300 accession codes so far...
Processed 1400 accession codes so far...
Processed 1500 accession codes so far...
Processed 1600 accession codes so far...
Processed 1700 accession codes so far...
Processed 1800 accession codes so far...
Processed 1900 accession codes so far...
Processed 2000 accession codes so far...
Processed 2100 accession codes so far...
Processed 2200 accession codes so far...
Processed 2300 accession codes so far...
Processed 2400 accession codes so far...
Processed 2500 accession 

In [5]:
df_unique = df_sorted.drop_duplicates(subset='target_name', keep='first')
#df_unique['subfamilies'] = subfamilies
df_sorted['subfamilies'] = df_sorted['accession_code'].map(subfamilies_sorted)
df_unique['subfamilies'] = df_unique['accession_code'].map(subfamilies_sorted)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['subfamilies'] = df_unique['accession_code'].map(subfamilies_sorted)


In [7]:
df_filtered = df_sorted[df_sorted['E-value_1']<1.0e-10]
df_filtered['cluster']=df_filtered['TC'].str.replace('TC','').astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['cluster']=df_filtered['TC'].str.replace('TC','').astype(int)


In [12]:
column_names = [
    "Protein_ID", "MD5", "Length", "Database", "Signature", "Signature_Desc", 
    "Start", "End", "Score", "Status", "Date", "InterPro_ID", 
    "InterPro_Desc", "GO_Terms", "Pathways"
]
df_interpro = pd.read_csv('./data/interpro/output_file_interpro.tsv', sep='\t', names=column_names, header=None)
df_pfam = df_interpro[df_interpro["Database"].isin(['Gene3D','Pfam','CDD','FunFam'])]
df_pfam  = df_pfam [df_pfam ['Score'].astype(float) < 10e-5]



df_pfam['Extracted'] = df_pfam['Protein_ID'].str.extract(r'^..([a-z0-9]+)')[0].str.upper()
def remove_duplicate_string(s):
    pattern = r'(.+?)\1+$'  # Captures repeated patterns
    match = re.match(pattern, s)
    return match.group(1) if match else s

# Apply the function to the 'Extracted' column
df_pfam ['accession_code'] = df_pfam['Extracted'].apply(remove_duplicate_string)

#mapping = dict(zip(df_cleaned['Protein_base'], df_cleaned['fold']))
#df_pfam['Protein_ID_2'] = df_pfam['Protein_ID'].str.split('_').str[0]
#df_pfam['fold']=df_pfam['Protein_ID_2'].map(mapping).fillna('unknown')
#df_pfam

In [13]:
# Merge with full outer join
df_merged_interpro = pd.merge(df_filtered, df_pfam, on='accession_code', how='outer')
# Filter rows where 'target_name' is NaN
df_merged_interpro = df_merged_interpro[df_merged_interpro['target_name'].notna()]

In [14]:
df_merged_interpro.columns

Index(['target_name', 'accession_1', 'query_name', 'accession_2', 'E-value_1',
       'score_1', 'bias_1', 'E-value_2', 'score_2', 'bias_2', 'exp', 'reg',
       'clu', 'ov', 'env', 'dom', 'rep', 'inc', 'description', 'TC',
       'accession_code', 'subfamilies', 'cluster', 'Protein_ID', 'MD5',
       'Length', 'Database', 'Signature', 'Signature_Desc', 'Start', 'End',
       'Score', 'Status', 'Date', 'InterPro_ID', 'InterPro_Desc', 'GO_Terms',
       'Pathways', 'Extracted'],
      dtype='object')

In [15]:
import pandas as pd

# Load your existing DataFrame (assuming it's already loaded)
selected_columns = ['TC', 'accession_code', 'description', 'subfamilies', 'E-value_1', 'Database', 'Signature', 'Signature_Desc', 'Start', 'End',
       'Score', 'InterPro_ID', ]

# Create a new DataFrame with only the selected columns and rename them in one step
df_selected = df_merged_interpro[selected_columns].rename(columns={
    'accession_code': 'Accession Code',
    'subfamilies': 'Family/Subfamily',
    'E-value_1': 'E-value (TC-HMM)',
    'Score': 'E-value (InterPro)',
    'description_y': 'Protein Description',
    'Names': 'Phage'
})

# Display the new DataFrame
print(df_selected.head())  # Show the first few rows


df_selected.to_csv('./data/data_expanded_interpro.csv')

     TC Accession Code                                        description  \
0   TC3     A0A653FW15  Phage tail fiber protein OS=Escherichia phage ...   
1   TC4     A0A653FW15  Phage tail fiber protein OS=Escherichia phage ...   
2   TC9     A0A653FW15  Phage tail fiber protein OS=Escherichia phage ...   
3   TC0     A0A5Q2F504  Long tail fiber proximal subunit OS=Klebsiella...   
4  TC14     A0A5Q2F504  Long tail fiber proximal subunit OS=Klebsiella...   

  Family/Subfamily  E-value (TC-HMM) Database Signature Signature_Desc  Start  \
0     unclassified      0.000000e+00      NaN       NaN            NaN    NaN   
1     unclassified      0.000000e+00      NaN       NaN            NaN    NaN   
2     unclassified      1.500000e-31      NaN       NaN            NaN    NaN   
3    Straboviridae      0.000000e+00      NaN       NaN            NaN    NaN   
4    Straboviridae      6.600000e-12      NaN       NaN            NaN    NaN   

   End E-value (InterPro) InterPro_ID  
0  NaN    

In [16]:
#df_filtered = df_filtered[ df_sorted['subfamilies']!= 'unclassified']

In [31]:
#concatenated_df[concatenated_df['TC']=='TC16'].count()


In [20]:
import pandas as pd
import glob
import os
# Specify the file path
path = './data/hmmvsuniprot/domain_hmm/'
# Get a list of all .txt files in the folder
txt_files = glob.glob(os.path.join(path, '*.txt'))

# Define column names and their widths
col_names = ['target_name', 'accession_1', 'query_name', 'accession_2', 
             'E-value_1', 'score_1', 'bias_1', 'E-value_2', 'score_2', 
             'bias_2', 'exp', 'reg', 'clu', 'ov', 'env', 'dom', 'rep', 
             'inc', 'description']

col_widths = [31, 11, 21, 11, 9, 7, 7, 9, 7, 7, 5, 5, 4, 4, 4, 4, 4, 4, 100]

concatenated_df_dom = pd.DataFrame()
for file_path in txt_files:
# Read the file into a DataFrame
    df = pd.read_fwf(file_path, widths=col_widths, names=col_names, skiprows=5,skipfooter=10)
    concatenated_df_dom = pd.concat([df, concatenated_df_dom], ignore_index=True)

concatenated_df_dom['D_classes'] = concatenated_df_dom['query_name'].str.split('_').str[1]

# Sort the DataFrame based on 'E-value_1' column in ascending order
df_sorted_dom = concatenated_df_dom.sort_values(by='E-value_1')

# Keep the row with the smallest 'E-value_1'
df_unique_dom = df_sorted_dom.drop_duplicates(subset='target_name', keep='first')

# Display the DataFrame with unique target prefixes based on smallest 'E-value_1'
df_unique_dom = df_unique_dom[df_unique_dom['E-value_1']< 10e-10]
df_unique_dom['accession_code'] = df_unique_dom['target_name'].str.split('|').str.get(1)


In [21]:
merged_df = pd.merge(df_unique, df_unique_dom, on='accession_code', how='inner')

In [22]:
merged_df['host'] = merged_df['description_x'].str.split("OS=").str[1].str.split().str[0]

In [24]:
import requests
import re

def find_phage_name(uniprot_id):
    url = f'https://rest.uniprot.org/uniprotkb/{uniprot_id}.txt'
    
    try:
        response = requests.get(url, timeout=1000)
        response.raise_for_status()
        text = response.text
    except requests.exceptions.Timeout:
        print(f"Timeout error: Unable to retrieve information from {url}")
        return None
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        return None
    except requests.exceptions.ConnectionError as err:
        print(f"Connection error: {err}")
        return None
    
    if text:
        # Define the pattern to extract the line that starts with "OS"
        os_pattern = r'^OS\s+(.+)$'
        
        # Search for the line that starts with "OS" in the response text
        os_match = re.search(os_pattern, text, re.MULTILINE)
        
        if os_match:
            os_line = os_match.group(1)  # Extract the matched line content
            
            # Extract the phage name
            phage_name = os_line.split('(')[0].strip()
            return phage_name
    
    return None

# Example usage
uniprot_id = "A0A7S9XES8"  # Replace this with your desired UniProt ID
phage_name = find_phage_name(uniprot_id)
print("Phage Name:", phage_name)


Phage Name: Salmonella phage vB_SalM_ABTNLsp5.


In [25]:
from concurrent.futures import ThreadPoolExecutor, as_completed

phage_names = {}
counter = 0

def process_accession_code_name(accession_code):
    return accession_code, find_phage_name(accession_code)

# Specify the number of workers for parallel processing
num_workers = 12  # Adjust this based on your machine's capabilities

with ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Submit tasks for each accession code
    futures = {executor.submit(process_accession_code_name, acc): acc for acc in merged_df['accession_code']}
    
    # Process completed futures as they finish
    for future in as_completed(futures):
        accession_code, name = future.result()
        
        # Add the result to the dictionary
        phage_names[accession_code] = name
        
        # Increment the counter
        counter += 1
        
        # Print a progress message every 100 processed accession codes
        if counter % 100 == 0:
            print(f"Processed {counter} accession codes so far...")

# Now, phage_names is a dictionary mapping accession_code to phage_name
#print(phage_names)


# Assuming phage_names is your dictionary mapping accession codes to phage names
merged_df['Names'] = merged_df['accession_code'].map(phage_names)
merged_df['Names']=merged_df['Names'].fillna('noname')
# Display the dataframe to check if it was updated correctly
merged_df

Processed 100 accession codes so far...
Processed 200 accession codes so far...
Processed 300 accession codes so far...
Processed 400 accession codes so far...
Processed 500 accession codes so far...
Processed 600 accession codes so far...
Processed 700 accession codes so far...
Processed 800 accession codes so far...
Processed 900 accession codes so far...
Processed 1000 accession codes so far...
Processed 1100 accession codes so far...
Processed 1200 accession codes so far...
Processed 1300 accession codes so far...
Processed 1400 accession codes so far...
Processed 1500 accession codes so far...
Processed 1600 accession codes so far...
Processed 1700 accession codes so far...
Processed 1800 accession codes so far...
Processed 1900 accession codes so far...
Processed 2000 accession codes so far...
Processed 2100 accession codes so far...
Processed 2200 accession codes so far...
Processed 2300 accession codes so far...
Processed 2400 accession codes so far...
Processed 2500 accession 

Unnamed: 0,target_name_x,accession_1_x,query_name_x,accession_2_x,E-value_1_x,score_1_x,bias_1_x,E-value_2_x,score_2_x,bias_2_x,...,clu_y,ov_y,env_y,dom_y,rep_y,inc_y,description_y,D_classes,host,Names
0,tr|A0A653FW15|A0A653FW15_9CAUD,-,cluster_3,-,0.000000e+00,2706.5,20.9,0.000000e+00,2706.3,20.9,...,0,0,2,2,1,1,Phage tail fiber protein OS=Escherichia phage ...,D9,Escherichia,Escherichia phage Gluttony_ev152.
1,tr|A0A5Q2F504|A0A5Q2F504_9CAUD,-,cluster_0,-,0.000000e+00,1040.0,56.7,0.000000e+00,1033.8,56.7,...,2,3,5,5,5,4,Long tail fiber proximal subunit OS=Klebsiella...,D38,Klebsiella,Klebsiella phage JIPh_Kp122.
2,tr|A0A482N4I6|A0A482N4I6_9CAUD,-,cluster_4,-,0.000000e+00,1764.3,13.7,0.000000e+00,1764.1,13.7,...,0,0,3,3,1,1,Tail fiber protein OS=Escherichia phage vB_Eco...,D9,Escherichia,Escherichia phage vB_EcoS_WF5505.
3,tr|A0A0K1Y530|A0A0K1Y530_9CAUD,-,cluster_0,-,0.000000e+00,1042.3,55.0,7.900000e-298,990.7,42.0,...,2,3,5,5,5,4,Long-tail fiber proximal subunit domain-contai...,D38,Klebsiella,Klebsiella phage JD18.
4,tr|A0A7I8V5N5|A0A7I8V5N5_9CAUD,-,cluster_0,-,0.000000e+00,1042.3,56.9,0.000000e+00,1035.8,56.9,...,2,3,5,5,5,4,Long tail fiber proximal subunit OS=Klebsiella...,D38,Klebsiella,Klebsiella phage vB_KpnM_311F.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3722,tr|A0AAE7VAA9|A0AAE7VAA9_9CAUD,-,cluster_12,-,2.800000e-10,38.1,1.0,2.800000e-10,38.1,1.0,...,1,0,2,2,1,1,Tail fiber protein OS=Escherichia phage vB_Eco...,D7,Escherichia,Escherichia phage vB_EcoS_Sponge.
3723,tr|A0AAE7R9U8|A0AAE7R9U8_9CAUD,-,cluster_16,-,5.200000e-10,38.2,6.1,8.700000e-10,37.4,6.1,...,1,0,1,1,1,1,Bacteriophage T7 tail fibre protein-like N-ter...,D66,Synechococcus,Synechococcus phage S-SRP02.
3724,tr|A0A976SFM4|A0A976SFM4_9CAUD,-,cluster_1,-,5.300000e-10,38.7,0.0,7.500000e-10,38.2,0.0,...,0,0,1,1,1,1,Tail fiber protein OS=Vibrio phage VPMCC14 OX=...,D68,Vibrio,Vibrio phage VPMCC14.
3725,tr|U3TM45|U3TM45_9CAUD,-,cluster_0,-,6.100000e-10,37.0,3.1,6.100000e-10,37.0,3.1,...,0,0,2,2,2,1,Putative tail fiber protein OS=Ralstonia phage...,D40,Ralstonia,Ralstonia phage RSB3.


In [26]:
merged_df.columns

Index(['target_name_x', 'accession_1_x', 'query_name_x', 'accession_2_x',
       'E-value_1_x', 'score_1_x', 'bias_1_x', 'E-value_2_x', 'score_2_x',
       'bias_2_x', 'exp_x', 'reg_x', 'clu_x', 'ov_x', 'env_x', 'dom_x',
       'rep_x', 'inc_x', 'description_x', 'TC', 'accession_code',
       'subfamilies', 'target_name_y', 'accession_1_y', 'query_name_y',
       'accession_2_y', 'E-value_1_y', 'score_1_y', 'bias_1_y', 'E-value_2_y',
       'score_2_y', 'bias_2_y', 'exp_y', 'reg_y', 'clu_y', 'ov_y', 'env_y',
       'dom_y', 'rep_y', 'inc_y', 'description_y', 'D_classes', 'host',
       'Names'],
      dtype='object')

In [27]:
merged_df.to_csv('./data/atlas_tc_dclasses.csv')

In [29]:
import networkx as nx
import pandas as pd
import seaborn as sns
from ipysigma import Sigma
import numpy as np

# Assuming your dataframe is already loaded and named 'merged_df'
# Step 1: Create a graph object
G = nx.Graph()

# Generate the tab20 palette using Seaborn
palette = sns.color_palette("tab20", 20)  # Generate 20 distinct colors (for TC0 to TC17, plus two more)

# Create a dictionary to map TC categories to colors
tc_palette = {f'TC{i}': f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})' for i, (r, g, b) in enumerate(palette[:18])}

# Add colors for 'family' and 'domain' categories
tc_palette['family'] = f'rgb({int(palette[18][0]*255)}, {int(palette[18][1]*255)}, {int(palette[18][2]*255)})'
tc_palette['domain'] = f'rgb({int(palette[19][0]*255)}, {int(palette[19][1]*255)}, {int(palette[19][2]*255)})'

# Step 2: Add edges between TCs, subfamilies, and D_classes based on shared occurrences in rows
for index, row in merged_df.iterrows():
    tc = row['TC']
    subfamily = row['subfamilies']
    domains = row['D_classes']
    names = row['Names']
    evalue_tc = row['E-value_1_x']
    evalue_D = row['E-value_1_y']
    
    # Add nodes for TC, subfamily, domains, and names
    G.add_node(tc, type='TC', tc_category=tc)
    G.add_node(subfamily, type='subfamily or family', tc_category='family')
    G.add_node(domains, type='D_classes', tc_category='domain')
    G.add_node(names, type='Names', tc_category='Names')
    
    # Calculate weights for edges
    weight_tc_subfamily = len(merged_df[(merged_df['TC'] == tc) & (merged_df['subfamilies'] == subfamily)])
    weight_tc_domains = len(merged_df[(merged_df['TC'] == tc) & (merged_df['D_classes'] == domains)])
    weight_domains_subfamily = len(merged_df[(merged_df['D_classes'] == domains) & (merged_df['subfamilies'] == subfamily)])

    # Add edges with weights and evalues only where applicable
    G.add_edge(tc, subfamily, weight=weight_tc_subfamily)  # No evalue
    G.add_edge(tc, domains, weight=weight_tc_domains)       # No evalue
    G.add_edge(domains, subfamily, weight=weight_domains_subfamily)  # No evalue
    G.add_edge(subfamily, names)                            # No evalue
    G.add_edge(domains, names, evalue=evalue_D)             # Only add evalue where applicable
    G.add_edge(tc, names, evalue=evalue_tc)                 # Only add evalue where applicable

# Step 3: Assign degrees to nodes for size scaling
for node in G.nodes():
    G.nodes[node]['degree'] = G.degree(node)
    G.nodes[node]['color'] = tc_palette.get(G.nodes[node]['tc_category'], 'grey')  # Default to grey if no category color

# Displaying the graph with ipysigma using the custom palette
Sigma(
    G, 
    node_color="color",
    node_size="degree",
    node_size_range=(3, 20),
    edge_color='grey',
    edge_size='weight',
    edge_size_range=(1, 30),
    node_label='tc_category',
    default_node_color='grey',
    edge_label='evalue'  # Show evalue as the edge label
)


Sigma.write_html(
    G,
    './data/dataset_ncbi_domains.html',
    fullscreen=True,
    node_color="tc_category",      # Color nodes by the 'tc_category' attribute
    node_color_palette=tc_palette,
    node_metrics=['louvain'],
    node_size_range=(3, 30),
    max_categorical_colors=20,
    edge_size='weight',
    edge_size_range=(1, 30),
    edge_label='evalue',
    default_edge_type='curve',
    node_border_color_from='node',
    default_node_label_size=24,
    node_size='degree'
)


In [30]:
subset_df = merged_df.head(2)


# Assuming your dataframe is already loaded and named 'merged_df'
# Step 1: Create a graph object
G = nx.Graph()

# Generate the tab20 palette using Seaborn
palette = sns.color_palette("tab20", 20)  # Generate 20 distinct colors (for TC0 to TC17, plus two more)

# Create a dictionary to map TC categories to colors
tc_palette = {f'TC{i}': f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})' for i, (r, g, b) in enumerate(palette[:18])}

# Add colors for 'family' and 'domain' categories
tc_palette['family'] = f'rgb({int(palette[18][0]*255)}, {int(palette[18][1]*255)}, {int(palette[18][2]*255)})'
tc_palette['domain'] = f'rgb({int(palette[19][0]*255)}, {int(palette[19][1]*255)}, {int(palette[19][2]*255)})'

# Step 2: Add edges between TCs, subfamilies, and D_classes based on shared occurrences in rows
for index, row in subset_df.iterrows():
    tc = row['TC']
    subfamily = row['subfamilies']
    domains = row['D_classes']
    names = row['Names']
    # Add nodes for TC, subfamily, and domains
    G.add_node(tc, type='TC', tc_category=tc)  # Assign the actual TC value for coloring
    G.add_node(subfamily, type='subfamily or family', tc_category='family')
    G.add_node(domains, type='D_classes', tc_category='domain')
    G.add_node(names, type='Names', tc_category='Names')
    
    # Calculate weights for edges
    weight_tc_subfamily = len(subset_df[(subset_df['TC'] == tc) & (subset_df['subfamilies'] == subfamily)])
    weight_tc_domains = len(subset_df[(subset_df['TC'] == tc) & (subset_df['D_classes'] == domains)])
    weight_domains_subfamily = len(subset_df[(subset_df['D_classes'] == domains) & (subset_df['subfamilies'] == subfamily)])

    # Add edges with weights
    G.add_edge(tc, subfamily, weight=weight_tc_subfamily)
    G.add_edge(tc, domains, weight=weight_tc_domains)
    G.add_edge(domains, subfamily, weight=weight_domains_subfamily)
    G.add_edge(subfamily, names)
    G.add_edge(domains, names)
    G.add_edge(tc, names)
# Displaying the graph with ipysigma using the custom palette
Sigma(G, 
      node_color="tc_category",      # Color nodes by the 'tc_category' attribute
      node_color_palette=tc_palette, # Use the custom TC palette
      node_size="degree",            # Set node size by degree
      node_size_range=(3, 20),       # Adjust size range if needed
      edge_color='grey',             # Default edge color
      edge_size='weight',            # Adjust edge thickness based on weight
      edge_size_range=(1, 30),       # Adjust size range if needed
      node_label='tc_category',      # Display TC categories as node labels
      default_node_color='grey'      # Default color for nodes not in the TC palette
)


Sigma.write_html(
    G,
    './data/dataset_ncbi_domains_sub.html',
    fullscreen=True,
    node_color="tc_category",      # Color nodes by the 'tc_category' attribute
    node_color_palette=tc_palette,
    node_metrics=['louvain'],
    node_size_range=(3, 30),
    max_categorical_colors=20,
    edge_size='weight',
    edge_size_range=(1, 30),
    default_edge_type='curve',
    node_border_color_from='node',
    default_node_label_size=24,
    node_size=G.degree
)

In [31]:
import networkx as nx
import pandas as pd
import seaborn as sns
from ipysigma import Sigma
from IPython.display import display

# Assuming your dataframe is already loaded and named 'merged_df'
# Step 1: Create a graph object
G = nx.Graph()

# Generate the tab20 palette using Seaborn
palette = sns.color_palette("tab20", 20)  # Generate 20 distinct colors (for TC0 to TC17, plus two more)

# Create a dictionary to map TC categories to colors
tc_palette = {f'TC{i}': f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})' for i, (r, g, b) in enumerate(palette[:18])}

# Add colors for 'family' and 'domain' categories
tc_palette['family'] = f'rgb({int(palette[18][0]*255)}, {int(palette[18][1]*255)}, {int(palette[18][2]*255)})'
tc_palette['domain'] = f'rgb({int(palette[19][0]*255)}, {int(palette[19][1]*255)}, {int(palette[19][2]*255)})'

# Step 2: Add nodes and edges between TCs, subfamilies, and D_classes based on shared occurrences in rows
for index, row in merged_df.iterrows():
    tc = row['TC']
    subfamily = row['subfamilies']
    domains = row['D_classes']
    names = row['Names']
    
    # Add nodes for TC, subfamily, and domains
    G.add_node(tc, type='TC', tc_category=tc)  # Assign the actual TC value for coloring
    G.add_node(subfamily, type='subfamily or family', tc_category='family')
    G.add_node(domains, type='D_classes', tc_category='domain')
    G.add_node(names, type='Names', tc_category='Names')
    
    # Add edges with default weight of 1
    G.add_edge(tc, subfamily)
    G.add_edge(tc, domains)
    G.add_edge(domains, subfamily)
    G.add_edge(subfamily, names)
    G.add_edge(domains, names)
    G.add_edge(tc, names)

# Step 3: Initialize and display the Sigma widget
sigma = Sigma(G, 
    node_color="tc_category",      # Color nodes by the 'tc_category' attribute
    node_color_palette=tc_palette,
    node_metrics=['louvain'],
    node_size_range=(3, 10),
    max_categorical_colors=20,
    edge_size='weight',
    edge_size_range=(1, 30),
    default_edge_type='curve',
    node_border_color_from='node',
    default_node_label_size=24,
    node_size=G.degree    # Default color for nodes not in the TC palette
)

# Display the Sigma widget first
display(sigma)

# Now render a snapshot after the widget is displayed
#sigma.render_snapshot()


Sigma(nx.Graph with 2,264 nodes and 9,215 edges)

In [32]:
from concurrent.futures import ThreadPoolExecutor, as_completed

phage_names = {}
counter = 0

def process_accession_code_name(accession_code):
    return accession_code, find_phage_name(accession_code)

# Specify the number of workers for parallel processing
num_workers = 12  # Adjust this based on your machine's capabilities

with ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Submit tasks for each accession code
    futures = {executor.submit(process_accession_code_name, acc): acc for acc in df_sorted['accession_code']}
    
    # Process completed futures as they finish
    for future in as_completed(futures):
        accession_code, name = future.result()
        
        # Add the result to the dictionary
        phage_names[accession_code] = name
        
        # Increment the counter
        counter += 1
        
        # Print a progress message every 100 processed accession codes
        if counter % 100 == 0:
            print(f"Processed {counter} accession codes so far...")

# Now, phage_names is a dictionary mapping accession_code to phage_name
#print(phage_names)


# Assuming phage_names is your dictionary mapping accession codes to phage names
df_sorted['Names'] = df_sorted['accession_code'].map(phage_names)
df_sorted['Names']=df_sorted['Names'].fillna('noname')
# Display the dataframe to check if it was updated correctly
df_sorted

Processed 100 accession codes so far...
Processed 200 accession codes so far...
Processed 300 accession codes so far...
Processed 400 accession codes so far...
Processed 500 accession codes so far...
Processed 600 accession codes so far...
Connection error: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprotkb/A0A345ARD4.txt (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x17f9ac790>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
Processed 700 accession codes so far...
Processed 800 accession codes so far...
Processed 900 accession codes so far...
Processed 1000 accession codes so far...
Processed 1100 accession codes so far...
Processed 1200 accession codes so far...
Processed 1300 accession codes so far...
Processed 1400 accession codes so far...
Processed 1500 accession codes so far...
Processed 1600 accession codes so far...
Processed 1700 accession codes so f

Unnamed: 0,target_name,accession_1,query_name,accession_2,E-value_1,score_1,bias_1,E-value_2,score_2,bias_2,...,ov,env,dom,rep,inc,description,TC,accession_code,subfamilies,Names
0,tr|A0A653FW15|A0A653FW15_9CAUD,-,cluster_3,-,0.000000e+00,2706.5,20.9,0.000000e+00,2706.3,20.9,...,0,1,1,1,1,Phage tail fiber protein OS=Escherichia phage ...,TC3,A0A653FW15,unclassified,Escherichia phage Gluttony_ev152.
923,tr|A0A5Q2F504|A0A5Q2F504_9CAUD,-,cluster_0,-,0.000000e+00,1040.0,56.7,0.000000e+00,1033.8,56.7,...,0,1,1,1,1,Long tail fiber proximal subunit OS=Klebsiella...,TC0,A0A5Q2F504,Straboviridae,Klebsiella phage JIPh_Kp122.
1844,tr|A0A482N4I6|A0A482N4I6_9CAUD,-,cluster_4,-,0.000000e+00,1764.3,13.7,0.000000e+00,1764.1,13.7,...,0,1,1,1,1,Tail fiber protein OS=Escherichia phage vB_Eco...,TC4,A0A482N4I6,unclassified,Escherichia phage vB_EcoS_WF5505.
921,tr|A0A0K1Y530|A0A0K1Y530_9CAUD,-,cluster_0,-,0.000000e+00,1042.3,55.0,7.900000e-298,990.7,42.0,...,0,2,2,2,2,Long-tail fiber proximal subunit domain-contai...,TC0,A0A0K1Y530,Straboviridae,Klebsiella phage JD18.
920,tr|A0A7I8V5N5|A0A7I8V5N5_9CAUD,-,cluster_0,-,0.000000e+00,1042.3,56.9,0.000000e+00,1035.8,56.9,...,0,1,1,1,1,Long tail fiber proximal subunit OS=Klebsiella...,TC0,A0A7I8V5N5,Straboviridae,Klebsiella phage vB_KpnM_311F.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2873,tr|A0A9E7LJA2|A0A9E7LJA2_9CAUD,-,cluster_7,-,7.300000e-10,38.1,0.2,7.300000e-10,38.1,0.2,...,0,2,2,1,1,Tail fiber OS=Escherichia phage EC105 OX=29369...,TC7,A0A9E7LJA2,Demerecviridae,Escherichia phage EC105.
2872,tr|A0A9E7IND2|A0A9E7IND2_9CAUD,-,cluster_7,-,7.300000e-10,38.1,0.2,7.300000e-10,38.1,0.2,...,0,2,2,1,1,Tail fiber OS=Escherichia phage EC104 OX=29369...,TC7,A0A9E7IND2,Demerecviridae,Escherichia phage EC104.
2871,tr|A0A9E7IDC4|A0A9E7IDC4_9CAUD,-,cluster_7,-,7.300000e-10,38.1,0.2,7.300000e-10,38.1,0.2,...,0,2,2,1,1,Tail fiber OS=Escherichia phage EC122 OX=29369...,TC7,A0A9E7IDC4,Demerecviridae,Escherichia phage EC122.
5662,tr|A0AAD1V9Y2|A0AAD1V9Y2_9CAUD,-,cluster_11,-,9.500000e-10,38.4,2.1,9.500000e-10,38.4,2.1,...,0,4,4,2,1,Side tail fiber protein from bacteriophage ori...,TC11,A0AAD1V9Y2,unclassified,Acinetobacter phage MD-2021a.


In [44]:
df_sorted = df_sorted[df_sorted['E-value_2'] < 10e-10]

In [214]:
import networkx as nx
import pandas as pd
import seaborn as sns
from ipysigma import Sigma
import numpy as np

# Assuming your dataframe is already loaded and named 'merged_df'
# Step 1: Create a graph object
G = nx.Graph()

# Generate the tab20 palette using Seaborn
palette = sns.color_palette("tab20", 20)  # Generate 20 distinct colors (for TC0 to TC17, plus two more)

# Create a dictionary to map TC categories to colors
tc_palette = {f'TC{i}': f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})' for i, (r, g, b) in enumerate(palette[:18])}

# Add colors for 'family' and 'domain' categories
tc_palette['family'] = f'rgb({int(palette[18][0]*255)}, {int(palette[18][1]*255)}, {int(palette[18][2]*255)})'
tc_palette['domain'] = f'rgb({int(palette[19][0]*255)}, {int(palette[19][1]*255)}, {int(palette[19][2]*255)})'

# Step 2: Add edges between TCs, subfamilies, and D_classes based on shared occurrences in rows
for index, row in df_sorted.iterrows():
    tc = row['TC']
    subfamily = row['subfamilies']
    #domains = row['D_classes']
    names = row['Names']
    # Add nodes for TC, subfamily, and domains
    G.add_node(tc, type='TC', tc_category=tc)  # Assign the actual TC value for coloring
    G.add_node(subfamily, type='subfamily or family', tc_category='family')
    #G.add_node(domains, type='D_classes', tc_category='domain')
    G.add_node(names, type='Names', tc_category='Names')
    
    # Calculate weights for edges
    weight_tc_subfamily = len(df_sorted[(df_sorted['TC'] == tc) & (df_sorted['subfamilies'] == subfamily)])
    #weight_tc_domains = len(df_sorted[(df_sorted['TC'] == tc) & (df_sorted['D_classes'] == domains)])
    #weight_domains_subfamily = len(df_sorted[(df_sorted['D_classes'] == domains) & (df_sorted['subfamilies'] == subfamily)])

    # Add edges with weights
    G.add_edge(tc, subfamily, weight=weight_tc_subfamily)
    G.add_edge(tc, domains, weight=weight_tc_domains)
    #G.add_edge(domains, subfamily, weight=weight_domains_subfamily)
    G.add_edge(subfamily, names)
    #G.add_edge(domains, names)
    G.add_edge(tc, names)
# Displaying the graph with ipysigma using the custom palette
Sigma(G, 
      node_color="tc_category",      # Color nodes by the 'tc_category' attribute
      node_color_palette=tc_palette, # Use the custom TC palette
      node_size="degree",            # Set node size by degree
      node_size_range=(3, 20),       # Adjust size range if needed
      edge_color='grey',             # Default edge color
      edge_size='weight',            # Adjust edge thickness based on weight
      edge_size_range=(1, 30),       # Adjust size range if needed
      node_label='tc_category',      # Display TC categories as node labels
      default_node_color='grey'      # Default color for nodes not in the TC palette
)


Sigma.write_html(
    G,
    './dataset_ncbi.html',
    fullscreen=True,
    node_color="tc_category",      # Color nodes by the 'tc_category' attribute
    node_color_palette=tc_palette,
    node_metrics=['louvain'],
    node_size_range=(10, 100),
    max_categorical_colors=20,
    edge_size='weight',
    edge_size_range=(1, 30),
    default_edge_type='curve',
    node_border_color_from='node',
    default_node_label_size=24,
    node_size=G.degree
)

KeyError: 'subfamilies'

In [41]:
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

[33m(Deprecated) Installing extensions with the jupyter labextension install command is now deprecated and will be removed in a future major version of JupyterLab.

Users should manage prebuilt extensions with package managers like pip and conda, and extension authors are encouraged to distribute their extensions as prebuilt packages [0m


In [45]:
import json

In [46]:
df_filtered.columns

Index(['target_name', 'accession_1', 'query_name', 'accession_2', 'E-value_1',
       'score_1', 'bias_1', 'E-value_2', 'score_2', 'bias_2', 'exp', 'reg',
       'clu', 'ov', 'env', 'dom', 'rep', 'inc', 'description', 'TC',
       'new_encoded_TC_2', 'accession_code', 'subfamilies', 'cluster'],
      dtype='object')