# **Setup**

## 1. Please note that if you accessed this notebook through the colab button on GitHub then your data will be stored in an ephemeral session storage drive. Any important analysis should be saved before closing the notebook. In this case it might be more convenient to upload data directly into the session storage (Folder icon aka 'Files' > File upload icon aka 'Upload to session storage')

##2. Install relevant packages.

In [None]:
pip install gip-bio

In [None]:
pip install multiprocess

In [None]:
pip install igraph

In [None]:
pip install openpyxl

# **Pre-processing**

##3. GIP cannot handle grouped identifiers. This code block creates a version of the Abundance sheet that retains only the first identifier in each grouping, as well as a lookup table that will be used later.

##Upload your abundance values as a TSV file named Abundances.tsv, directly into the session storage. A sample TSV file is available in the input folder of the GitHub repository.

In [None]:
import pandas as pd

# Load the abundance table that was uploaded to the Colab session storage
benchmark = pd.read_csv('/content/Abundances.tsv', sep='\t')

# Rows of the lookup table; filled in as a side effect of process_protein_ids
lookup_table = []


def process_protein_ids(row):
    """Record a row's grouped protein IDs and return only the first one.

    Args:
        row: A row (Series or mapping) with a 'Protein IDs' entry holding one
            or more identifiers separated by semicolons.

    Returns:
        The identifier that precedes the first semicolon.

    Side effects:
        Appends a {'First Name', 'Full Grouped Names'} record to the
        module-level ``lookup_table`` list.
    """
    protein_ids = row['Protein IDs']
    first_protein_name = protein_ids.split(';')[0]

    # Remember the full group so it can be re-attached to the results later
    lookup_table.append({
        'First Name': first_protein_name,
        'Full Grouped Names': protein_ids,
    })

    return first_protein_name


# Collapse every grouped 'Protein IDs' entry to its first identifier
benchmark['Protein IDs'] = benchmark.apply(process_protein_ids, axis=1)

# Convert the collected records to a DataFrame
lookup_df = pd.DataFrame(lookup_table)

# Save the table that now holds only the first protein name per row
benchmark.to_csv('/content/Abundances_modified.tsv', sep='\t', index=False)

# Save the lookup table (first names and full grouped names)
lookup_df.to_csv('/content/protein_lookup_table.tsv', sep='\t', index=False)

print("Files saved to /content: 'Abundances_modified.tsv' and 'protein_lookup_table.tsv'")


# **Calibration**

##4.1. Calibrate the molecular weight across gel slices by adding values of known slices to the X and Y variables, and plot them. For a good calibration, please ensure that the first and last slices are represented in the X and Y variables. The number of slices in the demo set is 48; this can be changed by adjusting the X_all variable in the code below **(expand the cell below to change variables and to view the plot.)**

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression

# Calibration standards: slice numbers and their known molecular weights in Daltons
X = np.array([8, 18, 23, 29, 35, 39]).reshape(-1, 1)  # Slice number
Y = np.array([1048000, 720000, 480000, 242000, 146000, 66000])  # Molecular weight in Daltons

# Number of gel slices in the dataset (48 in the demo set). Changing this one
# value updates both the prediction range (X_all) and the printed heading.
n_slices = 48

# Apply base-10 log transformation to the molecular weight data
Y_log10 = np.log10(Y)

# Fit linear regression on the log-transformed data
model = LinearRegression().fit(X, Y_log10)

# Predict log-transformed molecular weights for slices 1..n_slices
X_all = np.arange(1, n_slices + 1).reshape(-1, 1)
Y_pred_log10 = model.predict(X_all)

# Revert predictions back to the original scale (10** for base-10) and convert to kilodaltons
Y_pred_kDa = (10 ** Y_pred_log10) / 1000

# Print slice number and predicted molecular weight in kilodaltons
print(f"Predicted Molecular Weights (kDa) for Slices 1 to {n_slices}:")
for slice_num, mw in zip(X_all.flatten(), Y_pred_kDa):
    print(f"Slice {slice_num}: {mw:.2f} kDa")

# Plot the calibration points with a log10-linear trendline using Plotly Express
df = pd.DataFrame({'Slice': X.flatten(), 'Log10_Molecular_Weight': Y_log10})
fig = px.scatter(df, x="Slice", y="Log10_Molecular_Weight", trendline="ols",
                 title="Log10-Transformed Molecular Weight vs Slice Number (Linear Fit)")

fig.show()


## 4.2. Calibrate your MW across slices by adding values of known slices to X and Y variables, and apply these values to your abundances sheet. The modified sheet's name can be changed in the final part of this block **(expand the cell below to answer a question.)**


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Calibration standards: slice numbers and their known molecular weights in Daltons
X = np.array([8, 18, 23, 29, 35, 39]).reshape(-1, 1)  # Slice number
Y = np.array([1048000, 720000, 480000, 242000, 146000, 66000])  # Molecular weight in Daltons

# Number of gel slices in the dataset (48 in the demo set)
n_slices = 48

# Ask user if the first slice represents the highest or lowest molecular weight.
# Surrounding whitespace is stripped so that e.g. "1 " is still accepted.
user_input = input("Does Slice no. 1 represent the highest molecular weight (1) or the lowest (2)? Enter 1 or 2: ").strip()
if user_input == "1":
    order_high_to_low = True
elif user_input == "2":
    order_high_to_low = False
else:
    raise ValueError("Invalid input. Please enter 1 or 2.")

# Apply base-10 log transformation to the molecular weight data
Y_log10 = np.log10(Y)

# Fit linear regression on the log-transformed data
model = LinearRegression().fit(X, Y_log10)

# Predict log-transformed molecular weights for slices 1..n_slices
X_all = np.arange(1, n_slices + 1).reshape(-1, 1)
Y_pred_log10 = model.predict(X_all)

# Revert predictions back to the original scale (10** for base-10) and convert
# to kilodaltons; decimals are dropped later, when the header labels are built
Y_pred_kDa = (10 ** Y_pred_log10) / 1000

# Reverse the order if the first slice represents the lowest MW
if not order_high_to_low:
    Y_pred_kDa = Y_pred_kDa[::-1]

# Load the `Abundances_modified.tsv` file written by the pre-processing step
df_benchmark = pd.read_csv("/content/Abundances_modified.tsv", sep='\t')

# The table must hold one identifier column followed by one column per slice;
# fail with an explicit message instead of a bare length-mismatch error.
expected_columns = n_slices + 1
if len(df_benchmark.columns) != expected_columns:
    raise ValueError(
        f"Abundances_modified.tsv has {len(df_benchmark.columns)} columns but "
        f"{expected_columns} were expected (1 identifier column + {n_slices} slices). "
        "Adjust n_slices to match your dataset."
    )

# Keep the original first column name
first_column_name = df_benchmark.columns[0]

# Rename only the numbered slice columns with molecular weight labels
new_headers = [first_column_name] + [
    f"{slice_num} - {int(mw)} kDa" for slice_num, mw in zip(X_all.flatten(), Y_pred_kDa)
]
df_benchmark.columns = new_headers

# Save the new file as `Abundances_final.tsv`
df_benchmark.to_csv("/content/Abundances_final.tsv", sep='\t', index=False)

print("Abundances_final.tsv has been created in /content with updated headers including molecular weights.")


# **Run GIP**

In [None]:
# 5. Run GIP

from gip.main import main
import gip.process_normalise as prn
import pandas as pd

# Parse the complexome profile produced by the pre-processing step.
# An annotation table is optional; to use one, load it with e.g.
# pd.read_csv('Annotation.tsv', sep='\t', index_col=0) and pass it as annot_df.
prof = prn.parse_profile('/content/Abundances_modified.tsv')

# Ratio of clusters relative to the number of detected proteins
clust_ratio = 0.5

# Standard run; bs_processes=4 is the worker count used for the bootstrapping.
# The cluster table, member table and PDF report are written to /content.
gip_results = main(
    prof,
    clust_ratio,
    annot_df=None,
    bs_processes=4,
    clusttable_fn='/content/cluster_table.tsv',
    membertable_fn='/content/mem_table.tsv',
    pdf_fn='/content/clusters_output.pdf',
)




# **Post-processing**

## 6. Re-add grouped identifiers back to the GIP output (mem_table).

In [None]:
import pandas as pd

mem_table = pd.read_csv('/content/mem_table.tsv', sep='\t')
lookup_table = pd.read_csv('/content/protein_lookup_table.tsv', sep='\t')

# First identifier -> full semicolon-separated group, as a plain dict
lookup_dict = dict(zip(lookup_table['First Name'], lookup_table['Full Grouped Names']))


def replace_with_full_name(identifier):
    """Return the full grouped names for *identifier*, or *identifier* itself.

    The identifier is returned unchanged when it has no entry in the
    module-level ``lookup_dict``.
    """
    if identifier in lookup_dict:
        return lookup_dict[identifier]
    return identifier


# Expand every entry of the 'identifier' column back to its full group
mem_table['identifier'] = mem_table['identifier'].map(replace_with_full_name)

# Write the expanded member table to a new file
mem_table.to_csv('/content/modified_mem_table.tsv', sep='\t', index=False)

print("File saved: 'modified_mem_table.tsv'")


## 7. Add alternate identifiers from original data set to the GIP's output (mem_table)

##Upload your extended abundance values sheet as an xlsx file named ExtendedAbundances.xlsx, directly into the session storage. This is similar to the output from MaxQuant. A sample xlsx file is available in the input folder of the GitHub repository **(expand the cell below to answer a question)**.

In [None]:
import pandas as pd

# Load the extended abundance workbook. pandas.read_excel takes no `sep`
# argument (Excel files have no field separator), so none is passed.
original_benchmark = pd.read_excel('/content/ExtendedAbundances.xlsx')

# Load the member table whose grouped identifiers were restored in step 6
modified_mem_table = pd.read_csv('/content/modified_mem_table.tsv', sep='\t')

# Ask the user which column of the extended sheet should be attached;
# surrounding whitespace is stripped so a stray space does not cause a miss
selected_column = input("Please enter the column header from 'ExtendedAbundances.xlsx' you want to match and added to the final mem_table (for eg - Gene names): ").strip()

# Check if the selected column exists in the extended abundance sheet
if selected_column not in original_benchmark.columns:
    print(f"Error: The column '{selected_column}' does not exist in the 'ExtendedAbundances.xlsx' file.")
else:
    # Map every individual protein identifier to the value of the selected
    # column in its row. If an identifier occurs in several rows, the last
    # row wins.
    lookup_dict = {}
    for grouped_ids, matched_value in zip(original_benchmark['Protein IDs'],
                                          original_benchmark[selected_column]):
        for identifier in str(grouped_ids).split(';'):
            lookup_dict[identifier.strip()] = matched_value

    def map_identifiers(row):
        """Return the selected-column value of the first matching identifier.

        Args:
            row: A member-table row whose 'identifier' entry holds one or
                more identifiers separated by semicolons.

        Returns:
            The looked-up value for the first identifier found in
            ``lookup_dict``, or None when none of them is known.
        """
        for mem_identifier in str(row['identifier']).split(';'):
            mem_identifier = mem_identifier.strip()
            if mem_identifier in lookup_dict:
                return lookup_dict[mem_identifier]

        return None

    # Add the new column with matched names
    modified_mem_table['Matched Identifiers'] = modified_mem_table.apply(map_identifiers, axis=1)

    # Save the final mem table with the new matched identifiers column
    modified_mem_table.to_csv('/content/final_mem_table.tsv', sep='\t', index=False)

    print("File saved: 'final_mem_table.tsv'")


# **Summarize, Lookup, and Plot**

## 8. Summarize GIP's run statistics.


In [None]:
import pandas as pd

# Read the annotated member table produced by the post-processing steps
final_mem_table = pd.read_csv('/content/final_mem_table.tsv', sep='\t')

# Distinct clusters and distinct protein identifiers in the whole table
total_clusters = final_mem_table['clust_id'].nunique()
total_proteins = final_mem_table['identifier'].nunique()

# Cluster sizes: number of identifier entries per cluster
proteins_per_cluster = final_mem_table.groupby('clust_id')['identifier'].count()

# Largest, smallest and mean cluster size
max_proteins_in_cluster = proteins_per_cluster.max()
min_proteins_in_cluster = proteins_per_cluster.min()
avg_proteins_per_cluster = proteins_per_cluster.mean()

# Report the statistics, one line each
summary_lines = [
    "Summary Statistics for Final Mem Table:",
    "----------------------------------------",
    f"Total number of clusters: {total_clusters}",
    f"Total number of unique proteins: {total_proteins}",
    f"Maximum number of proteins in a cluster: {max_proteins_in_cluster}",
    f"Minimum number of proteins in a cluster: {min_proteins_in_cluster}",
    f"Average number of proteins per cluster: {avg_proteins_per_cluster:.2f}",
]
for summary_line in summary_lines:
    print(summary_line)


## 9. Find the cluster number and interactors of a protein of interest **(expand the cell below to answer a question)**.


In [None]:

import pandas as pd

# Load the final mem table
final_mem_table = pd.read_csv('/content/final_mem_table.tsv', sep='\t')

# User can modify this variable to point to the correct identifier column
identifier_column = 'Matched Identifiers'  # This is the column that has the protein names/identifiers


def lookup_protein(protein_name):
    """Print the cluster of a protein and every member of that cluster.

    Args:
        protein_name: Text to search for in ``identifier_column`` of the
            module-level ``final_mem_table``. It is matched as a literal,
            case-sensitive substring; the first matching row decides the
            cluster.

    Returns:
        None. Results (or a not-found message) are printed.
    """
    # regex=False: names containing characters such as '(', '+' or '.' must
    # be matched literally rather than being interpreted as a pattern.
    is_match = final_mem_table[identifier_column].str.contains(
        protein_name, na=False, regex=False
    )
    matching_row = final_mem_table[is_match]

    if matching_row.empty:
        print(f"Protein '{protein_name}' not found in the {identifier_column} column.")
        return

    # Get the cluster ID for the first matched protein
    cluster_id = matching_row.iloc[0]['clust_id']

    # Find all proteins in the same cluster (the searched protein included)
    cluster_members = final_mem_table[final_mem_table['clust_id'] == cluster_id][identifier_column]

    print(f"Protein '{protein_name}' is in cluster {cluster_id}.")
    print(f"Other members in the same cluster ({cluster_id}):")
    print(cluster_members.tolist())


# Ask for the protein of interest; stray whitespace around the name is dropped
protein_to_search = input("Enter the name of the protein: ").strip()
lookup_protein(protein_to_search)


## 10.1 Process the output for visualization


In [None]:
# 10.1 Process the output for visualization

import pandas as pd

# Read the final member table
mem_table = pd.read_csv('/content/final_mem_table.tsv', sep='\t')


def keep_first_id(identifier):
    """Return the part of *identifier* that precedes the first semicolon."""
    first_id, _, _ = identifier.partition(';')
    return first_id


# Reduce every grouped entry in the 'identifier' column to its first ID
mem_table['identifier'] = mem_table['identifier'].map(keep_first_id)

# Write the reduced table to a new TSV file
mem_table.to_csv('/content/inter_mem_table.tsv', sep='\t', index=False)

print("File saved as 'inter_mem_table.tsv'")


## 10.2.1 Generate profile plot for a GIP identified cluster **(expand the cell below to answer a question)**.


In [None]:
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio

# Select Plotly's 'colab' renderer so figures are displayed inline in the notebook
pio.renderers.default = 'colab'


def load_data():
    """Read the member table and the abundance table from /content.

    Returns:
        A tuple ``(inter_mem_table, benchmark_modified)`` of DataFrames read
        from 'inter_mem_table.tsv' and 'Abundances_modified.tsv'.
    """
    table_paths = ('/content/inter_mem_table.tsv', '/content/Abundances_modified.tsv')
    inter_mem_table, benchmark_modified = (
        pd.read_csv(table_path, sep='\t') for table_path in table_paths
    )
    return inter_mem_table, benchmark_modified

def plot_protein_profiles(cluster_number, inter_mem_table, benchmark_modified):
    """Plot the abundance profile of every protein assigned to one cluster.

    Args:
        cluster_number: Value to look for in the 'clust_id' column.
        inter_mem_table: DataFrame with 'clust_id' and 'identifier' columns.
        benchmark_modified: DataFrame with a 'Protein IDs' column; its first
            column supplies the trace names and all remaining columns supply
            the profile values.

    Returns:
        None. A Plotly figure is shown, or a message is printed when the
        cluster is empty or none of its proteins is in the abundance table.
    """
    in_cluster = inter_mem_table['clust_id'] == cluster_number
    cluster_proteins = inter_mem_table.loc[in_cluster, 'identifier'].tolist()

    if not cluster_proteins:
        print(f"No proteins found in cluster {cluster_number}.")
        return

    print(f"Proteins in cluster {cluster_number}: {cluster_proteins}")

    # Index the abundance table once: each individual ID (grouped IDs are split
    # on ';') maps to the position of the first row that lists it. This avoids
    # rescanning the whole table for every protein; missing IDs are skipped.
    row_by_id = {}
    for position, grouped_ids in enumerate(benchmark_modified['Protein IDs']):
        if pd.isna(grouped_ids):
            continue
        for protein_id in str(grouped_ids).split(';'):
            row_by_id.setdefault(protein_id, position)

    # X-axis: all columns except the first (Protein IDs)
    x_values = benchmark_modified.columns[1:]

    traces = []
    for protein in cluster_proteins:
        position = row_by_id.get(str(protein))
        if position is None:
            continue  # Protein has no row in the abundance table

        traces.append(go.Scatter(
            x=x_values,
            y=benchmark_modified.iloc[position, 1:].values,  # Profile values of that row
            mode='lines+markers',
            name=benchmark_modified.iloc[position, 0],  # Label taken from the first column
        ))

    if not traces:
        print(f"No proteins from cluster {cluster_number} found in Abundances_modified.tsv.")
        return

    layout = go.Layout(
        title=f'Protein Profiles for Cluster {cluster_number}',
        xaxis=dict(title='Benchmark Columns (X-axis)'),
        yaxis=dict(title='Values (Y-axis)'),
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()  # Display the plot directly in the notebook

def main():
    """Ask for a cluster number and plot the profiles of its proteins."""
    # load_data returns the member table first, the abundance table second
    inter_mem_table, benchmark_modified = load_data()

    # A non-integer answer makes int() raise ValueError
    cluster_number = int(input("Enter the cluster number: "))

    plot_protein_profiles(cluster_number, inter_mem_table, benchmark_modified)


# Run the cell
main()


## 10.2.2 Generate profile plot for a GIP identified cluster with alternate IDs **(expand the cell below to answer some questions)**.

In [None]:
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio

# Use Plotly's 'colab' renderer so figures are shown inline in the notebook
pio.renderers.default = 'colab'


def load_data():
    """Load the two TSV tables this cell works on.

    Returns:
        ``(inter_mem_table, benchmark_modified)``: the DataFrames read from
        '/content/inter_mem_table.tsv' and '/content/Abundances_modified.tsv'.
    """
    read_tsv = lambda tsv_path: pd.read_csv(tsv_path, sep='\t')
    return (
        read_tsv('/content/inter_mem_table.tsv'),
        read_tsv('/content/Abundances_modified.tsv'),
    )

def plot_protein_profiles(cluster_number, inter_mem_table, benchmark_modified, label_column):
    """Plot the profiles of one cluster, labelling traces from *label_column*.

    Args:
        cluster_number: Value to look for in the 'clust_id' column.
        inter_mem_table: DataFrame with 'clust_id', 'identifier' and
            *label_column* columns.
        benchmark_modified: DataFrame with a 'Protein IDs' column; all columns
            after the first supply the profile values.
        label_column: Name of the ``inter_mem_table`` column whose value is
            used as the trace name. When that value is missing, the protein's
            identifier is used instead.

    Returns:
        None. A Plotly figure is shown, or a message is printed when the
        cluster is empty or none of its proteins is in the abundance table.
    """
    cluster_rows = inter_mem_table[inter_mem_table['clust_id'] == cluster_number]
    cluster_proteins = cluster_rows['identifier'].tolist()

    if not cluster_proteins:
        print(f"No proteins found in cluster {cluster_number}.")
        return

    print(f"Proteins in cluster {cluster_number}: {cluster_proteins}")

    # Index the abundance table once: each individual ID (grouped IDs are split
    # on ';') maps to the position of the first row that lists it. This avoids
    # rescanning the whole table for every protein; missing IDs are skipped.
    row_by_id = {}
    for position, grouped_ids in enumerate(benchmark_modified['Protein IDs']):
        if pd.isna(grouped_ids):
            continue
        for protein_id in str(grouped_ids).split(';'):
            row_by_id.setdefault(protein_id, position)

    # X-axis: all columns except the first (Protein IDs)
    x_values = benchmark_modified.columns[1:]

    traces = []
    # Pair each protein with the label of its own cluster row, so the label
    # always belongs to the row that is being plotted.
    for protein, label in zip(cluster_proteins, cluster_rows[label_column].tolist()):
        position = row_by_id.get(str(protein))
        if position is None:
            continue  # Protein has no row in the abundance table

        # A missing label would otherwise show up as "nan" in the legend
        if pd.isna(label):
            label = protein

        traces.append(go.Scatter(
            x=x_values,
            y=benchmark_modified.iloc[position, 1:].values,  # Profile values of that row
            mode='lines+markers',
            name=str(label),
        ))

    if not traces:
        print(f"No proteins from cluster {cluster_number} found in Abundances_modified.tsv.")
        return

    layout = go.Layout(
        title=f'Protein Profiles for Cluster {cluster_number}',
        xaxis=dict(title='Benchmark Columns (X-axis)'),
        yaxis=dict(title='Values (Y-axis)'),
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()  # Display the plot directly in the notebook

def main():
    """Ask for a cluster and a label column, then plot the cluster's profiles.

    Raises:
        ValueError: If an answer is not an integer.
        IndexError: If the chosen column number is outside the listed range.
    """
    # Load the data
    inter_mem_table, benchmark_modified = load_data()

    # Get the cluster number from the user
    cluster_number = int(input("Enter the cluster number: "))

    # Ask the user to select the column with the final labels in inter_mem_table
    columns = inter_mem_table.columns
    print("Columns in inter_mem_table:")
    for number, col in enumerate(columns, start=1):
        print(f"{number}. {col}")

    choice = int(input("Select the column with final labels by number: "))
    # Reject 0 and negative numbers explicitly: after subtracting 1 they would
    # index from the end and silently pick the wrong column.
    if not 1 <= choice <= len(columns):
        raise IndexError(f"Column number must be between 1 and {len(columns)}, got {choice}.")
    label_column = columns[choice - 1]

    # Plot the protein profiles for the given cluster
    plot_protein_profiles(cluster_number, inter_mem_table, benchmark_modified, label_column)


# Run the main function
main()


# **Cluster Analysis**

##11. If you have an Excel workbook where each complex has a separate sheet and each protein in the complex is in a different row (each protein may have different identifiers), then this code block lets you track the GIP-identified interactors of each of those proteins by listing them next to each protein in a separate, new column.

##Upload your complexes as an xlsx workbook named Complexes.xlsx, directly into the session storage. A sample xlsx file is available in the input folder of the GitHub repository **(expand the cell below to answer some questions)**.

In [None]:
import pandas as pd

# Path of the uploaded workbook with one sheet per complex.
# It is kept as a path (not a DataFrame) because pd.ExcelFile needs a path or
# file object, and pandas.read_excel accepts no `sep` argument.
Complexes = '/content/Complexes.xlsx'


def load_data():
    """Open the complexes workbook and read the final member table.

    Returns:
        A tuple ``(workbook, mem_table)``: a ``pandas.ExcelFile`` giving
        access to every sheet of ``Complexes``, and the DataFrame read from
        '/content/final_mem_table.tsv'.
    """
    # Open the workbook with all its sheets
    workbook = pd.ExcelFile(Complexes)

    # Load the final mem table TSV file
    mem_table = pd.read_csv('/content/final_mem_table.tsv', sep='\t')

    return workbook, mem_table

def select_sheet(workbook):
    """List the workbook's sheets and return the one the user picks.

    Args:
        workbook: Object exposing ``sheet_names`` and ``parse(name)``.

    Returns:
        A tuple ``(selected_sheet, selected_sheet_name)``: the parsed sheet
        and its name.

    Raises:
        ValueError: If the answer is not an integer.
        IndexError: If the number is outside the listed range.
    """
    sheets = workbook.sheet_names
    print("Available sheets:")
    for number, sheet in enumerate(sheets, start=1):
        print(f"{number}: {sheet}")

    choice = int(input("\nSelect the sheet which has your complex: "))
    # Reject 0 and negative numbers explicitly: after subtracting 1 they would
    # index from the end and silently pick the wrong sheet.
    if not 1 <= choice <= len(sheets):
        raise IndexError(f"Sheet number must be between 1 and {len(sheets)}, got {choice}.")

    selected_sheet_name = sheets[choice - 1]
    selected_sheet = workbook.parse(selected_sheet_name)
    return selected_sheet, selected_sheet_name

def select_column(df):
    """List the columns of *df* and return the name of the one the user picks.

    Args:
        df: Object whose ``columns`` can be enumerated and indexed by position.

    Returns:
        The selected column label.

    Raises:
        ValueError: If the answer is not an integer.
        IndexError: If the number is outside the listed range.
    """
    columns = df.columns
    print("\nAvailable columns:")
    for number, col in enumerate(columns, start=1):
        print(f"{number}: {col}")

    choice = int(input("\nSelect the column which has chosen identifier by number: "))
    # Reject 0 and negative numbers explicitly: after subtracting 1 they would
    # index from the end and silently pick the wrong column.
    if not 1 <= choice <= len(columns):
        raise IndexError(f"Column number must be between 1 and {len(columns)}, got {choice}.")

    return columns[choice - 1]

def add_cluster_info(selected_sheet, selected_column, mem_table, mem_column):
    """Add a 'Cluster Proteins' column listing each protein's cluster members.

    For every entry of ``selected_sheet[selected_column]`` the entry is split
    on ';' and the first row of *mem_table* whose *mem_column* value equals
    one of the parts decides the cluster. All *mem_column* values of that
    cluster are then joined with ', '.

    Args:
        selected_sheet: DataFrame of one complex; modified in place.
        selected_column: Column of *selected_sheet* holding the identifiers.
        mem_table: DataFrame with a 'clust_id' column and *mem_column*.
        mem_column: Column of *mem_table* to match against and to list.

    Returns:
        *selected_sheet* itself, with the new 'Cluster Proteins' column. Rows
        with an empty cell or without a match get an empty string.
    """
    cluster_protein_col = []

    for protein in selected_sheet[selected_column]:
        # Empty spreadsheet cells arrive as NaN and cannot be split
        if pd.isna(protein):
            cluster_protein_col.append('')
            continue

        candidate_ids = [part.strip() for part in str(protein).split(';')]
        matching_rows = mem_table[mem_table[mem_column].isin(candidate_ids)]

        if matching_rows.empty:
            cluster_protein_col.append('')
            continue

        cluster_number = matching_rows.iloc[0]['clust_id']
        in_cluster = mem_table['clust_id'] == cluster_number
        # Members without a value in mem_column (NaN) are left out, and the
        # rest are converted to text, so that the join cannot fail.
        members = mem_table.loc[in_cluster, mem_column].dropna()
        cluster_protein_col.append(', '.join(members.astype(str)))

    # Add the new column to the selected sheet
    selected_sheet['Cluster Proteins'] = cluster_protein_col

    return selected_sheet

def main():
    """Annotate one sheet of the complexes workbook and save a new workbook.

    The user picks a sheet, its identifier column and the member-table column
    to match against. The chosen sheet gains a 'Cluster Proteins' column, and
    all sheets are written to '/content/Complexes_modified.xlsx'.
    """
    workbook, mem_table = load_data()

    # Interactive choices: sheet, identifier column in the sheet, and the
    # identifier column in the mem table
    selected_sheet, selected_sheet_name = select_sheet(workbook)
    selected_column = select_column(selected_sheet)
    mem_column = select_column(mem_table)

    # Attach the cluster members to the chosen sheet
    modified_sheet = add_cluster_info(selected_sheet, selected_column, mem_table, mem_column)

    # Write every sheet to the new workbook, swapping in the annotated one
    with pd.ExcelWriter('/content/Complexes_modified.xlsx', engine='openpyxl') as writer:
        for sheet_name in workbook.sheet_names:
            frame = workbook.parse(sheet_name)
            if sheet_name == selected_sheet_name:
                frame = modified_sheet
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    print("Modified sheet saved as Complexes_modified.xlsx")


# Run the cell
main()


## 12. Visualize the protein wise interactor network as a 3D network below **(expand the cell below to answer some questions)**.

In [None]:
import pandas as pd
import networkx as nx
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt  # For the viridis colormap

# Workbook written by step 11
excel_file = '/content/Complexes_modified.xlsx'  # Update with your file

# Load the workbook and list available sheets
xls = pd.ExcelFile(excel_file)
print("Available sheets:")
for i, sheet in enumerate(xls.sheet_names, start=1):
    print(f"{i}. {sheet}")

# Ask the user to pick a sheet by number. 0 and negative numbers are rejected
# because, after subtracting 1, they would silently index from the end.
sheet_choice = int(input("Pick the sheet which contains your complex: ")) - 1
if not 0 <= sheet_choice < len(xls.sheet_names):
    raise IndexError(f"Sheet number must be between 1 and {len(xls.sheet_names)}.")
sheet_name = xls.sheet_names[sheet_choice]

# Load the selected sheet
df = pd.read_excel(excel_file, sheet_name=sheet_name)

# List available columns
print("\nAvailable columns:")
for i, col in enumerate(df.columns, start=1):
    print(f"{i}. {col}")

# Ask the user to pick the column for Proteins (source identifiers)
protein_col_choice = int(input("Pick the column number that holds the protein identifiers: ")) - 1
if not 0 <= protein_col_choice < len(df.columns):
    raise IndexError(f"Column number must be between 1 and {len(df.columns)}.")
protein_col = df.columns[protein_col_choice]

# The Cluster Proteins column is fixed (it is the column added in step 11)
cluster_col = "Cluster Proteins"
if cluster_col not in df.columns:
    raise KeyError(f"Sheet '{sheet_name}' has no '{cluster_col}' column.")

# Create a graph
G = nx.Graph()

# Add nodes and edges row by row; rows without a protein identifier are skipped
# so that empty cells do not become a node
for index, row in df.iterrows():
    protein = row[protein_col]
    if pd.isna(protein):
        continue
    G.add_node(protein)
    if pd.notna(row[cluster_col]):
        for connection in str(row[cluster_col]).split(', '):
            if connection:
                G.add_edge(protein, connection)

# Use a spring layout (force-directed layout) in 3D to spread the clusters
pos = nx.spring_layout(G, dim=3, seed=42)  # Seed for reproducibility

# Node coordinates for the 3D scatter plot
x_nodes = [pos[node][0] for node in G.nodes]
y_nodes = [pos[node][1] for node in G.nodes]
z_nodes = [pos[node][2] for node in G.nodes]

# Calculate node degrees (number of edges) to use for coloring
degrees = dict(G.degree)
degree_values = [degrees[node] for node in G.nodes]

# Normalize the degrees for the viridis colormap. A graph without nodes or
# without edges has a maximum degree of 0, which must not be used as divisor.
cmap = plt.get_cmap('viridis')
max_degree = max(degree_values, default=0)
if max_degree:
    norm_degrees = np.array(degree_values, dtype=float) / max_degree
else:
    norm_degrees = np.zeros(len(degree_values))
node_colors = [cmap(degree) for degree in norm_degrees]

# Edge coordinates; None breaks the line between consecutive edges
edge_x = []
edge_y = []
edge_z = []
for edge in G.edges:
    edge_x.extend([pos[edge[0]][0], pos[edge[1]][0], None])
    edge_y.extend([pos[edge[0]][1], pos[edge[1]][1], None])
    edge_z.extend([pos[edge[0]][2], pos[edge[1]][2], None])

# Scatter plot for the nodes, colored by degree (connectivity)
node_trace = go.Scatter3d(
    x=x_nodes,
    y=y_nodes,
    z=z_nodes,
    mode='markers+text',
    text=list(G.nodes),
    textposition="top center",
    hoverinfo='text',
    marker=dict(
        size=6,
        color=[f'rgb({r*255}, {g*255}, {b*255})' for r, g, b, _ in node_colors],  # Convert RGBA to RGB for Plotly
        line=dict(width=2)
    )
)

# Line plot for the edges
edge_trace = go.Scatter3d(
    x=edge_x,
    y=edge_y,
    z=edge_z,
    mode='lines',
    line=dict(color='black', width=2),
    hoverinfo='none'
)

# Layout for the 3D plot
layout = go.Layout(
    title='3D Protein Interaction Network (with Cluster Spread & Degree-Based Colors)',
    showlegend=False,
    scene=dict(
        xaxis=dict(showbackground=False),
        yaxis=dict(showbackground=False),
        zaxis=dict(showbackground=False)
    )
)

# Plot the figure
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
fig.show()
