In [1]:
def load_biogrid_data(zip_path, organism_id=9606):  # 9606 is the taxonomy ID for Homo sapiens
    """
    Load BioGRID data from a local ZIP file and display key aspects of it.

    The first ``.txt`` member of the archive is parsed as a tab-separated
    table. Rows are kept only when both 'Organism Interactor A' and
    'Organism Interactor B' equal ``organism_id``. A table with the count and
    percentage of each 'Experimental System' value is printed as a side effect.

    Parameters:
    zip_path (str): Path to the downloaded BioGRID ZIP file
    organism_id (int): NCBI Taxonomy ID for the organism (default: 9606 for Homo sapiens)

    Returns:
    pandas.DataFrame: BioGRID PPI data restricted to ``organism_id``. The
        filtered frame is returned even when the 'Experimental System' column
        is absent; in that case only the summary table is skipped.

    Raises:
    ValueError: If the archive contains no ``.txt`` member.
    """
    print("Loading BioGRID data")

    with ZipFile(zip_path) as archive:
        # Find all .txt files in the archive
        txt_files = [name for name in archive.namelist() if name.endswith('.txt')]
        if not txt_files:
            raise ValueError("No text files found in the ZIP archive")

        # Only the first text file is used; any further members are ignored.
        data_file = txt_files[0]
        print(f"Using file: {data_file}")

        with archive.open(data_file) as handle:
            df = pd.read_csv(handle, sep='\t', low_memory=False)

    # The table is fully in memory now, so the archive no longer has to stay
    # open for the filtering and reporting below.
    same_organism = (
        (df['Organism Interactor A'] == organism_id)
        & (df['Organism Interactor B'] == organism_id)
    )
    human_df = df[same_organism]
    # NOTE: the message says "human" regardless of the organism_id passed in.
    print(f"Loaded {human_df.shape[0]} human ppi-interactions")

    column_name = "Experimental System"
    if column_name not in human_df.columns:
        print(f"Column '{column_name}' not found in the data.")
        print(f"Available columns: {', '.join(human_df.columns)}")
        # Still hand back the filtered frame so the return type matches the
        # documented contract; only the summary table cannot be produced.
        return human_df

    # Count experimental systems (rows with a missing value are not counted)
    exp_system_counts = human_df[column_name].value_counts().to_dict()
    total_interactions = sum(exp_system_counts.values())

    # Most frequent system first
    sorted_systems = sorted(exp_system_counts.items(), key=lambda item: item[1], reverse=True)

    # Display table with counts and percentages
    rule = "-" * 80
    print(f"\nExperimental System Distribution (Total: {total_interactions} interactions):")
    print(rule)
    print(f"{'System Type':<40} | {'Count':>10} | {'Percentage':>10}")
    print(rule)

    for system, count in sorted_systems:
        percentage = (count / total_interactions) * 100
        # str() keeps the formatting working if a label was parsed as non-text.
        print(f"{str(system)[:40]:<40} | {count:>10,} | {percentage:>9.2f}%")

    return human_df

In [2]:
def create_ppi_network(biogrid_df, interaction_type='physical'):
    """
    Create a PPI network from BioGRID data.

    Each row contributes one edge between its two 'Official Symbol Interactor'
    values. Rows with a missing symbol and self-interactions are skipped.
    When the same pair occurs more than once, only the first row's attributes
    are stored on the edge; later rows are counted as duplicates. A summary of
    these counts is printed once the graph has been built.

    Parameters:
    biogrid_df (pandas.DataFrame): BioGRID data
    interaction_type (str): Type of interaction to include ('physical', 'genetic', or 'all').
        Any value other than 'physical' or 'genetic' keeps every row.

    Returns:
    networkx.Graph: PPI network. Nodes are keyed by official symbol and carry
        an ``entrez_id`` attribute; edges carry ``exp_system``, ``int_type``
        and ``pubmed_id``.
    """
    # Create an empty graph as a representation of the PPI network
    G = nx.Graph()

    # Filter by interaction type if specified
    if interaction_type in ('physical', 'genetic'):
        df_filtered = biogrid_df[biogrid_df['Experimental System Type'] == interaction_type]
        label = f"{interaction_type} "
    else:
        df_filtered = biogrid_df
        label = ""

    # Report the type actually selected rather than always saying "physical".
    print(f"Building network with {df_filtered.shape[0]} {label}interactions...")

    # Count statistics
    missing_symbols = 0
    self_interactions = 0
    duplicate_edges = 0
    added_edges = 0

    # Iterate through the edge dataset
    for _, row in tqdm(df_filtered.iterrows(), total=df_filtered.shape[0]):
        protein_a = row['Official Symbol Interactor A']
        protein_b = row['Official Symbol Interactor B']

        # Check for missing symbols
        if pd.isna(protein_a) or pd.isna(protein_b):
            missing_symbols += 1
            continue

        # Skip self-interactions
        if protein_a == protein_b:
            self_interactions += 1
            continue

        # Add nodes with stable unique identifiers as attributes. An existing
        # node keeps the entrez_id it was first created with.
        if not G.has_node(protein_a):
            G.add_node(protein_a, entrez_id=row['Entrez Gene Interactor A'])
        if not G.has_node(protein_b):
            G.add_node(protein_b, entrez_id=row['Entrez Gene Interactor B'])

        # A repeated pair is only counted; the first row's attributes stay.
        if G.has_edge(protein_a, protein_b):
            duplicate_edges += 1
            continue

        G.add_edge(protein_a, protein_b,
                   exp_system=row['Experimental System'],
                   int_type=row['Experimental System Type'],
                   pubmed_id=row['Pubmed ID'])
        added_edges += 1

    print(f"Network created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    print(f"  Interactions with missing symbols: {missing_symbols}")
    print(f"  Self-interactions: {self_interactions}")
    print(f"  Duplicate edges: {duplicate_edges}")
    print(f"  Unique edges added: {added_edges}")
    print(f"  Sum: {added_edges + duplicate_edges + self_interactions + missing_symbols}")

    return G