# PHASE 1: Data Parsing from Raw Excel File

In [None]:
import pandas as pd
import numpy as np
from gseapy import Biomart
from collections import Counter

### Generate the annotation files 

In [None]:
# Read the ZIKV and DENV differential-expression tables (tab-separated,
# first four lines skipped).
zikv = pd.read_csv("Zika Neural stem cell_GSE80434.txt", sep="\t", skiprows=4)
denv = pd.read_csv("Dengue Hepatoma cell_GSE110512.txt", sep="\t", skiprows=4)

# Keep only gene symbol, log fold-change and adjusted p-value, under short names.
deg_columns = {'Gene Symbol': 'symbol', 'log FC': 'logFC', 'adjusted p-val': 'adjP'}
zikv = zikv[list(deg_columns)].rename(columns=deg_columns)
denv = denv[list(deg_columns)].rename(columns=deg_columns)

# DEG cut-offs: adjusted p-value below 0.05 and |logFC| above 1.
padj_thresh = 0.05
logfc_thresh = 1.0

zikv_is_deg = (zikv['adjP'] < padj_thresh) & (zikv['logFC'].abs() > logfc_thresh)
zikv_deg = zikv[zikv_is_deg].copy()
zikv_deg['DEG_type'] = np.where(zikv_deg['logFC'] > 0, 'Upregulated', 'Downregulated')

denv_is_deg = (denv['adjP'] < padj_thresh) & (denv['logFC'].abs() > logfc_thresh)
denv_deg = denv[denv_is_deg].copy()
denv_deg['DEG_type'] = np.where(denv_deg['logFC'] > 0, 'Upregulated', 'Downregulated')

# Fetch GO annotations from BioMart for the unique, non-missing DEG symbols.
bm = Biomart()

zikv_symbols = zikv_deg['symbol'].dropna().unique().tolist()
go_zikv = bm.query(
    dataset='hsapiens_gene_ensembl',
    attributes=['hgnc_symbol', 'go_id', 'name_1006'],
    filters={'hgnc_symbol': zikv_symbols},
)
go_zikv.columns = ['symbol', 'GO_ID', 'GO_Name']

denv_symbols = denv_deg['symbol'].dropna().unique().tolist()
go_denv = bm.query(
    dataset='hsapiens_gene_ensembl',
    attributes=['hgnc_symbol', 'go_id', 'name_1006'],
    filters={'hgnc_symbol': denv_symbols},
)
go_denv.columns = ['symbol', 'GO_ID', 'GO_Name']

# Function to flag terms 
def flag_term(df, keywords):
    """Flag each gene whose GO term names mention any of the keywords.

    Rows of *df* are grouped by 'symbol'. A gene is flagged True when at
    least one of its 'GO_Name' values, lower-cased, contains one of
    *keywords* as a substring. Returns a frame with the columns 'symbol'
    and 'GO_Name', where 'GO_Name' holds the boolean flag.
    """
    def _mentions_keyword(go_names):
        for go_name in go_names:
            lowered = str(go_name).lower()
            for keyword in keywords:
                if keyword in lowered:
                    return True
        return False

    flag_per_gene = df.groupby('symbol')['GO_Name'].apply(_mentions_keyword)
    return flag_per_gene.reset_index()

# ZIKV annotation: flag DEGs whose GO term names mention neural or
# inflammatory keywords.
neuro_genes = flag_term(go_zikv, ['neurodevelopment', 'neuron', 'neurogenesis', 'brain', 'axon'])
# apoptosis_genes_zikv = flag_term(go_zikv, ['apoptosis', 'programmed cell death'])
inflammation_genes_zikv = flag_term(go_zikv, ['inflammatory', 'inflammation', 'cytokine', 'interferon'])

zikv_ann = zikv_deg.copy()
# Series.isin treats NaN as equal to NaN, so missing DENV symbols are dropped
# first; otherwise every ZIKV row without a gene symbol would be reported as
# overlapping with DENV.
zikv_ann['DENV_overlap'] = zikv_ann['symbol'].isin(denv_deg['symbol'].dropna())
zikv_ann = zikv_ann.merge(neuro_genes.rename(columns={'GO_Name': 'Neurodevelopment'}), on='symbol', how='left')
# zikv_ann = zikv_ann.merge(apoptosis_genes_zikv.rename(columns={'GO_Name': 'Cell Apoptosis'}), on='symbol', how='left')
zikv_ann = zikv_ann.merge(inflammation_genes_zikv.rename(columns={'GO_Name': 'Inflammatory Response'}), on='symbol', how='left')
# Genes with no GO rows come out of the left merge as NaN; treat them as not flagged.
zikv_ann[['Neurodevelopment', 'Inflammatory Response']] = zikv_ann[['Neurodevelopment', 'Inflammatory Response']].fillna(False)

# DENV annotation: same approach with liver-related and inflammatory keywords.
liver_genes = flag_term(go_denv, ['liver', 'hepatocyte', 'bile', 'bilirubin'])
# apoptosis_genes_denv = flag_term(go_denv, ['apoptosis', 'programmed cell death'])
inflammation_genes = flag_term(go_denv, ['inflammatory', 'inflammation', 'cytokine', 'interferon'])

denv_ann = denv_deg.copy()
denv_ann = denv_ann.merge(liver_genes.rename(columns={'GO_Name': 'Liver Development'}), on='symbol', how='left')
# denv_ann = denv_ann.merge(apoptosis_genes_denv.rename(columns={'GO_Name': 'Cell Apoptosis'}), on='symbol', how='left')
denv_ann = denv_ann.merge(inflammation_genes.rename(columns={'GO_Name': 'Inflammatory Response'}), on='symbol', how='left')
denv_ann['Liver Development'] = denv_ann['Liver Development'].fillna(False)
denv_ann['Inflammatory Response'] = denv_ann['Inflammatory Response'].fillna(False)

# Save to CSV (the gene symbol stays the first column of both files)
zikv_ann[['symbol', 'DEG_type', 'Neurodevelopment', 'Inflammatory Response', 'DENV_overlap']].to_csv("ZIKV_DEG_annotation.csv", index=False)
denv_ann[['symbol', 'DEG_type', 'Liver Development', 'Inflammatory Response']].to_csv("DENV_DEG_annotation.csv", index=False)


  zikv_ann[['Neurodevelopment', 'Inflammatory Response']] = zikv_ann[['Neurodevelopment', 'Inflammatory Response']].fillna(False)
  denv_ann['Liver Development'] = denv_ann['Liver Development'].fillna(False)
  denv_ann['Inflammatory Response'] = denv_ann['Inflammatory Response'].fillna(False)


## Output files for Annotation: 

ZIKV_DEG_annotation.csv: includes gene symbol, DEG type, neurodevelopment flag, inflammatory-response flag, and overlap with DENV DEGs.

DENV_DEG_annotation.csv: includes gene symbol, DEG type, liver-development flag, and inflammatory-response flag.

## Create files for Cytoscape

In [None]:
# Input locations: PPI workbook plus the two DEG annotation tables.
ppi_file = "PPI_raw_Cleaned.xlsx"
denv_deg_file = "C:/Users/ruman/Desktop/Project 2025/Final topic/1. My workflow/Analysis/DENV_DEG_annotation.csv"
zikv_deg_file = "C:/Users/ruman/Desktop/Project 2025/Final topic/1. My workflow/Analysis/ZIKV_DEG_annotation.csv"

df = pd.read_excel(ppi_file)
denv_deg = pd.read_csv(denv_deg_file)
zikv_deg = pd.read_csv(zikv_deg_file)


def _virus_from_bait(bait):
    """Classify a bait label as 'DENV', 'ZIKV' or 'Other' by substring."""
    if 'DENV' in bait:
        return 'DENV'
    if 'ZIKV' in bait:
        return 'ZIKV'
    return 'Other'


# Virus tag, viral protein name (last whitespace-separated token of the bait)
# and host gene symbol (first token of 'Gene.names').
df['Virus'] = df['Bait'].apply(_virus_from_bait)
df['Bait_Protein'] = df['Bait'].apply(lambda bait: bait.split()[-1])
df['GeneSymbol'] = df['Gene.names'].str.split().str.get(0)

# One interaction table per virus.
df_denv = df.loc[df['Virus'].eq('DENV')].copy()
df_zikv = df.loc[df['Virus'].eq('ZIKV')].copy()

# Unique, non-missing values of the first column of each DEG annotation file.
denv_deg_genes = denv_deg.iloc[:, 0].dropna().unique()
zikv_deg_genes = zikv_deg.iloc[:, 0].dropna().unique()

# Define Function for DEG-filtered Cytoscape Export with Top 2 Abundant Interactions 
def make_deg_filtered_cytoscape_tables(df_subset, deg_genes, deg_annot, virus_label):
    """Build and save Cytoscape node and edge tables restricted to DEG hosts.

    Interactions in *df_subset* are limited to hosts whose 'GeneSymbol' is in
    *deg_genes*, and only the two highest-'Abundance' rows per host are kept.
    Nodes are left-merged with *deg_annot* on its first column. Both tables
    are written to '<virus_label>_cytoscape_nodes_DEG.csv' and
    '<virus_label>_cytoscape_edges_DEG.csv'.

    Returns (annotated_nodes, edges).
    """
    # DEG hosts only, most abundant interactions first, two rows per host.
    is_deg_host = df_subset['GeneSymbol'].isin(deg_genes)
    ranked = df_subset[is_deg_host].copy().sort_values(by='Abundance', ascending=False)
    top_hits = ranked.groupby('GeneSymbol', as_index=False).head(2)

    # Edge table: viral protein -> host gene.
    edges = top_hits[['Bait_Protein', 'GeneSymbol']].copy()
    edges.columns = ['Source', 'Target']
    edges['Interaction'] = 'virus-host'

    # Node table: viral proteins first, then host genes.
    viral_nodes = top_hits['Bait_Protein'].unique()
    host_nodes = top_hits['GeneSymbol'].unique()
    node_df = pd.DataFrame({'Node': list(viral_nodes) + list(host_nodes)})

    def _node_type(node):
        return 'Virus' if node in viral_nodes else 'Host'

    node_df['Type'] = node_df['Node'].apply(_node_type)

    # High-confidence flag. NOTE(review): the ZIKV rule accepts any non-missing
    # value while the DENV rule requires True -- confirm against the PPI sheet.
    if virus_label == 'ZIKV':
        confident = top_hits['ZIKV-Human M>0.72(fp) or 0.69 (ug)'].notna()
        node_df['ZIKV_HighConfidence'] = node_df['Node'].isin(top_hits.loc[confident, 'GeneSymbol'])
    if virus_label == 'DENV':
        confident = top_hits['DENV-Human M>0.67&C>0.95'] == True
        node_df['DENV_HighConfidence'] = node_df['Node'].isin(top_hits.loc[confident, 'GeneSymbol'])

    # Attach the DEG annotation, then drop the duplicated key column.
    annot_key = deg_annot.columns[0]
    annotated_nodes = node_df.merge(deg_annot, left_on="Node", right_on=annot_key, how="left")
    annotated_nodes.drop(columns=[annot_key], inplace=True)

    annotated_nodes.to_csv(f"{virus_label}_cytoscape_nodes_DEG.csv", index=False)
    edges.to_csv(f"{virus_label}_cytoscape_edges_DEG.csv", index=False)

    return annotated_nodes, edges

# Build and save the DEG-filtered Cytoscape tables, DENV first and then ZIKV.
denv_nodes_deg, denv_edges_deg = make_deg_filtered_cytoscape_tables(
    df_subset=df_denv,
    deg_genes=denv_deg_genes,
    deg_annot=denv_deg,
    virus_label='DENV',
)
zikv_nodes_deg, zikv_edges_deg = make_deg_filtered_cytoscape_tables(
    df_subset=df_zikv,
    deg_genes=zikv_deg_genes,
    deg_annot=zikv_deg,
    virus_label='ZIKV',
)


#  Output Files:
DENV_cytoscape_nodes_DEG.csv

DENV_cytoscape_edges_DEG.csv

ZIKV_cytoscape_nodes_DEG.csv

ZIKV_cytoscape_edges_DEG.csv

# Modify the Node file:  Making columns for Shapes and colors of the nodes in cytoscape

# ZIKV node file for Cytoscape:

In [None]:
# Load the ZIKV Cytoscape node table (a CSV named like the DEG-filtered node
# export) into `df`, which the following cell relabels.
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/PPI analysis/PPI_Data_CLEANED/Data for Cytosacpe/ZIKV_cytoscape_nodes_DEG.csv"  
df = pd.read_csv(file_path)


In [None]:
# Strip stray whitespace from the header names.
df.columns = df.columns.str.strip()

# A missing annotation flag counts as False so the comparisons below are well defined.
for flag_column in ('Neurodevelopment', 'Inflammatory Response'):
    df[flag_column] = df[flag_column].fillna(False)

# Create the shape label column
def assign_shape_label(row):
    """Return the Cytoscape shape category for one ZIKV node-table row.

    Priority: 'Neurodevelopment', then 'Inflammatory response', then
    'ZIKV protein' for viral nodes, otherwise 'Human protein'.

    *row* must provide 'Neurodevelopment', 'Inflammatory Response' and
    'DEG_type'; 'Type' is used when present.
    """
    # `== True` (not `is True`) so NumPy booleans also match.
    if row['Neurodevelopment'] == True:
        return 'Neurodevelopment'
    if row['Inflammatory Response'] == True:
        return 'Inflammatory response'
    # The generated node table marks viral baits with Type == 'Virus' and
    # leaves their DEG_type empty, so checking DEG_type alone never matched.
    # The legacy 'ZIKV protein' value is still honoured for hand-edited tables.
    if row.get('Type') == 'Virus' or row['DEG_type'] == 'ZIKV protein':
        return 'ZIKV protein'
    return 'Human protein'

# Label every node row (axis=1 passes one row at a time to the function).
df['Shape_Label'] = df.apply(assign_shape_label, axis=1)

# Save the updated node table (written to the current working directory)
df.to_csv("node_table_with_shape_label.csv", index=False)


  df['Neurodevelopment'] = df['Neurodevelopment'].fillna(False)
  df['Inflammatory Response'] = df['Inflammatory Response'].fillna(False)


# output file: 
node_table_with_shape_label_Zika.csv


# DENV node file for cytoscape

In [None]:
# Load the DENV Cytoscape node table into `df`.
df = pd.read_csv("C:/Users/ruman/Desktop/Project 2025/Final topic/1. My workflow/Analysis/DENV_cytoscape_nodes_DEG.csv")

# Clean column names (remove leading/trailing spaces from the headers)
df.columns = df.columns.str.strip()

# Define shape labeling function
def _is_flagged(row, *columns):
    """Return True when any of *columns* holds a boolean True in *row*."""
    for column in columns:
        value = row.get(column)
        # isinstance instead of `is True` so NumPy booleans count as well,
        # while NaN / missing columns do not.
        if isinstance(value, (bool, np.bool_)) and value:
            return True
    return False


def assign_shape_label(row):
    """Return the Cytoscape shape category for one DENV node-table row.

    Priority: 'Virus protein' for rows with Type == 'Virus', then
    'Liver development', then 'Inflammatory response', otherwise
    'Human protein'.
    """
    if row['Type'] == 'Virus':
        return 'Virus protein'
    # The annotation column is written as 'Liver Development' (capital D);
    # the former lower-case lookup never matched. The old spelling is still
    # accepted as a fallback.
    if _is_flagged(row, 'Liver Development', 'Liver development'):
        return 'Liver development'
    if _is_flagged(row, 'Inflammatory Response'):
        return 'Inflammatory response'
    return 'Human protein'

# Apply the labelling function row by row (axis=1) to create the new column
df['Shape_Label'] = df.apply(assign_shape_label, axis=1)

# Save to new CSV (written to the current working directory)
df.to_csv("DENV_nodes_with_shape_labels.csv", index=False)


# Output file:
DENV_nodes_with_shape_labels.csv

###### Drugability #########

#### Filter the Drug targets to make a short list based on:
1. Host genes >= 2 virus-host interactions, and make a list.
2. Filter the GO terms (from STRING) that are enriched for the above gene list and prioritize host genes involved in functionally relevant GO terms. Make a short list of GO terms.
3. Filter 10-12 host targets based on the frequency of the genes in the short list for GO terms.

## 1. Make list of genes having more than one edges from the PPI Edge file (Host genes >= 2 virus-host interactions)

In [None]:
# Read the ZIKV virus-host edge table.
ppi_path = "ZIKV_cytoscape_edges_DEG.csv"
ppi_df = pd.read_csv(ppi_path)

# Number of distinct viral proteins (Source) per host gene (Target);
# keep the hosts bound by at least two.
virus_host_counts = ppi_df.groupby('Target')['Source'].nunique()
has_multiple_baits = virus_host_counts >= 2
multi_virus_hosts = set(virus_host_counts.loc[has_multiple_baits].index.tolist())

# Manually curated DENV-overlap genes are always included.
denv_overlap_genes = {"MCM5", "ASNS", "SLC3A2", "PCNA"}
final_genes = sorted(multi_virus_hosts | denv_overlap_genes)

# Write the alphabetically sorted shortlist as a single-column workbook.
final_df = pd.DataFrame({'Shortlisted_Host_Genes': final_genes})
final_df.to_excel("Shortlisted_Host_Genes_ZIKV.xlsx", index=False)

print("Saved: Shortlisted_Host_Genes_ZIKV.xlsx")


Saved: Shortlisted_Host_Genes_ZIKV.xlsx


# File for filtered genes (Host genes >= 2 virus-host interactions):
Shortlisted_Host_Genes_ZIKV.xlsx

## 2. Prioritize host genes involved in functionally relevant GO terms ##

# GO term: Biological Process (STRING `enrichment.Process` export)

# Extract the GO terms from STRING raw files

In [None]:
# Read the STRING enrichment export (tab-separated).
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/enrichment.Process (1).tsv"
df = pd.read_csv(file_path, sep="\t")

# Keep four STRING columns and give them the table's display names.
column_labels = {
    'term description': 'Description',
    '#term ID': 'GO term',
    'false discovery rate': 'FDR',
    'matching proteins in your network (labels)': 'Associated proteins',
}
df_filtered = df[list(column_labels)].rename(columns=column_labels)

# Convert FDR to Significance Format (Stars) 
def fdr_to_stars(fdr):
    """Translate an FDR value into a star-style significance label.

    Returns '****', '***', '**' or '*' for values strictly below 0.0001,
    0.001, 0.01 and 0.05 respectively, and 'ns' otherwise (including NaN,
    which compares False against every cut-off).
    """
    # Checked from strictest to loosest; the first cut-off the value falls
    # strictly below decides the label.
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for cutoff, stars in star_levels:
        if fdr < cutoff:
            return stars
    return 'ns'

# The 'p-value' column holds the star label derived from each row's FDR.
df_filtered['p-value'] = df_filtered['FDR'].apply(fdr_to_stars)

# Output layout: the numeric FDR column is left out.
table_columns = ['Description', 'GO term', 'p-value', 'Associated proteins']
df_final = df_filtered.loc[:, table_columns]

# Write the formatted table.
output_path = "ZIKV_GO_Formatted_Table.xlsx"
df_final.to_excel(output_path, index=False)

print("File saved as:", output_path)


File saved as: ZIKV_GO_Formatted_Table.xlsx


# Filter GO terms that include one or more of those genes filtered from: Host genes >= 2 virus-host interactions

In [None]:
# Shortlisted host genes: first column of the workbook, blanks removed,
# values as strings.
gene_list_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/Shortlisted_Host_Genes_ZIKV.xlsx"
gene_list_df = pd.read_excel(gene_list_path)
shortlist_column = gene_list_df.iloc[:, 0]
gene_list = shortlist_column.dropna().astype(str).tolist()

# Formatted GO enrichment table produced by the previous cell.
go_table_path = "ZIKV_GO_Formatted_Table.xlsx"
go_df = pd.read_excel(go_table_path)

# Create a new column for matched genes 
def extract_matching_genes(go_genes, genes=None):
    """Return the shortlisted genes found in one comma-separated protein cell.

    Args:
        go_genes: Comma-separated gene labels from the 'Associated proteins'
            column. Non-string values (e.g. NaN for an empty cell) give ''.
        genes: Gene symbols to look for. Defaults to the notebook-level
            `gene_list`.

    Returns:
        The matching genes, sorted and joined with ', ' ('' when none match).
    """
    # Empty spreadsheet cells are read back as float NaN, which has no
    # .split(); treat them as "no proteins" instead of raising.
    if not isinstance(go_genes, str):
        return ''
    if genes is None:
        genes = gene_list
    go_gene_set = {label.strip() for label in go_genes.split(',')}
    return ', '.join(sorted(set(genes) & go_gene_set))

# Record the shortlisted genes found in each term, then keep only the terms
# with at least one hit.
go_df['Matching Genes'] = go_df['Associated proteins'].apply(extract_matching_genes)
has_hit = go_df['Matching Genes'].ne('')
filtered_go_df = go_df[has_hit]

# Write the filtered table.
output_file = "Filtered_GO_Terms_With_21_Gene_Hits.xlsx"
filtered_go_df.to_excel(output_file, index=False)

print(f"Saved filtered GO terms with matching gene details to: {output_file}")


Saved filtered GO terms with matching gene details to: Filtered_GO_Terms_With_21_Gene_Hits.xlsx


# GO: Molecular Function

In [None]:
# Extract the Molecular Function terms from the raw STRING export (tab-separated).
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/enrichment.Function.tsv"
df = pd.read_csv(file_path, sep="\t")

# Keep four STRING columns and give them the table's display names.
column_labels = {
    'term description': 'Description',
    '#term ID': 'GO term',
    'false discovery rate': 'FDR',
    'matching proteins in your network (labels)': 'Associated proteins',
}
df_filtered = df[list(column_labels)].rename(columns=column_labels)

# Convert FDR to Significance Format (Stars) 
def fdr_to_stars(fdr):
    """Translate an FDR value into a star-style significance label.

    Returns '****', '***', '**' or '*' for values strictly below 0.0001,
    0.001, 0.01 and 0.05 respectively, and 'ns' otherwise (including NaN,
    which compares False against every cut-off).
    """
    # Checked from strictest to loosest; the first cut-off the value falls
    # strictly below decides the label.
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for cutoff, stars in star_levels:
        if fdr < cutoff:
            return stars
    return 'ns'

# The 'p-value' column holds the star label derived from each row's FDR.
df_filtered['p-value'] = df_filtered['FDR'].apply(fdr_to_stars)

# Output layout: the numeric FDR column is left out.
table_columns = ['Description', 'GO term', 'p-value', 'Associated proteins']
df_final = df_filtered.loc[:, table_columns]

# Write the formatted table.
output_path = "ZIKV_MF_Table.xlsx"
df_final.to_excel(output_path, index=False)

print("File saved as:", output_path)



File saved as: ZIKV_MF_Table.xlsx


In [None]:
## Keep the GO terms containing at least one shortlisted host gene
## (host genes with >= 2 virus-host interactions).

# Shortlisted host genes: first column of the workbook, blanks removed,
# values as strings.
gene_list_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/Shortlisted_Host_Genes_ZIKV.xlsx"
gene_list_df = pd.read_excel(gene_list_path)
shortlist_column = gene_list_df.iloc[:, 0]
gene_list = shortlist_column.dropna().astype(str).tolist()

# Formatted Molecular Function table produced by the previous cell.
go_table_path = "ZIKV_MF_Table.xlsx"
go_df = pd.read_excel(go_table_path)

# Create a new column for matched genes 
def extract_matching_genes(go_genes, genes=None):
    """Return the shortlisted genes found in one comma-separated protein cell.

    Args:
        go_genes: Comma-separated gene labels from the 'Associated proteins'
            column. Non-string values (e.g. NaN for an empty cell) give ''.
        genes: Gene symbols to look for. Defaults to the notebook-level
            `gene_list`.

    Returns:
        The matching genes, sorted and joined with ', ' ('' when none match).
    """
    # Empty spreadsheet cells are read back as float NaN, which has no
    # .split(); treat them as "no proteins" instead of raising.
    if not isinstance(go_genes, str):
        return ''
    if genes is None:
        genes = gene_list
    go_gene_set = {label.strip() for label in go_genes.split(',')}
    return ', '.join(sorted(set(genes) & go_gene_set))

# Record the shortlisted genes found in each term, then keep only the terms
# with at least one hit.
go_df['Matching Genes'] = go_df['Associated proteins'].apply(extract_matching_genes)
has_hit = go_df['Matching Genes'].ne('')
filtered_go_df = go_df[has_hit]

# Write the filtered table.
output_file = "Filtered_GO_MF_With_21_Gene_Hits.xlsx"
filtered_go_df.to_excel(output_file, index=False)

print(f"Saved filtered GO terms with matching gene details to: {output_file}")

Saved filtered GO terms with matching gene details to: Filtered_GO_MF_With_21_Gene_Hits.xlsx


# GO: Cellular Component

In [None]:
# Extract the Cellular Component terms from the raw STRING export (tab-separated).
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/enrichment.Component.tsv"
df = pd.read_csv(file_path, sep="\t")

# Keep four STRING columns and give them the table's display names.
column_labels = {
    'term description': 'Description',
    '#term ID': 'GO term',
    'false discovery rate': 'FDR',
    'matching proteins in your network (labels)': 'Associated proteins',
}
df_filtered = df[list(column_labels)].rename(columns=column_labels)

# Convert FDR to Significance Format (Stars) 
def fdr_to_stars(fdr):
    """Translate an FDR value into a star-style significance label.

    Returns '****', '***', '**' or '*' for values strictly below 0.0001,
    0.001, 0.01 and 0.05 respectively, and 'ns' otherwise (including NaN,
    which compares False against every cut-off).
    """
    # Checked from strictest to loosest; the first cut-off the value falls
    # strictly below decides the label.
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for cutoff, stars in star_levels:
        if fdr < cutoff:
            return stars
    return 'ns'

# The 'p-value' column holds the star label derived from each row's FDR.
df_filtered['p-value'] = df_filtered['FDR'].apply(fdr_to_stars)

# Output layout: the numeric FDR column is left out.
table_columns = ['Description', 'GO term', 'p-value', 'Associated proteins']
df_final = df_filtered.loc[:, table_columns]

# Write the formatted table.
output_path = "ZIKV_component_Table.xlsx"
df_final.to_excel(output_path, index=False)

print("File saved as:", output_path)

## Keep the GO terms containing at least one shortlisted host gene
## (host genes with >= 2 virus-host interactions).

# Shortlisted host genes: first column of the workbook, blanks removed,
# values as strings.
gene_list_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/Shortlisted_Host_Genes_ZIKV.xlsx"
gene_list_df = pd.read_excel(gene_list_path)
shortlist_column = gene_list_df.iloc[:, 0]
gene_list = shortlist_column.dropna().astype(str).tolist()

# Read back the table that was just written.
go_table_path = "ZIKV_component_Table.xlsx"
go_df = pd.read_excel(go_table_path)

# Create a new column for matched genes 
def extract_matching_genes(go_genes, genes=None):
    """Return the shortlisted genes found in one comma-separated protein cell.

    Args:
        go_genes: Comma-separated gene labels from the 'Associated proteins'
            column. Non-string values (e.g. NaN for an empty cell) give ''.
        genes: Gene symbols to look for. Defaults to the notebook-level
            `gene_list`.

    Returns:
        The matching genes, sorted and joined with ', ' ('' when none match).
    """
    # Empty spreadsheet cells are read back as float NaN, which has no
    # .split(); treat them as "no proteins" instead of raising.
    if not isinstance(go_genes, str):
        return ''
    if genes is None:
        genes = gene_list
    go_gene_set = {label.strip() for label in go_genes.split(',')}
    return ', '.join(sorted(set(genes) & go_gene_set))

# Record the shortlisted genes found in each term, then keep only the terms
# with at least one hit.
go_df['Matching Genes'] = go_df['Associated proteins'].apply(extract_matching_genes)
has_hit = go_df['Matching Genes'].ne('')
filtered_go_df = go_df[has_hit]

# Write the filtered table.
output_file = "Filtered_GO_Component_With_21_Gene_Hits.xlsx"
filtered_go_df.to_excel(output_file, index=False)

print(f"Saved filtered GO terms with matching gene details to: {output_file}")


File saved as: ZIKV_component_Table.xlsx
Saved filtered GO terms with matching gene details to: Filtered_GO_Component_With_21_Gene_Hits.xlsx


# KEGG pathways

In [None]:
# Extract the KEGG pathways from the raw STRING export (tab-separated).
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/enrichment.KEGG.tsv"
df = pd.read_csv(file_path, sep="\t")

# Keep four STRING columns and give them the table's display names
# (the term ID column keeps the 'GO term' header used by the other tables).
column_labels = {
    'term description': 'Description',
    '#term ID': 'GO term',
    'false discovery rate': 'FDR',
    'matching proteins in your network (labels)': 'Associated proteins',
}
df_filtered = df[list(column_labels)].rename(columns=column_labels)

# === Step 3: Convert FDR to Significance Format (Stars) ===
def fdr_to_stars(fdr):
    """Translate an FDR value into a star-style significance label.

    Returns '****', '***', '**' or '*' for values strictly below 0.0001,
    0.001, 0.01 and 0.05 respectively, and 'ns' otherwise (including NaN,
    which compares False against every cut-off).
    """
    # Checked from strictest to loosest; the first cut-off the value falls
    # strictly below decides the label.
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for cutoff, stars in star_levels:
        if fdr < cutoff:
            return stars
    return 'ns'

# The 'p-value' column holds the star label derived from each row's FDR.
df_filtered['p-value'] = df_filtered['FDR'].apply(fdr_to_stars)

# Output layout: the numeric FDR column is left out.
table_columns = ['Description', 'GO term', 'p-value', 'Associated proteins']
df_final = df_filtered.loc[:, table_columns]

# Write the formatted table.
output_path = "ZIKV_KEGG_Table.xlsx"
df_final.to_excel(output_path, index=False)

print("File saved as:", output_path)

## Keep the pathways containing at least one shortlisted host gene
## (host genes with >= 2 virus-host interactions).

# Shortlisted host genes: first column of the workbook, blanks removed,
# values as strings.
gene_list_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/Shortlisted_Host_Genes_ZIKV.xlsx"
gene_list_df = pd.read_excel(gene_list_path)
shortlist_column = gene_list_df.iloc[:, 0]
gene_list = shortlist_column.dropna().astype(str).tolist()

# Read back the table that was just written.
go_table_path = "ZIKV_KEGG_Table.xlsx"
go_df = pd.read_excel(go_table_path)

# === Step 3: Create a new column for matched genes ===
def extract_matching_genes(go_genes, genes=None):
    """Return the shortlisted genes found in one comma-separated protein cell.

    Args:
        go_genes: Comma-separated gene labels from the 'Associated proteins'
            column. Non-string values (e.g. NaN for an empty cell) give ''.
        genes: Gene symbols to look for. Defaults to the notebook-level
            `gene_list`.

    Returns:
        The matching genes, sorted and joined with ', ' ('' when none match).
    """
    # Empty spreadsheet cells are read back as float NaN, which has no
    # .split(); treat them as "no proteins" instead of raising.
    if not isinstance(go_genes, str):
        return ''
    if genes is None:
        genes = gene_list
    go_gene_set = {label.strip() for label in go_genes.split(',')}
    return ', '.join(sorted(set(genes) & go_gene_set))

# Record the shortlisted genes found in each pathway, then keep only the
# pathways with at least one hit.
go_df['Matching Genes'] = go_df['Associated proteins'].apply(extract_matching_genes)
has_hit = go_df['Matching Genes'].ne('')
filtered_go_df = go_df[has_hit]

# Write the filtered table.
output_file = "Filtered_GO_KEGG_With_21_Gene_Hits.xlsx"
filtered_go_df.to_excel(output_file, index=False)

print(f"Saved filtered GO terms with matching gene details to: {output_file}")


File saved as: ZIKV_KEGG_Table.xlsx
Saved filtered GO terms with matching gene details to: Filtered_GO_KEGG_With_21_Gene_Hits.xlsx


# Reactome pathways

In [None]:
# Extract the Reactome pathways from the raw STRING export (tab-separated).
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/enrichment.RCTM.tsv"
df = pd.read_csv(file_path, sep="\t")

# Keep four STRING columns and give them the table's display names
# (the term ID column keeps the 'GO term' header used by the other tables).
column_labels = {
    'term description': 'Description',
    '#term ID': 'GO term',
    'false discovery rate': 'FDR',
    'matching proteins in your network (labels)': 'Associated proteins',
}
df_filtered = df[list(column_labels)].rename(columns=column_labels)

# Convert FDR to Significance Format (Stars)
def fdr_to_stars(fdr):
    """Translate an FDR value into a star-style significance label.

    Returns '****', '***', '**' or '*' for values strictly below 0.0001,
    0.001, 0.01 and 0.05 respectively, and 'ns' otherwise (including NaN,
    which compares False against every cut-off).
    """
    # Checked from strictest to loosest; the first cut-off the value falls
    # strictly below decides the label.
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for cutoff, stars in star_levels:
        if fdr < cutoff:
            return stars
    return 'ns'

# The 'p-value' column holds the star label derived from each row's FDR.
df_filtered['p-value'] = df_filtered['FDR'].apply(fdr_to_stars)

# Output layout: the numeric FDR column is left out.
table_columns = ['Description', 'GO term', 'p-value', 'Associated proteins']
df_final = df_filtered.loc[:, table_columns]

# Write the formatted table.
output_path = "ZIKV_RCTM_Table.xlsx"
df_final.to_excel(output_path, index=False)

print("File saved as:", output_path)

## Keep the pathways containing at least one shortlisted host gene
## (host genes with >= 2 virus-host interactions).

# Shortlisted host genes: first column of the workbook, blanks removed,
# values as strings.
gene_list_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/Shortlisted_Host_Genes_ZIKV.xlsx"
gene_list_df = pd.read_excel(gene_list_path)
shortlist_column = gene_list_df.iloc[:, 0]
gene_list = shortlist_column.dropna().astype(str).tolist()

# Read back the table that was just written.
go_table_path = "ZIKV_RCTM_Table.xlsx"
go_df = pd.read_excel(go_table_path)

# Create a new column for matched genes
def extract_matching_genes(go_genes, genes=None):
    """Return the shortlisted genes found in one comma-separated protein cell.

    Args:
        go_genes: Comma-separated gene labels from the 'Associated proteins'
            column. Non-string values (e.g. NaN for an empty cell) give ''.
        genes: Gene symbols to look for. Defaults to the notebook-level
            `gene_list`.

    Returns:
        The matching genes, sorted and joined with ', ' ('' when none match).
    """
    # Empty spreadsheet cells are read back as float NaN, which has no
    # .split(); treat them as "no proteins" instead of raising.
    if not isinstance(go_genes, str):
        return ''
    if genes is None:
        genes = gene_list
    go_gene_set = {label.strip() for label in go_genes.split(',')}
    return ', '.join(sorted(set(genes) & go_gene_set))

# Record the shortlisted genes found in each pathway, then keep only the
# pathways with at least one hit.
go_df['Matching Genes'] = go_df['Associated proteins'].apply(extract_matching_genes)
has_hit = go_df['Matching Genes'].ne('')
filtered_go_df = go_df[has_hit]

# Write the filtered table.
output_file = "Filtered_GO_RCTM_With_21_Gene_Hits.xlsx"
filtered_go_df.to_excel(output_file, index=False)

print(f"Saved filtered GO terms with matching gene details to: {output_file}")


File saved as: ZIKV_RCTM_Table.xlsx
Saved filtered GO terms with matching gene details to: Filtered_GO_RCTM_With_21_Gene_Hits.xlsx


### Output file for short list for GO terms: 'Short list of GO terms and pathways.xlsx'

### 3. Filter 10-12 host targets based on the frequency of the genes in the short list for GO terms ###
## Filter genes involved in multiple GO and sort them highest to lowest counts

In [None]:
# Count how often each gene appears across the shortlisted GO terms / pathways.

# Read the curated shortlist workbook.
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/Short list of GO terms and pathways.xlsx"
df = pd.read_excel(file_path)

# Column of comma-separated gene lists; empty cells are ignored.
column_name = "Matching Genes (genes >= 2 virus-host interactions)"
gene_series = df[column_name].dropna().astype(str)

# One flat list holding every gene mention, with whitespace trimmed.
all_genes = []
for entry in gene_series:
    all_genes.extend(gene.strip() for gene in entry.split(","))

# Tally the mentions.
gene_counts = Counter(all_genes)

# Tabulate, most frequent gene first.
df_counts = pd.DataFrame(list(gene_counts.items()), columns=["Gene", "Count"])
df_counts = df_counts.sort_values(by="Count", ascending=False)

# Write the frequency table.
output_file = "Gene_Frequency_From_File.xlsx"
df_counts.to_excel(output_file, index=False)

print(f"Saved gene frequency table to: {output_file}")


Saved gene frequency table to: Gene_Frequency_From_File.xlsx


### 3. Filter 10-12 host targets based on the frequency of the genes from all GO terms ###

In [None]:
# Count how often each gene appears across all GO terms from STRING.
import pandas as pd
from collections import Counter

# Read the workbook that lists every GO term.
file_path = "C:/Users/ruman/Desktop/Project 2025/Final topic/1.1. Neural Stem Cell Analysis/STRING/ZIKV/Filtered files/All GO terms from STRING_ZIKA.xlsx"
df = pd.read_excel(file_path)

# Column of comma-separated gene lists; empty cells are ignored.
column_name = "Matching Genes"
gene_series = df[column_name].dropna().astype(str)

# One flat list holding every gene mention, with whitespace trimmed.
all_genes = []
for entry in gene_series:
    all_genes.extend(gene.strip() for gene in entry.split(","))

# Tally the mentions.
gene_counts = Counter(all_genes)

# Tabulate, most frequent gene first.
df_counts = pd.DataFrame(list(gene_counts.items()), columns=["Gene", "Count"])
df_counts = df_counts.sort_values(by="Count", ascending=False)

# Write the frequency table.
output_file = "Gene_Frequency_From_All_GO_terms.xlsx"
df_counts.to_excel(output_file, index=False)

print(f"Saved gene frequency table to: {output_file}")


Saved gene frequency table to: Gene_Frequency_From_All_GO_terms.xlsx


# Output file for frequency: 'Gene_Frequency_From_All_GO_terms.xlsx'

# Final file for 15 short listed Drug Target Genes: 'Final_Target_Genes_shortlist'

In [None]:
# Inputs: PPI workbook and the ZIKV DEG annotation table.
ppi_file = "PPI_raw_Cleaned.xlsx"
zikv_deg_file = "ZIKV_DEG_annotation.csv"

df = pd.read_excel(ppi_file)
zikv_deg = pd.read_csv(zikv_deg_file)


def _virus_from_bait(bait):
    """Classify a bait label as 'ZIKV', 'DENV' or 'Other' by substring."""
    if 'ZIKV' in bait:
        return 'ZIKV'
    if 'DENV' in bait:
        return 'DENV'
    return 'Other'


# Virus tag, viral protein name (last whitespace-separated token of the bait)
# and host gene symbol (first token of 'Gene.names').
df['Virus'] = df['Bait'].apply(_virus_from_bait)
df['Bait_Protein'] = df['Bait'].apply(lambda bait: bait.split()[-1])
df['GeneSymbol'] = df['Gene.names'].str.split().str.get(0)

# ZIKV interactions only, plus the unique non-missing values of the first
# column of the DEG annotation file.
df_zikv = df.loc[df['Virus'].eq('ZIKV')].copy()
zikv_deg_genes = zikv_deg.iloc[:, 0].dropna().unique()

# Genes left out of the filtered network.
excluded_genes = ['CDK1', 'PCNA', 'AURKB', 'FANCD2', 'SQSTM1', 'SLC3A2', 'ASNS', 'SLC7A5', 'SLC25A10']

# Create DEG-Filtered Cytoscape Files (excluding the 9 genes)
def make_zikv_cytoscape_excluding_9(df_zikv, deg_genes, deg_annot, excluded_genes):
    """Build and save ZIKV Cytoscape tables for DEG hosts minus an exclusion list.

    Interactions in *df_zikv* are limited to hosts whose 'GeneSymbol' is in
    *deg_genes* and not in *excluded_genes*; only the two highest-'Abundance'
    rows per host are kept. Nodes are left-merged with *deg_annot* on its
    first column. Writes 'ZIKV_cytoscape_edges_filtered.csv' and
    'ZIKV_cytoscape_nodes_filtered.csv'.

    Returns (edges, annotated_nodes).
    """
    # DEG hosts that are not excluded.
    is_deg_host = df_zikv['GeneSymbol'].isin(deg_genes)
    is_excluded = df_zikv['GeneSymbol'].isin(excluded_genes)
    ranked = df_zikv[is_deg_host & ~is_excluded].copy()

    # Most abundant interactions first, two rows per host gene.
    ranked = ranked.sort_values(by='Abundance', ascending=False)
    top_hits = ranked.groupby('GeneSymbol', as_index=False).head(2)

    # Edge table: viral protein -> host gene.
    edges = top_hits[['Bait_Protein', 'GeneSymbol']].copy()
    edges.columns = ['Source', 'Target']
    edges['Interaction'] = 'virus-host'
    edges.to_csv("ZIKV_cytoscape_edges_filtered.csv", index=False)

    # Node table: viral proteins first, then host genes.
    viral_nodes = top_hits['Bait_Protein'].unique()
    host_nodes = top_hits['GeneSymbol'].unique()
    node_df = pd.DataFrame({'Node': list(viral_nodes) + list(host_nodes)})

    def _node_type(node):
        return 'Virus' if node in viral_nodes else 'Host'

    node_df['Type'] = node_df['Node'].apply(_node_type)

    # A host counts as high-confidence when any kept row has a non-missing score.
    has_score = top_hits['ZIKV-Human M>0.72(fp) or 0.69 (ug)'].notna()
    node_df['ZIKV_HighConfidence'] = node_df['Node'].isin(top_hits.loc[has_score, 'GeneSymbol'])

    # Attach the DEG annotation, then drop the duplicated key column.
    annot_key = deg_annot.columns[0]
    annotated_nodes = node_df.merge(deg_annot, left_on="Node", right_on=annot_key, how="left")
    annotated_nodes.drop(columns=[annot_key], inplace=True)
    annotated_nodes.to_csv("ZIKV_cytoscape_nodes_filtered.csv", index=False)

    return edges, annotated_nodes

# Build and save the filtered ZIKV network.
edges, nodes = make_zikv_cytoscape_excluding_9(
    df_zikv=df_zikv,
    deg_genes=zikv_deg_genes,
    deg_annot=zikv_deg,
    excluded_genes=excluded_genes,
)


def _join_baits(baits):
    """Join the distinct bait names alphabetically with ', '."""
    return ", ".join(sorted(set(baits)))


# For each excluded gene, list the ZIKV proteins it interacts with
# (taken from the full ZIKV table, not only the DEG-filtered rows).
excluded_interactions = df_zikv[df_zikv['GeneSymbol'].isin(excluded_genes)]
interaction_summary = excluded_interactions.groupby('GeneSymbol')['Bait_Protein'].unique().reset_index()
interaction_summary.columns = ['Gene', 'ZIKV_Proteins']
interaction_summary['ZIKV_Proteins'] = interaction_summary['ZIKV_Proteins'].apply(_join_baits)
interaction_summary.to_excel("ZIKV_interactions_for_excluded_genes.xlsx", index=False)

print("Cytoscape files and ZIKV-interacting excluded genes saved.")


Cytoscape files and ZIKV-interacting excluded genes saved.


outputs: 
ZIKV_cytoscape_edges_filtered.csv: edges for DEG host proteins excluding the 9 genes

ZIKV_cytoscape_nodes_filtered.csv: annotated node file for Cytoscape.

ZIKV_interactions_for_excluded_genes.xlsx: list of 9 genes and which ZIKV proteins interact with them.

In [1]:
# Save to PDF
!jupyter nbconvert "PPI_Analysis.ipynb" --to webpdf \
  --WebPDFExporter.allow_chromium_download=True \
  --output "PPI_Analysis"

[NbConvertApp] Converting notebook PPI_Analysis.ipynb to webpdf
[NbConvertApp] Building PDF
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 151805 bytes to PPI_Analysis.pdf
