# Oncogene Dataset Downloader
This notebook automates the process of downloading, processing, and exporting biomedical datasets for oncogene analysis using public APIs and bioinformatics libraries.

In [None]:
# Install required libraries if not already installed
# !pip install bioservices
# !pip install elementpath
# !pip install biopython  # provides the `Bio` package imported below
import pandas as pd
import numpy as np
import os
import csv
from io import StringIO
from Bio.ExPASy import ScanProsite, Prosite
from Bio import SeqIO
import xml.etree.ElementTree as ET
import requests

# Lift the column display limit so wide DataFrames are shown without truncation
pd.set_option('display.max_columns', None)


In [None]:
# Set working directory ("." is a no-op; edit this path when the notebook is
# launched from somewhere other than the project root)
os.chdir(".")

# Load cancer and healthy drivers datasets (tab-separated annotation tables)
cancer_drivers = pd.read_csv("datasets/NCG_cancerdrivers_annotation_supporting_evidence.tsv", sep="\t")
healthy_drivers = pd.read_csv("datasets/NCG_healthydrivers_annotation_supporting_evidence.tsv", sep="\t")

# Tag cancer rows with their origin so they stay distinguishable after merging
cancer_drivers["driver_type"] = "cancer"

# Drop problematic gene
cancer_drivers.drop(cancer_drivers[cancer_drivers["symbol"] == "WAS"].index, inplace=True)

# Reduce the healthy table to the columns of interest. `.copy()` detaches the
# selection from the original frame so the column assignment below is safe.
healthy_drivers = healthy_drivers[['entrez', 'symbol', 'pubmed_id', 'type', 'organ_system']].copy()

# Tag healthy rows AFTER the column selection: tagging before it (as was done
# previously) discarded the label again, leaving NaN in `driver_type` for
# every healthy row of the merged table.
healthy_drivers["driver_type"] = "healthy"

# Stack both tables; columns present only in the cancer table are NaN for
# healthy rows.
drivers = pd.concat([cancer_drivers, healthy_drivers])

# Save datasets
cancer_drivers.to_csv("datasets/cancer_drivers.csv", index=False)
healthy_drivers.to_csv("datasets/healthy_drivers.csv", index=False)
drivers.to_csv("datasets/drivers.csv", index=False)

# Display unique genes
gene_symbols = drivers["symbol"].unique()
print(f"Number of unique driver genes: {len(gene_symbols)}")


In [None]:
from bioservices import UniProt

u = UniProt()

# Query UniProt once per driver gene symbol (reviewed human entries only,
# organism 9606) and collect the per-gene result tables. The frames are
# gathered in a list and concatenated once at the end: concatenating inside
# the loop re-copies the accumulated table on every iteration.
search_frames = []
for i, gene in enumerate(gene_symbols):
    info = u.search(query=f"gene_exact:{gene.strip()}+AND+organism_id:9606+AND+reviewed:true", frmt="tsv")
    df_info = pd.read_csv(StringIO(info), sep="\t")
    # First whitespace-separated token of "Gene Names" is kept as reference name
    df_info["Gene Name Reference"] = df_info["Gene Names"].str.split().str[0]
    search_frames.append(df_info)
    print(f"Gene {i}: {gene}")

# An empty gene list yields an empty frame, as the incremental concat did
uniprot_info = pd.concat(search_frames, ignore_index=True) if search_frames else pd.DataFrame()
uniprot_info.to_csv("uniprot_info.csv", index=False)

# Get detailed info for each entry
AC_uniprot = list(uniprot_info["Entry"])
detail_frames = []

for i, ac in enumerate(AC_uniprot):
    detail_frames.append(u.get_df(ac))
    print(f"Accession {i}: {ac}")

# ignore_index=True gives the combined table a unique 0..n-1 index. Without
# it every per-accession frame keeps its own labels, so the result carries
# duplicate labels and later label-based row operations (e.g. dropping rows
# by index) cannot address individual rows.
uniprot_info_2 = pd.concat(detail_frames, ignore_index=True) if detail_frames else pd.DataFrame()
uniprot_info_2.to_csv("datasets/uniprot_info_2.csv", index=False)


In [None]:
# Drop problematic sequences by index label.
# NOTE(review): these labels are hard-coded and presumably refer to one
# specific download; `drop` raises KeyError if they are absent — confirm
# they still match the current `uniprot_info_2`.
uniprot_info_2.drop([1961, 2893], inplace=True)

# XML namespace used by the ScanProsite response, and the child elements of
# each <match> that are exported (in output column order).
scanprosite_ns = "{urn:expasy:scanprosite}"
match_fields = ("signature_ac", "start", "stop", "sequence_ac")

# ScanProsite and write motif info: one CSV row per motif match, prefixed
# with the UniProt accession of the scanned sequence. No header row is written.
with open("datasets/prosite_protein_info.csv", 'w', newline='') as file_csv:
    writer = csv.writer(file_csv)
    for i, sequence in enumerate(uniprot_info_2["Sequence"]):
        # Best-effort: any failure for one sequence (network, XML parsing,
        # missing element) is reported and the loop moves on to the next one.
        try:
            handle = ScanProsite.scan(seq=sequence)
            try:
                result = handle.read().decode('utf-8')
            finally:
                # Release the network handle even if reading/decoding fails
                handle.close()
            root = ET.fromstring(result)
            # `i` is a position (enumerate), so the accession is looked up
            # positionally as well.
            uniprot_ac = uniprot_info_2.iloc[i]["Entry"]
            for match in root.findall(f'.//{scanprosite_ns}match'):
                values = [match.find(f'{scanprosite_ns}{tag}').text for tag in match_fields]
                writer.writerow([uniprot_ac, *values])
        except Exception as e:
            print(f"Error on sequence {i}: {e}")


In [None]:
# Reload the raw match rows (written without a header) under explicit column
# names and save a copy that includes the header.
prosite_protein_info = pd.read_csv("datasets/prosite_protein_info.csv", names=["uniprot_ac", "signature_ac", "start", "stop", "sequence_ac"])
prosite_protein_info.to_csv("datasets/prosite_protein_info_total.csv", index=False)

# Parse prosite.dat for motif information
AC_prosite = prosite_protein_info["signature_ac"].unique()

# Read prosite.dat a single time and index the records of interest by
# accession. Rewinding and re-parsing the whole file for every accession made
# the cost grow with (number of motifs x number of records in the file).
wanted_accessions = set(AC_prosite)
rows_by_accession = {}
with open("prosite.dat") as handle:
    for record in Prosite.parse(handle):
        if record.accession in wanted_accessions:
            rows_by_accession.setdefault(record.accession, []).append(
                [record.accession, record.name, record.description, record.pattern]
            )

# Emit rows in the order the accessions first appear in the match table, so
# the output order is the same as with the per-accession scan.
prosite_info = []
for i, ac in enumerate(AC_prosite):
    print(f"Motif {i}: {ac}")
    prosite_info.extend(rows_by_accession.get(ac, []))

# Save to CSV (no header row)
with open("datasets/prosite_info.csv", 'w', newline='') as file_csv:
    writer = csv.writer(file_csv)
    writer.writerows(prosite_info)

# Last expression of the cell: displays the saved table in the notebook
pd.read_csv("datasets/prosite_info.csv", names=["accession", "name", "description", "pattern"])


In [None]:
# Helper function to download TSV and convert to CSV
def download_signor_data(url, protein_list, params_base, output_path="datasets/signor_info.csv", timeout=30):
    """Download one TSV table per protein and append it to a CSV file.

    For each entry of ``protein_list`` a GET request is sent to ``url`` with
    ``params_base`` plus a ``proteins`` parameter. A 200 response is parsed
    as tab-separated text and appended to ``output_path``.

    The first line of every response is consumed as the header by
    ``pd.read_csv`` and is not written, because rows are appended with
    ``header=False``; the output file therefore has no header row.
    NOTE(review): this assumes the service sends a header line — if it does
    not, the first data row of each response is lost; confirm.

    Failures are best-effort: a non-200 status or any exception for one
    protein is printed and the loop continues with the next protein.

    Args:
        url: Endpoint to query.
        protein_list: Iterable of protein/gene names; missing values
            (``None``/NaN) are skipped.
        params_base: Query parameters shared by every request.
        output_path: CSV file the rows are appended to.
        timeout: Seconds to wait for the server before giving up on a
            request; without a timeout a stalled connection blocks forever.

    Returns:
        None.
    """
    for protein in protein_list:
        # A missing name would otherwise be sent to the API as the text "nan"
        if pd.isna(protein):
            print("Skipping missing protein name")
            continue
        params = {**params_base, "proteins": protein}
        try:
            response = requests.get(url, params=params, timeout=timeout)
            if response.status_code == 200:
                df_tsv = pd.read_csv(StringIO(response.text), sep='\t')
                df_tsv.to_csv(output_path, mode='a', index=False, header=False)
                print(f"Downloaded data for {protein}")
            else:
                print(f"Request error for {protein}: {response.status_code}")
        except Exception as e:
            # Deliberately broad: one bad response must not abort the batch
            print(f"Error for {protein}: {e}")

# Endpoint of the CancerGeneNet data service and the query parameters that
# are identical for every request
url_api = "https://signor.uniroma2.it/CancerGeneNet/getData.php"
params_base = dict(type="shortestPath", phenotype="", output="summary")

# Issue one request per primary gene name found in the detailed UniProt table
primary_gene_names = uniprot_info_2['Gene Names (primary)']
download_signor_data(url_api, primary_gene_names, params_base)
