In [5]:
import os
import re
import pandas as pd
import time
import requests
from urllib.parse import quote

In [6]:
# Load the two input tables used throughout the notebook.
# `example` is only previewed below; `test` supplies the labels classified later on.
example = pd.read_csv(filepath_or_buffer="Example.csv", sep=",")
test = pd.read_csv(filepath_or_buffer="Test.csv", sep=",")

# Keep this as the last expression so the notebook renders the preview.
example.head()

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome,Accession numbers,RefSeq IDs,NCBI Gene ID(supplied by NCBI),Ensembl ID(supplied by Ensembl),UniProt ID(supplied by UniProt),Mouse genome database ID(supplied by MGI),Locus type
0,HGNC:5,A1BG,alpha-1-B glycoprotein,Approved,,,19q13.43,,NM_130786,1.0,ENSG00000121410,P04217,MGI:2152878,gene with protein product
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,Approved,"NCRNA00181, A1BGAS, A1BG-AS",FLJ23569,19q13.43,BC040926,NR_015380,503538.0,ENSG00000268895,,,"RNA, long non-coding"
2,HGNC:24086,A1CF,APOBEC1 complementation factor,Approved,,"ACF, ASP, ACF64, ACF65, APOBEC1CF",10q11.23,AF271790,NM_014576,29974.0,ENSG00000148584,Q9NQ94,MGI:1917115,gene with protein product
3,HGNC:6,A1S9T,"symbol withdrawn, see [HGNC:12469](/data/gene-...",Symbol Withdrawn,,,,,,,,,,unknown
4,HGNC:7,A2M,alpha-2-macroglobulin,Approved,,"FWP007, S863-7, CPAMD5",12p13.31,"BX647329, X68728, M11313",NM_000014,2.0,ENSG00000175899,P01023,MGI:2449119,gene with protein product


In [7]:
#1. Extract a single column of names (from a pandas DataFrame or a CSV file path)

def extract_names(col_name, dataset):
    """Return one column of ``dataset`` as a pandas Series.

    Parameters
    ----------
    col_name : str
        Name of the column to extract.
    dataset : pandas.DataFrame, str or os.PathLike
        Either an in-memory DataFrame (used as-is, no copy is made) or a path
        to a CSV file, which is read with ``pd.read_csv`` using its defaults.

    Returns
    -------
    pandas.Series
        The requested column.

    Raises
    ------
    ValueError
        If ``dataset`` is neither a DataFrame nor a path, if the path does not
        exist, if the file cannot be read, or if ``col_name`` is not a column.
    """
    if isinstance(dataset, pd.DataFrame):
        # Already a DataFrame: use it directly.
        df = dataset
    elif isinstance(dataset, (str, os.PathLike)):
        # Strings and path objects (e.g. pathlib.Path) are treated as CSV paths.
        if not os.path.exists(dataset):
            raise ValueError(f"The file path provided does not exist: {dataset}")
        try:
            df = pd.read_csv(dataset)
        except Exception as e:
            # Re-raise as ValueError for a uniform error type, keeping the
            # original exception attached as the explicit cause.
            raise ValueError(f"Error reading the file: {e}") from e
    else:
        raise ValueError("Unsupported dataset type. Please provide a pandas DataFrame or a valid CSV file path (string).")

    # Validate the column before indexing so the caller gets a helpful message
    # listing the available columns instead of a bare KeyError.
    if col_name not in df.columns:
        raise ValueError(f"Column '{col_name}' not found in the dataset. Available columns: {list(df.columns)}")

    return df[col_name]

In [8]:
#2. Classifying

def sort_labels(gene_series: pd.Series) -> pd.DataFrame:
    """Classify every label in ``gene_series`` with ``get_category``.

    Returns a two-column DataFrame: ``Gene`` holds the input labels unchanged
    and ``Type`` holds the category string computed for each label.
    """
    columns = {
        "Gene": gene_series,
        # One get_category call per entry, in series order.
        "Type": gene_series.apply(get_category),
    }
    return pd.DataFrame(columns)
    
def get_category(label):
    """Classify a single gene label and return its category as a string.

    The label is first checked against three ID formats, without any network
    access:

    * ``ENSG`` followed by digits          -> ``"Ensembl ID"``
    * digits with an optional ``.digits``  -> ``"NCBI ID"`` (the decimal form
      covers IDs that were stored as floats, e.g. ``"1.0"``)
    * ``HGNC:`` followed by digits         -> ``"HGNC ID"``

    Anything else is passed to ``query_hgnc_fetch``; its return value is used
    as the category, or ``"Unknown/Un-Mapped"`` when it returns ``None``.

    Missing values (``None``, NaN, ``pd.NA``) and blank strings are reported as
    ``"Unknown/Un-Mapped"`` directly. Previously they were stringified to
    ``"nan"``/``"None"`` and sent to the HGNC service as if they were symbols.

    Parameters
    ----------
    label : Any
        The raw label; it is converted with ``str`` and stripped of
        surrounding whitespace before matching.

    Returns
    -------
    str
        The category name.
    """
    # Detect missing scalars before str() turns them into "nan" / "None" / "<NA>".
    is_missing = pd.api.types.is_scalar(label) and pd.isna(label)
    label = "" if is_missing else str(label).strip()
    print(f"\n1. Processing label '{label}'")

    if not label:
        print("2. Label is missing or empty, skipping HGNC lookup.")
        return "Unknown/Un-Mapped"

    # Local format checks, tried in order; the first full match wins.
    id_formats = (
        (r"ENSG\d+", "2. Label matches Ensembl ID format.", "Ensembl ID"),
        (r"\d+(\.\d+)?", "2. Label matches NCBI ID format.", "NCBI ID"),
        (r"HGNC:\d+", "2. Label is already an HGNC ID.", "HGNC ID"),
    )
    for pattern, message, category in id_formats:
        # fullmatch (rather than match + "$") also rejects a trailing newline.
        if re.fullmatch(pattern, label):
            print(message)
            return category

    print("2. Label does not match ENSEMBL/NCBI/HCNC ID, now checking via HGNC REST API")
    type_query = query_hgnc_fetch(label)

    if type_query is not None:
        print(f"3. Label is a {type_query}")
        return str(type_query)

    # NOTE(review): the message mentions a log file, but nothing is written to
    # one in this function - confirm whether logging is still planned.
    print("3. Unknown/Un-Mapped, added to log file")
    return "Unknown/Un-Mapped"
    
def query_hgnc_fetch(gene_symbol, timeout=10.0, session=None, delay=0.1):
    """Look up ``gene_symbol`` on the HGNC REST service and report how it matched.

    The search endpoints are tried in a fixed order (alias symbol, alias name,
    previous symbol, approved symbol). Each query is restricted to records with
    ``status:Approved``. The name of the first endpoint that reports at least
    one record is returned.

    Parameters
    ----------
    gene_symbol : str
        The label to search for. It is percent-encoded before being placed in
        the URL.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot hang the notebook.
    session : object, optional
        Any object exposing ``get(url, headers=..., timeout=...)``, e.g. a
        ``requests.Session``. Defaults to the ``requests`` module itself.
    delay : float, optional
        Seconds to pause after every request, successful or not, to throttle
        the request rate. Use ``0`` to disable.

    Returns
    -------
    str or None
        One of ``"Alias gene symbol"``, ``"Alias name"``,
        ``"Previous HGNC symbol"``, ``"Approved symbol"``, or ``None`` when no
        endpoint found a record. Request errors, non-200 responses and
        unparsable payloads are printed and treated as "not found" for that
        endpoint.
    """
    headers = {"Accept": "application/json"}
    # Tried in order until a record is found (approved symbol deliberately last).
    endpoints = (
        ("Alias gene symbol", "https://rest.genenames.org/search/alias_symbol/"),
        ("Alias name", "https://rest.genenames.org/search/alias_name/"),
        ("Previous HGNC symbol", "https://rest.genenames.org/search/prev_symbol/"),
        # Field-specific search: per the HGNC REST documentation the bare
        # /search/ endpoint is not restricted to the approved-symbol field, so
        # it could label a hit in some other field as "Approved symbol".
        ("Approved symbol", "https://rest.genenames.org/search/symbol/"),
    )
    status_filter = "+AND+status:Approved"

    # The encoded symbol is the same for every endpoint, so build it once.
    encoded_symbol = quote(gene_symbol, safe='')
    http = session if session is not None else requests

    for endpoint_type, base_url in endpoints:
        url = base_url + encoded_symbol + status_filter

        try:
            response = http.get(url, headers=headers, timeout=timeout)

            if response.status_code != 200:
                print(f"2.4. API returned error status code: {response.status_code} for gene symbol '{gene_symbol}' in {endpoint_type} search")
                print("Response text:", response.text)
                continue

            try:
                data = response.json()
            except Exception as json_err:
                print("2.3. Error parsing JSON:", json_err)
                continue  # Try the next endpoint

            # At least one record means the label matched this endpoint's field.
            if data.get("response", {}).get("numFound", 0) > 0:
                return endpoint_type
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next endpoint instead of aborting the whole classification run.
            print(f"2.4. Exception occurred while querying HGNC for {gene_symbol} in {endpoint_type} search: {e}")
        finally:
            # Throttle after every request, including successful ones and the
            # early `continue` paths, so back-to-back calls are also spaced out.
            if delay:
                time.sleep(delay)

    return None

In [9]:
# Pull the "Name" column out of the test table, then classify every entry.
# Classification prints a progress trace per label and may query the HGNC service.
Testy = extract_names(col_name="Name", dataset=test)
Sorted = sort_labels(gene_series=Testy)


1. Processing label 'HGNC:5'
2. Label is already an HGNC ID.

1. Processing label 'A1BG'
2. Label does not match ENSEMBL/NCBI/HCNC ID, now checking via HGNC REST API
3. Label is a Approved symbol

1. Processing label 'NCRNA00181'
2. Label does not match ENSEMBL/NCBI/HCNC ID, now checking via HGNC REST API
3. Label is a Previous HGNC symbol

1. Processing label 'ACF'
2. Label does not match ENSEMBL/NCBI/HCNC ID, now checking via HGNC REST API
3. Label is a Alias gene symbol

1. Processing label '1.0'
2. Label matches NCBI ID format.

1. Processing label 'ENSG00000121410'
2. Label matches Ensembl ID format.

1. Processing label 'HGNC:37133'
2. Label is already an HGNC ID.

1. Processing label 'A1BG-AS1'
2. Label does not match ENSEMBL/NCBI/HCNC ID, now checking via HGNC REST API
3. Label is a Approved symbol

1. Processing label 'A3GALT2P'
2. Label does not match ENSEMBL/NCBI/HCNC ID, now checking via HGNC REST API
3. Label is a Previous HGNC symbol

1. Processing label 'FLJ23569'
2. La

In [12]:
def group_genes_by_type(df):
    """Split a classified table into one DataFrame per category.

    ``df`` must contain a ``Type`` column. The result maps each distinct
    ``Type`` value to the matching rows, with the ``Type`` column removed and
    the index renumbered from zero.
    """
    return {
        category: rows.drop(columns=['Type']).reset_index(drop=True)
        for category, rows in df.groupby('Type')
    }

# Bucket the classified labels by their detected type.
grouped_dfs = group_genes_by_type(Sorted)

# Show each bucket: a header line followed by the genes that fell into it.
for gene_type, group_df in grouped_dfs.items():
    print(f"\nGroup: {gene_type}", group_df, sep="\n")


Group: Alias gene symbol
       Gene
0       ACF
1  FLJ23569
2      MDR1

Group: Alias name
  Gene
0   AA

Group: Approved symbol
       Gene
0      A1BG
1  A1BG-AS1
2       A2M
3    ABCB11

Group: Ensembl ID
              Gene
0  ENSG00000121410
1  ENSG00000148584
2  ENSG00000085563
3  ENSG00000165029

Group: HGNC ID
         Gene
0      HGNC:5
1  HGNC:37133
2      HGNC:7
3     HGNC:42
4  HGNC:14637

Group: NCBI ID
      Gene
0      1.0
1  29974.0
2   5243.0
3     19.0

Group: Previous HGNC symbol
         Gene
0  NCRNA00181
1    A3GALT2P
2        BSEP
3      HDLDT1
