# Simple retrieval from genenames.org (better not to use, its non-exhaustive)

In [22]:
import requests

genes = ['ESR1', 'PGR', 'GATA3', 'TFF1', 'ESR2', 'ERBB4', 'FOXC1', 'MLPH', 'MAPT', 'SLC39A6',
    'NAT1', 'SLC7A5', 'ABCC4', 'BCL2', 'CCNB1', 'MYBL2', 'FOXM1', 'AURKA', 'CDC20', 'EXO1',
    'MELK', 'BIRC5', 'UBE2C', 'KIF2C', 'RRM2', 'TYMS', 'MMP11', 'ERBB2', 'GRB7', 'HER2',
    'LAPTM4B', 'STAT1', 'MIA', 'CTHRC1', 'CK5', 'EGFR', 'KRT17', 'KRT14', 'KRT6A', 'KRT6B',
    'KRT16', 'CDH3', 'CDH1', 'FOXA1', 'SCUBE2', 'SFRP1', 'AREG', 'TGFB1', 'NME1', 'NME2']

def get_alternative_names(gene):
    # Create the URL for the HGNC API
    url = f'https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=gd_aliases&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit&custom_sort=gd_app_sym'

    # Send a GET request to the API
    response = requests.get(url)

    # Split the response text into lines
    lines = response.text.split('\n')

    # Find the gene in the lines and extract the alternative names
    for line in lines:
        if line.startswith(gene + '\t'):
            data = line.split('\t')
            alternative_names = data[1].split('|')
            return alternative_names

    return []

my_genes = {}
for gene in genes:
    alternative_names = get_alternative_names(gene)
    if alternative_names:
        my_genes[gene] = alternative_names

for i in my_genes:
    print(i, my_genes[i])

ESR1
PGR
GATA3
TFF1
ESR2
ERBB4
FOXC1
MLPH
MAPT
SLC39A6
NAT1
SLC7A5
ABCC4
BCL2
CCNB1
MYBL2
FOXM1
AURKA
CDC20
EXO1
MELK
BIRC5
UBE2C
KIF2C
RRM2
TYMS
MMP11
ERBB2
GRB7
LAPTM4B
STAT1
MIA
CTHRC1
EGFR
KRT17
KRT14
KRT6A
KRT6B
KRT16
CDH3
CDH1
FOXA1
SCUBE2
SFRP1
AREG
TGFB1
NME1
NME2


# Retrieve IDs from NCBI for the list of genes

In [28]:
import requests
from bs4 import BeautifulSoup

pam50_gene_list = ['ESR1', 'ERBB2', 'PGR', 'MKI67', 'CCNB1', 'MYBL2', 'CDC20', 'UBE2C', 'BIRC5', 'ANLN', 'KRT5', 'EGFR', 'FOXA1', 'GRB7', 'ERBB3', 'MELK', 'NDC80', 'TMEM45B', 'ACTR3B', 'CDCA1', 'RAD51AP1', 'UBE2T', 'RRM2', 'AURKA', 'EXO1', 'TOP2A', 'FOXM1', 'MLPH', 'GLI2', 'CDC6', 'KIF2C', 'GSTM1', 'TYMS', 'MMP11', 'MIA', 'MMP9', 'CCNE1', 'JUNB', 'MAPT', 'CENPF', 'GPR160', 'RABEP1', 'PSRC1', 'SPAG5', 'MAFB', 'PGRMC1', 'GRB14', 'CXXC5', 'LIMCH1']

def get_gene_id(gene_name):
    url = f'https://www.ncbi.nlm.nih.gov/gene/?term={gene_name}+Homo+sapiens'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    gene_link = soup.find('a', text=gene_name)
    if gene_link:
        gene_id = gene_link.get('href').split('/')[-1]
        return gene_id
    else:
        return None

gene_ids = {}

for gene_name in pam50_gene_list:
    gene_id = get_gene_id(gene_name)
    gene_ids[gene_name] = gene_id

for gene_name, gene_id in gene_ids.items():
    print(f'Gene Name: {gene_name}, Gene ID: {gene_id}')

Gene Name: ESR1, Gene ID: 2099
Gene Name: ERBB2, Gene ID: 2064
Gene Name: PGR, Gene ID: 5241
Gene Name: MKI67, Gene ID: 4288
Gene Name: CCNB1, Gene ID: 891
Gene Name: MYBL2, Gene ID: 4605
Gene Name: CDC20, Gene ID: 991
Gene Name: UBE2C, Gene ID: 11065
Gene Name: BIRC5, Gene ID: 332
Gene Name: ANLN, Gene ID: 54443
Gene Name: KRT5, Gene ID: 3852
Gene Name: EGFR, Gene ID: 1956
Gene Name: FOXA1, Gene ID: 3169
Gene Name: GRB7, Gene ID: 2886
Gene Name: ERBB3, Gene ID: 2065
Gene Name: MELK, Gene ID: 9833
Gene Name: NDC80, Gene ID: 10403
Gene Name: TMEM45B, Gene ID: 120224
Gene Name: ACTR3B, Gene ID: 57180
Gene Name: CDCA1, Gene ID: None
Gene Name: RAD51AP1, Gene ID: 10635
Gene Name: UBE2T, Gene ID: 29089
Gene Name: RRM2, Gene ID: 6241
Gene Name: AURKA, Gene ID: 6790
Gene Name: EXO1, Gene ID: 9156
Gene Name: TOP2A, Gene ID: 7153
Gene Name: FOXM1, Gene ID: 2305
Gene Name: MLPH, Gene ID: 79083
Gene Name: GLI2, Gene ID: 2736
Gene Name: CDC6, Gene ID: 990
Gene Name: KIF2C, Gene ID: 11004
Gene Name

# Return all synonims for genes with IDs in the list

In [33]:
import requests
from bs4 import BeautifulSoup

gene_ids = [2099, 2064, 5241, 4288, 891, 4605, 991, 11065, 332, 54443, 3852, 1956, 3169, 2886, 2065, 9833, 10403, 120224, 57180, 83540, 10635, 29089, 6241, 6790, 9156, 7153, 2305, 79083, 2736, 990, 11004, 2944, 7298, 4320, 8190, 4318, 898, 3726, 4137, 1063, 26996, 9135, 84722, 10615, 9935, 10857, 2888, 51523, 22998]

def get_alternative_names(gene_id):
    url = f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    dt_elements = soup.find_all('dt')
    for dt_element in dt_elements:
        if dt_element.text == "Also known as":
            dd_element = dt_element.find_next('dd')
            alternative_names = dd_element.text.split('; ')
            return alternative_names
    return []

my_genes = {}
for gene_id in gene_ids:
    alternative_names = get_alternative_names(gene_id)
    my_genes[gene_id] = alternative_names
    print(f"Gene ID: {gene_id}")
    print(f"Alternative Names: {alternative_names}")

Gene ID: 2099
Alternative Names: ['ER', 'ESR', 'Era', 'ESRA', 'ESTRR', 'NR3A1']
Gene ID: 2064
Alternative Names: ['NEU', 'NGL', 'HER2', 'TKR1', 'CD340', 'HER-2', 'VSCN2', 'MLN 19', 'MLN-19', 'c-ERB2', 'c-ERB-2', 'HER-2/neu', 'p185(erbB2)']
Gene ID: 5241
Alternative Names: ['PR', 'NR3C3']
Gene ID: 4288
Alternative Names: ['KIA', 'MIB-', 'MIB-1', 'PPP1R105']
Gene ID: 891
Alternative Names: ['CCNB']
Gene ID: 4605
Alternative Names: ['BMYB', 'B-MYB']
Gene ID: 991
Alternative Names: ['CDC20A', 'OOMD14', 'p55CDC', 'OZEMA14', 'bA276H19.3']
Gene ID: 11065
Alternative Names: ['UBCH10', 'dJ447F3.2']
Gene ID: 332
Alternative Names: ['API4', 'EPR-1']
Gene ID: 54443
Alternative Names: ['scra', 'FSGS8', 'Scraps']
Gene ID: 3852
Alternative Names: ['K5', 'CK5', 'DDD', 'DDD1', 'EBS1', 'EBS2', 'EBS2A', 'EBS2B', 'EBS2C', 'EBS2D', 'EBS2E', 'EBS2F', 'KRT5A']
Gene ID: 1956
Alternative Names: ['ERBB', 'ERRP', 'HER1', 'mENA', 'ERBB1', 'PIG61', 'NISBD2']
Gene ID: 3169
Alternative Names: ['HNF3A', 'TCF3A']
Gene

In [32]:
gene_ids

['2099',
 '2064',
 '207',
 '3845',
 '7422',
 '6416',
 '672',
 '4085',
 '2308',
 '782',
 '5241',
 '673',
 '4318',
 '596',
 '2249',
 '55904',
 '9575',
 '2353',
 '6256',
 '5050',
 '4171',
 '4193',
 '7157',
 '860',
 '598',
 '3576',
 '6732',
 '6720',
 '5290',
 '3417',
 '6790',
 '5728',
 '2202',
 '4055',
 '864',
 '6193',
 '6233',
 '6194',
 '3569',
 '23411',
 '9844',
 '648',
 '9133',
 '10891',
 '8811',
 '23471',
 '868',
 '811',
 '6159',
 '4843',
 '54106']

In [36]:
genes

['ESR1',
 'PGR',
 'GATA3',
 'TFF1',
 'ESR2',
 'ERBB4',
 'FOXC1',
 'MLPH',
 'MAPT',
 'SLC39A6',
 'NAT1',
 'SLC7A5',
 'ABCC4',
 'BCL2',
 'CCNB1',
 'MYBL2',
 'FOXM1',
 'AURKA',
 'CDC20',
 'EXO1',
 'MELK',
 'BIRC5',
 'UBE2C',
 'KIF2C',
 'RRM2',
 'TYMS',
 'MMP11',
 'ERBB2',
 'GRB7',
 'HER2',
 'LAPTM4B',
 'STAT1',
 'MIA',
 'CTHRC1',
 'CK5',
 'EGFR',
 'KRT17',
 'KRT14',
 'KRT6A',
 'KRT6B',
 'KRT16',
 'CDH3',
 'CDH1',
 'FOXA1',
 'SCUBE2',
 'SFRP1',
 'AREG',
 'TGFB1',
 'NME1',
 'NME2']

In [35]:
my_genes

{2099: ['ER', 'ESR', 'Era', 'ESRA', 'ESTRR', 'NR3A1'],
 2064: ['NEU',
  'NGL',
  'HER2',
  'TKR1',
  'CD340',
  'HER-2',
  'VSCN2',
  'MLN 19',
  'MLN-19',
  'c-ERB2',
  'c-ERB-2',
  'HER-2/neu',
  'p185(erbB2)'],
 5241: ['PR', 'NR3C3'],
 4288: ['KIA', 'MIB-', 'MIB-1', 'PPP1R105'],
 891: ['CCNB'],
 4605: ['BMYB', 'B-MYB'],
 991: ['CDC20A', 'OOMD14', 'p55CDC', 'OZEMA14', 'bA276H19.3'],
 11065: ['UBCH10', 'dJ447F3.2'],
 332: ['API4', 'EPR-1'],
 54443: ['scra', 'FSGS8', 'Scraps'],
 3852: ['K5',
  'CK5',
  'DDD',
  'DDD1',
  'EBS1',
  'EBS2',
  'EBS2A',
  'EBS2B',
  'EBS2C',
  'EBS2D',
  'EBS2E',
  'EBS2F',
  'KRT5A'],
 1956: ['ERBB', 'ERRP', 'HER1', 'mENA', 'ERBB1', 'PIG61', 'NISBD2'],
 3169: ['HNF3A', 'TCF3A'],
 2886: [],
 2065: ['HER3',
  'FERLK',
  'LCCS2',
  'VSCN1',
  'ErbB-3',
  'c-erbB3',
  'erbB3-S',
  'MDA-BF-1',
  'c-erbB-3',
  'p180-ErbB3',
  'p45-sErbB3',
  'p85-sErbB3'],
 9833: ['HPK38'],
 10403: ['HEC', 'HEC1', 'TID3', 'KNTC2', 'HsHec1', 'hsNDC80'],
 120224: [],
 57180: ['AR

# Compiling them all together

In [37]:
genes_with_synonims = []
for gene in genes:
    genes_with_synonims.append(gene)
for gene_id in my_genes:
    for synonim in my_genes[gene_id]:
        genes_with_synonims.append(synonim)
genes_with_synonims

['ESR1',
 'PGR',
 'GATA3',
 'TFF1',
 'ESR2',
 'ERBB4',
 'FOXC1',
 'MLPH',
 'MAPT',
 'SLC39A6',
 'NAT1',
 'SLC7A5',
 'ABCC4',
 'BCL2',
 'CCNB1',
 'MYBL2',
 'FOXM1',
 'AURKA',
 'CDC20',
 'EXO1',
 'MELK',
 'BIRC5',
 'UBE2C',
 'KIF2C',
 'RRM2',
 'TYMS',
 'MMP11',
 'ERBB2',
 'GRB7',
 'HER2',
 'LAPTM4B',
 'STAT1',
 'MIA',
 'CTHRC1',
 'CK5',
 'EGFR',
 'KRT17',
 'KRT14',
 'KRT6A',
 'KRT6B',
 'KRT16',
 'CDH3',
 'CDH1',
 'FOXA1',
 'SCUBE2',
 'SFRP1',
 'AREG',
 'TGFB1',
 'NME1',
 'NME2',
 'ER',
 'ESR',
 'Era',
 'ESRA',
 'ESTRR',
 'NR3A1',
 'NEU',
 'NGL',
 'HER2',
 'TKR1',
 'CD340',
 'HER-2',
 'VSCN2',
 'MLN 19',
 'MLN-19',
 'c-ERB2',
 'c-ERB-2',
 'HER-2/neu',
 'p185(erbB2)',
 'PR',
 'NR3C3',
 'KIA',
 'MIB-',
 'MIB-1',
 'PPP1R105',
 'CCNB',
 'BMYB',
 'B-MYB',
 'CDC20A',
 'OOMD14',
 'p55CDC',
 'OZEMA14',
 'bA276H19.3',
 'UBCH10',
 'dJ447F3.2',
 'API4',
 'EPR-1',
 'scra',
 'FSGS8',
 'Scraps',
 'K5',
 'CK5',
 'DDD',
 'DDD1',
 'EBS1',
 'EBS2',
 'EBS2A',
 'EBS2B',
 'EBS2C',
 'EBS2D',
 'EBS2E',
 'EBS2F'

In [38]:
import pandas as pd

anova_results = pd.read_excel("View_pairwise_anova_selected.xlsx")
survival_results = pd.read_excel("View_univariate_survival_analysis_logHR_selected.xlsx")

In [45]:
anova_markers = []
survival_markers = []
for i in survival_results["Unnamed: 0"]:
    if i in genes_with_synonims:
        survival_markers.append(i)
for i in anova_results["Unnamed: 0"]:
    if i in genes_with_synonims:
        anova_markers.append(i)

In [46]:
anova_markers

['CDC20', 'MLPH']

In [47]:
survival_markers

['CDC20', 'MLPH']

In [50]:
genes_with_synonims

['ESR1',
 'PGR',
 'GATA3',
 'TFF1',
 'ESR2',
 'ERBB4',
 'FOXC1',
 'MLPH',
 'MAPT',
 'SLC39A6',
 'NAT1',
 'SLC7A5',
 'ABCC4',
 'BCL2',
 'CCNB1',
 'MYBL2',
 'FOXM1',
 'AURKA',
 'CDC20',
 'EXO1',
 'MELK',
 'BIRC5',
 'UBE2C',
 'KIF2C',
 'RRM2',
 'TYMS',
 'MMP11',
 'ERBB2',
 'GRB7',
 'HER2',
 'LAPTM4B',
 'STAT1',
 'MIA',
 'CTHRC1',
 'CK5',
 'EGFR',
 'KRT17',
 'KRT14',
 'KRT6A',
 'KRT6B',
 'KRT16',
 'CDH3',
 'CDH1',
 'FOXA1',
 'SCUBE2',
 'SFRP1',
 'AREG',
 'TGFB1',
 'NME1',
 'NME2',
 'ER',
 'ESR',
 'Era',
 'ESRA',
 'ESTRR',
 'NR3A1',
 'NEU',
 'NGL',
 'HER2',
 'TKR1',
 'CD340',
 'HER-2',
 'VSCN2',
 'MLN 19',
 'MLN-19',
 'c-ERB2',
 'c-ERB-2',
 'HER-2/neu',
 'p185(erbB2)',
 'PR',
 'NR3C3',
 'KIA',
 'MIB-',
 'MIB-1',
 'PPP1R105',
 'CCNB',
 'BMYB',
 'B-MYB',
 'CDC20A',
 'OOMD14',
 'p55CDC',
 'OZEMA14',
 'bA276H19.3',
 'UBCH10',
 'dJ447F3.2',
 'API4',
 'EPR-1',
 'scra',
 'FSGS8',
 'Scraps',
 'K5',
 'CK5',
 'DDD',
 'DDD1',
 'EBS1',
 'EBS2',
 'EBS2A',
 'EBS2B',
 'EBS2C',
 'EBS2D',
 'EBS2E',
 'EBS2F'

In [51]:
all_genes = pd.read_excel("all_genes_data_Basal-like+Claudin-low.xlsx")

In [52]:
all_markers = []
for i in all_genes["gene_name"]:
    if i in genes_with_synonims:
        all_markers.append(i)
all_markers

['MLPH', 'CDC20']

In [66]:
import numpy as np
mask = survival_results.notna()

In [67]:
for x in survival_results['All-tumours']:
    if x == survival_results[mask]:
        print(x)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [95]:
atsurv = []
la = []
lb = []
her2 = []
bl = []
for i in survival_results['Unnamed: 0'][survival_results['All-tumours'].notna()]:
    atsurv.append(i)
for i in survival_results['Unnamed: 0'][survival_results['Luminal-A'].notna()]:
    la.append(i)
for i in survival_results['Unnamed: 0'][survival_results['Luminal-B'].notna()]:
    lb.append(i)
for i in survival_results['Unnamed: 0'][survival_results['HER2-enriched'].notna()]:
    her2.append(i)
for i in survival_results['Unnamed: 0'][survival_results['Basal-like'].notna()]:
    bl.append(i)

In [96]:
groups = {0: "atsurv", 1: "la", 2: "lb", 3: "her2", 4: "bl"}
for group in range(5):
    for group2 in range(5):
        if group2 > group:
            for gene in [atsurv, la, lb, her2, bl][group]:
                if gene in [atsurv, la, lb, her2, bl][group2]:
                    print(groups[group], groups[group2], gene)

atsurv la MORC4
atsurv lb MORC4
atsurv her2 CSKMT
la lb MORC4
la lb SMYD4
la lb WDR88
la her2 BRPF1
la her2 CBX5
la her2 HIF1AN
la her2 RBSN
la her2 SETD5
lb her2 MLPH


In [115]:
atsurv = []
la = []
lb = []
her2 = []
bl = []
for i in anova_results['Unnamed: 0'][anova_results['All-tumours'].notna()]:
    atsurv.append(i)
for i in anova_results['Unnamed: 0'][anova_results['Luminal-A'].notna()]:
    la.append(i)
for i in anova_results['Unnamed: 0'][anova_results['Luminal-B'].notna()]:
    lb.append(i)
for i in anova_results['Unnamed: 0'][anova_results['HER2-enriched'].notna()]:
    her2.append(i)
for i in anova_results['Unnamed: 0'][anova_results['Basal-like'].notna()]:
    bl.append(i)

In [116]:
groups = {0: "atsurv", 1: "la", 2: "lb", 3: "her2", 4: "bl"}
for group in range(5):
    for group2 in range(5):
        if group2 > group:
            for gene in [atsurv, la, lb, her2, bl][group]:
                if gene in [atsurv, la, lb, her2, bl][group2]:
                    print(groups[group], groups[group2], gene)

atsurv la BOP1
atsurv la CBX2
atsurv la CDC20
atsurv la CDYL2
atsurv la DNMT3B
atsurv la DTL
atsurv la EZH2
atsurv la FGD3
atsurv la KDM4B
atsurv la MLPH
atsurv la ORC1
atsurv la POC1A
atsurv la SUV39H2
atsurv la TCF19
atsurv la TLE3
atsurv la UHRF1
atsurv la WDHD1
atsurv la CBX7
atsurv la FYCO1
atsurv la MYRIP
atsurv la WDR86
atsurv lb DTL
atsurv lb MLPH
atsurv lb PLEKHF2
atsurv lb UHRF1
atsurv lb PLEKHF1
atsurv lb TLE4
atsurv her2 CBX2
atsurv her2 CDC20
atsurv her2 DNMT3B
atsurv her2 FGD1
atsurv her2 FGD3
atsurv her2 KDM4B
atsurv her2 ORC1
atsurv her2 TLE3
atsurv her2 CBX7
atsurv her2 EZH1
atsurv her2 MYRIP
atsurv her2 PRDM11
atsurv her2 TLE1
atsurv her2 WDR27
atsurv bl ANTKMT
atsurv bl BOP1
atsurv bl CBX2
atsurv bl CBX4
atsurv bl CDC20
atsurv bl CDYL2
atsurv bl DNMT3A
atsurv bl DNMT3B
atsurv bl EZH2
atsurv bl FGD1
atsurv bl FGD3
atsurv bl GSTCD
atsurv bl HDGF
atsurv bl JMJD4
atsurv bl KDM4B
atsurv bl MLPH
atsurv bl ORC1
atsurv bl PLEKHF2
atsurv bl SMYD3
atsurv bl SP140
atsurv bl SUV

# Super analysis

In [101]:
for i in anova_results:
    print(anova_results[i])

0       ANTKMT
1         BOP1
2        BRAT1
3         BUB3
4         CBX2
5         CBX3
6         CBX4
7         CBX8
8        CDC20
9        CDYL2
10        CHD4
11       CKAP5
12        COPA
13       COPB2
14       CORO7
15       DNMT1
16      DNMT3A
17      DNMT3B
18        DOHH
19         DTL
20     EEF2KMT
21       EHMT2
22       EIPR1
23        EZH2
24       FBXW9
25        FGD1
26        FGD3
27        FGD6
28       GNB1L
29       GSTCD
30        HDGF
31       JADE3
32       JMJD4
33       KDM4B
34       KDM5B
35       KMT5C
36       LRWD1
37        MLPH
38       MROH1
39        NSD2
40        ORC1
41     PLEKHF2
42       POC1A
43       PYGO2
44       RBBP7
45       RIOX1
46       SEC13
47       SMYD3
48       SP140
49     SUV39H1
50     SUV39H2
51        TBL2
52        TBL3
53       TCF19
54       TDRKH
55       THOC3
56       THOC6
57        TLE3
58       TRAF7
59      TTLL12
60       UHRF1
61       UTP18
62       WDHD1
63       WDR18
64       WDR24
65        WDR4
66       W