In [1]:
from pybiomart import Server
import pandas as pd

input_file = "MGI.xlsx" 
output_file = "human_MGI.csv" 

df = pd.read_excel(input_file, engine='openpyxl')

server = Server(host='http://www.ensembl.org')

dataset_rat = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['rnorvegicus_gene_ensembl']

query = dataset_rat.query(
    attributes=['external_gene_name', 'hsapiens_homolog_associated_gene_name']
)

query.columns = ['Symbol', 'human_symbol']

df = df.merge(query, on='Symbol', how='left')

df.to_csv(output_file, index=False)

print(f"Conversion complete! Saved to {output_file}")


  warn("Workbook contains no default style, apply openpyxl's default")


Conversion complete! Saved to human_MGI.csv


In [2]:
df.head()

Unnamed: 0,Type,MGI ID,Symbol,Name,Chr,Start,End,Build,Strand,Best Match Type,Best Match,Match Score,human_symbol
0,protein coding gene,MGI:3041203,Adgrd1,adhesion G protein-coupled receptor D1,5,129173814,129281663.0,GRCm39,+,Process,neurogenesis,4,ADGRD1
1,protein coding gene,MGI:2685213,Adgrg4,adhesion G protein-coupled receptor G4,X,55939594,56025719.0,GRCm39,+,Process,neurogenesis,4,ADGRG4
2,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPG
3,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPP
4,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPI


In [5]:
# check for null values in the 'human_symbol' column
null_values = df[df['human_symbol'].isnull()]
null_values

Unnamed: 0,Type,MGI ID,Symbol,Name,Chr,Start,End,Build,Strand,Best Match Type,Best Match,Match Score,human_symbol
42,protein coding gene,MGI:109178,Fgf13,fibroblast growth factor 13,X,58107505,58613431.0,GRCm39,-,Process,neurogenesis,4,
44,protein coding gene,MGI:1096383,Fgf15,fibroblast growth factor 15,7,144450269,144454690.0,GRCm39,+,Process,neurogenesis,4,
54,protein coding gene,MGI:2685373,Gm527,predicted gene 527,12,64964685,64971365.0,GRCm39,+,Process,neurogenesis,4,
55,lncRNA gene,MGI:5589890,Gm30731,"predicted gene, 30731",4,22490548,22493126.0,GRCm39,+,Process,neurogenesis,4,
62,protein coding gene,MGI:1915023,Kat8,K(lysine) acetyltransferase 8,7,127511689,127525010.0,GRCm39,+,Process,neurogenesis,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042,protein coding gene,MGI:2684056,Vmn2r81,"vomeronasal 2, receptor 81",10,79083611,79130369.0,GRCm39,+,Subterm Process,neuron differentiation (neurogenesis),3,
2043,protein coding gene,MGI:3646522,Vmn2r82,"vomeronasal 2, receptor 82",10,79192425,79232600.0,GRCm39,+,Subterm Process,neuron differentiation (neurogenesis),3,
2044,protein coding gene,MGI:3644559,Vmn2r83,"vomeronasal 2, receptor 83",10,79304792,79327988.0,GRCm39,+,Subterm Process,neuron differentiation (neurogenesis),3,
2045,protein coding gene,MGI:3644483,Vmn2r120,"vomeronasal 2, receptor 120",17,57815783,57852314.0,GRCm39,-,Subterm Process,neuron differentiation (neurogenesis),3,


In [None]:
import pandas as pd
from pybiomart import Server
from rapidfuzz import process

# Load the CSV file
df = pd.read_csv("human_MGI.csv")

# Step 1: Try re-querying BioMart for missing values
server = Server(host="http://www.ensembl.org")
dataset_rat = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['rnorvegicus_gene_ensembl']

query = dataset_rat.query(
    attributes=["external_gene_name", "hsapiens_homolog_associated_gene_name"]
)
query.columns = ["Symbol", "human_symbol"]

# Merge to re-check any missing values
df = df.merge(query, on="Symbol", how="left", suffixes=("", "_biomart"))
df["human_symbol"] = df["human_symbol"].fillna(df["human_symbol_biomart"])
df.drop(columns=["human_symbol_biomart"], inplace=True)

# Step 2: Use fuzzy matching for remaining null values
human_gene_list = query["human_symbol"].dropna().unique()
fuzzy_converted = []  # Store converted gene pairs

def fuzzy_match(symbol):
    result = process.extractOne(symbol, human_gene_list)
    if result:
        match, score, _ = result
        if score > 80:
            fuzzy_converted.append((symbol, match, score))  # Store match details
            return match
    return None

df["human_symbol_fuzzy"] = df["Symbol"].apply(
    lambda x: fuzzy_match(x) if pd.isna(df.at[df[df["Symbol"] == x].index[0], "human_symbol"]) else None
)
df["human_symbol"] = df["human_symbol"].fillna(df["human_symbol_fuzzy"])
df.drop(columns=["human_symbol_fuzzy"], inplace=True)

# Print all fuzzy matches
if fuzzy_converted:
    print("\nFuzzy Matching Applied for the following genes:")
    for rat_gene, human_gene, score in fuzzy_converted:
        print(f"{rat_gene} → {human_gene} (Confidence: {score:.2f})")



Fuzzy Matching Applied for the following genes:
C9orf72 → C9orf72 (Confidence: 100.00)


In [10]:
df.head()

Unnamed: 0,Type,MGI ID,Symbol,Name,Chr,Start,End,Build,Strand,Best Match Type,Best Match,Match Score,human_symbol
0,protein coding gene,MGI:3041203,Adgrd1,adhesion G protein-coupled receptor D1,5,129173814,129281663.0,GRCm39,+,Process,neurogenesis,4,ADGRD1
1,protein coding gene,MGI:2685213,Adgrg4,adhesion G protein-coupled receptor G4,X,55939594,56025719.0,GRCm39,+,Process,neurogenesis,4,ADGRG4
2,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPG
3,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPG
4,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPG


In [11]:
df["human_symbol"].isna().sum()

144

In [13]:
!pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting easydict (from chembl_webresource_client)
  Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
Downloading easydict-1.13-py3-none-any.whl (6.8 kB)
Installing collected packages: easydict, chembl_webresource_client
Successfully installed chembl_webresource_client-0.10.9 easydict-1.13


In [18]:
df = pd.read_csv("human_MGI.csv")
df.head()

Unnamed: 0,Type,MGI ID,Symbol,Name,Chr,Start,End,Build,Strand,Best Match Type,Best Match,Match Score,human_symbol
0,protein coding gene,MGI:3041203,Adgrd1,adhesion G protein-coupled receptor D1,5,129173814,129281663.0,GRCm39,+,Process,neurogenesis,4,ADGRD1
1,protein coding gene,MGI:2685213,Adgrg4,adhesion G protein-coupled receptor G4,X,55939594,56025719.0,GRCm39,+,Process,neurogenesis,4,ADGRG4
2,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPG
3,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPP
4,protein coding gene,MGI:87984,Akp3,"alkaline phosphatase 3, intestine, not Mn requ...",1,87052695,87055634.0,GRCm39,+,Process,neurogenesis,4,ALPI


In [27]:
from chembl_webresource_client.new_client import new_client

target = new_client.target
target_query = target.search('ALPG')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Alkaline phosphatase placental-like,16.0,False,CHEMBL3402,"[{'accession': 'P10696', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Mus musculus,"Alkaline phosphatase, placental-like",14.0,False,CHEMBL3112374,"[{'accession': 'P24823', 'component_descriptio...",SINGLE PROTEIN,10090


In [None]:
selected_target = targets.target_chembl_id[0]
selected_target

'CHEMBL3402'

In [None]:
activity = new_client.activity
# for all targets calculate results
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [30]:
df = pd.DataFrame.from_dict(res)
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33155,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,0.28
1,,,33157,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,0.8
2,,,34373,[],CHEMBL640397,Concentration required for 50% inhibition of h...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,0.97
3,,,34374,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,2.22
4,,,39183,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,,Active,5776570,[],CHEMBL1738497,PUBCHEM_BIOASSAY: Luminescent assay for HTS di...,F,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,uM,UO_0000065,,6.63
717,"{'action_type': 'INHIBITOR', 'description': 'N...",,24999546,[],CHEMBL5232628,Inhibition of human germ cell alkaline phospha...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,nM,UO_0000065,,75.0
718,"{'action_type': 'INHIBITOR', 'description': 'N...",,24999547,[],CHEMBL5232628,Inhibition of human germ cell alkaline phospha...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,nM,UO_0000065,,75.0
719,"{'action_type': 'INHIBITOR', 'description': 'N...",,24999548,[],CHEMBL5232628,Inhibition of human germ cell alkaline phospha...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,nM,UO_0000065,,75.0


In [31]:
df["canonical_smiles"]

0                    O=c1c2ccccc2nc2ccc(-c3nn[nH]n3)cn12
1                         O=C(O)c1cccn2c(=O)c3ccccc3nc12
2      CCCCc1cc2c(=O)cc(C(=O)O)[nH]c2c2c(=O)cc(C(=O)O...
3      CCCCc1cc2c(=O)cc(C(=O)O)[nH]c2c2c(=O)cc(C(=O)O...
4                               O=C(O)c1ccc2nccc(=O)n2c1
                             ...                        
716    O=C(CSc1nnc(-c2ccncc2)n1-c1ccc(Cl)cc1)c1ccc(O)...
717              O=C(Nc1nc2ccccc2s1)c1cc(O)nc2ccc(O)cc12
718           COc1cc2c(C(=O)Nc3nc4ccccc4s3)cc(O)nc2cc1Cl
719             Cc1cc2nc(O)cc(C(=O)Nc3nc4ccccc4s3)c2cc1C
720            Cc1cc2c(C(=O)Nc3nc4ccccc4s3)cc(O)nc2cc1Cl
Name: canonical_smiles, Length: 721, dtype: object

In [35]:
df['standard_value']

0        280000.0
1        800000.0
2        970000.0
3       2220000.0
4      10000000.0
          ...    
716        6630.0
717          75.0
718          75.0
719          75.0
720          75.0
Name: standard_value, Length: 721, dtype: object

Processing gene symbols:   0%|          | 3/1976 [00:00<01:31, 21.56it/s]

Error processing ADGRG4: 'organism'


Processing gene symbols:   0%|          | 6/1976 [00:00<01:19, 24.75it/s]

Error processing ANKRD11: 'organism'
Error processing ASCL1: 'organism'
Error processing BRINP1: 'organism'
Error processing BTBD1: 'organism'


Processing gene symbols:   1%|          | 11/1976 [00:01<05:10,  6.33it/s]

Error processing BTBD2: 'organism'
Error processing BTBD3: 'organism'


Processing gene symbols:   1%|          | 13/1976 [00:04<15:59,  2.05it/s]

Error processing BTBD6: 'organism'


Processing gene symbols:   1%|          | 14/1976 [00:06<22:05,  1.48it/s]

Error processing CDK5RAP2: 'organism'


Processing gene symbols:   1%|          | 15/1976 [00:07<27:09,  1.20it/s]

Error processing CEP120: 'organism'


Processing gene symbols:   1%|          | 16/1976 [00:09<31:29,  1.04it/s]

Error processing CHAC1: 'organism'


Processing gene symbols:   1%|          | 17/1976 [00:10<36:21,  1.11s/it]

Error processing CLN5: 'organism'


Processing gene symbols:   1%|          | 18/1976 [00:20<37:57,  1.16s/it]


KeyboardInterrupt: 

In [None]:
dd = pd.read_csv("results/ALPG.csv")

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33155,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,0.28
1,,,33157,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,0.80
2,,,34373,[],CHEMBL640397,Concentration required for 50% inhibition of h...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,0.97
3,,,34374,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,2.22
4,,,39183,[],CHEMBL640398,50% inhibition of human placental alkaline pho...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,I50,mM,UO_0000065,,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,,Active,5776570,[],CHEMBL1738497,PUBCHEM_BIOASSAY: Luminescent assay for HTS di...,F,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,uM,UO_0000065,,6.63
717,"{'action_type': 'INHIBITOR', 'description': 'N...",,24999546,[],CHEMBL5232628,Inhibition of human germ cell alkaline phospha...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,nM,UO_0000065,,75.00
718,"{'action_type': 'INHIBITOR', 'description': 'N...",,24999547,[],CHEMBL5232628,Inhibition of human germ cell alkaline phospha...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,nM,UO_0000065,,75.00
719,"{'action_type': 'INHIBITOR', 'description': 'N...",,24999548,[],CHEMBL5232628,Inhibition of human germ cell alkaline phospha...,B,,,BAO_0000190,...,Homo sapiens,Alkaline phosphatase placental-like,9606,,,IC50,nM,UO_0000065,,75.00


In [7]:
dd["canonical_smiles"]

0                    O=c1c2ccccc2nc2ccc(-c3nn[nH]n3)cn12
1                         O=C(O)c1cccn2c(=O)c3ccccc3nc12
2      CCCCc1cc2c(=O)cc(C(=O)O)[nH]c2c2c(=O)cc(C(=O)O...
3      CCCCc1cc2c(=O)cc(C(=O)O)[nH]c2c2c(=O)cc(C(=O)O...
4                               O=C(O)c1ccc2nccc(=O)n2c1
                             ...                        
716    O=C(CSc1nnc(-c2ccncc2)n1-c1ccc(Cl)cc1)c1ccc(O)...
717              O=C(Nc1nc2ccccc2s1)c1cc(O)nc2ccc(O)cc12
718           COc1cc2c(C(=O)Nc3nc4ccccc4s3)cc(O)nc2cc1Cl
719             Cc1cc2nc(O)cc(C(=O)Nc3nc4ccccc4s3)c2cc1C
720            Cc1cc2c(C(=O)Nc3nc4ccccc4s3)cc(O)nc2cc1Cl
Name: canonical_smiles, Length: 721, dtype: object