In [1]:
import pandas as pd
from utils.prompt_factory import make_train_prompt_gene
from tqdm import tqdm
import re
import ast



In [2]:
src_file = './old_data/human_gene_set_bp.csv'
test_file = '/data1/jli49/GEN_RESP/data/updated_filtered_1000_selected_go_terms.csv'
target_file = './data/go_terms_bp.csv'

src_df = pd.read_csv(src_file)
test_df = pd.read_csv(test_file)
target_df = pd.read_csv(target_file)

remove obsolete rows from the src file

In [3]:
src_df = src_df[src_df["exact_source"].isin(target_df["GO"])]
src_df.shape


(7292, 12)

substitute the rephrase from old prompt to new prompt generated

In [4]:
src_df["rephrased"] =None
pattern1 = re.compile(r"^This term refers to a .* which involves")
pattern2 = re.compile(r"^This term refers to a .* which is")

for idx, row in src_df.iterrows():
    match = target_df[target_df["GO"] == row["exact_source"]]
    
    if not match.empty:
        description = match["gpt_4o_default description"].values[0]
        description = pattern1.sub("", description).strip()
        description = pattern2.sub("", description).strip()
        description = "This process involves " + description
        src_df.at[idx, "rephrased"] = description
    else:
        raise ValueError(f"Match not found for row index {idx}, row: {row.to_dict()}")


src_df.head

<bound method NDFrame.head of       Unnamed: 0  gene_set_id  \
0          12832        12989   
1          12834        12991   
2          12835        12992   
3          12836        12993   
4          12837        12994   
...          ...          ...   
7369       20474        20632   
7370       20475        20633   
7371       20476        20634   
7372       20477        20635   
7373       20478        20636   

                                      description_brief  description_full  \
0     The maintenance of the structure and integrity...               NaN   
1     The repair of single strand breaks in DNA. Rep...               NaN   
2     Any process that modulates the frequency, rate...               NaN   
3     Any process that modulates the frequency, rate...               NaN   
4     The cell cycle process in which the distance i...               NaN   
...                                                 ...               ...   
7369  Any process that modulates t

In [5]:
src_df.to_csv("./human_src_data.csv")

check for test gene sets are in the src_df

In [83]:
test_gene_set = test_df['Genes'].apply(lambda x: set(x.split()))

def convert_genes_set(genes_str):
    genes_list = ast.literal_eval(genes_str)
    return set(genes_list)

all_gene_set = src_df['genelist'].apply(convert_genes_set)


def is_subset_of_any(gene_set, all_sets):
    return any(gene_set == some_set for some_set in all_sets)

subset_results = test_gene_set.apply(lambda x: is_subset_of_any(x, all_gene_set))

non_subsets = test_gene_set[~subset_results]
# gene sets that are not in the source df
print(non_subsets)

0                          {SDC1, WNT10B, MEGF10, SOX15}
4      {SULT1C4, CDH3, MFSD12, ACMSD, OPN3, AKR1B10, ...
5      {UQCRC2, CYTB, UQCRH, CYCS, UQCRB, UQCRQ, UQCR...
6      {ANAPC11, CDC20, NEK6, MAD2L1, DPF2, ZNF207, S...
8      {ND1, NDUFB4, ATP5MC1, PPARA, ENO1, SDHC, ND2,...
                             ...                        
981    {NAA15, MAPT, NAA25, TADA2B, ERCC6, HDAC6, KAT...
982    {EFEMP2, NKX2-5, ARID5B, MYL11, ZFPM1, ISL1, T...
985         {CDC20, MOS, TTK, ZWINT, CHFR, MAPK15, KNL1}
988                        {MDGA1, LRFN3, PCDH8, MAPK14}
989    {RD3, IER3, TP53, PID1, SIRT6, NUPR1, PPARA, T...
Name: Genes, Length: 614, dtype: object


Drop test genes from src_df

In [84]:
subset_genes = test_gene_set[subset_results]
print(len(subset_genes))

src_df['genelist_set'] = src_df['genelist'].apply(convert_genes_set)



# Filter out rows from src_df where the genelist_set is in the subset_genes
src_df = src_df[~src_df['genelist_set'].apply(lambda x: is_subset_of_any(x, subset_genes))]

# Drop the temporary column
src_df = src_df.drop(columns=['genelist_set'])
src_df.shape

376


(6919, 12)

In [85]:
# The number does not match due to some duplicate gene sets in the test_df where the same gene set could have various corresponding GO terms

duplicates = test_df['Genes'].duplicated(keep=False)
duplicate_rows = test_df[duplicates]
print(duplicate_rows)

             GO                                              Genes  \
81   GO:0032348                                REST BMP5 BMP2 DKK3   
351  GO:0070886  SPPL3 C10orf71 AKAP5 LMCD1 IGF1 AKAP6 CAMTA1 T...   
357  GO:0009146                      ADA NUDT16 SAMHD1 ITPA NUDT15   
385  GO:2000065                                REST BMP5 BMP2 DKK3   
689  GO:1905459  ADCY10 PPARG IGF1 MFN2 E2F3 SOD2 SLC7A5 ATF4 P...   
736  GO:1905288  ADCY10 PPARG IGF1 MFN2 E2F3 SOD2 SLC7A5 ATF4 P...   
758  GO:0042640                  WNT5A CTNNB1 SPINK5 WNT10B FERMT1   
804  GO:0009204                      ADA NUDT16 SAMHD1 ITPA NUDT15   
867  GO:0051884                  WNT5A CTNNB1 SPINK5 WNT10B FERMT1   
949  GO:0106058  SPPL3 C10orf71 AKAP5 LMCD1 IGF1 AKAP6 CAMTA1 T...   

     Gene_Count                                   Term_Description  
81            4  negative regulation of aldosterone biosyntheti...  
351          21  positive regulation of calcineurin-NFAT signal...  
357           5   puri

Furthermore, drop same GO terms from src_df to make the train_df

In [86]:
def is_equal_any(GO, all_test_GO):
    return any(GO == some_GO for some_GO in all_test_GO)


all_test_GO = test_df["Term_Description"].tolist()
train_df = src_df[~src_df['GO_term'].apply(lambda x: is_equal_any(x, all_test_GO))]

train_df.shape

(6571, 12)

convert gene list to space separated in train_df

In [87]:
def convert_genes_string(genes_str):
    genes_list = ast.literal_eval(genes_str)
    return ' '.join(genes_list)


train_df['genelist'] = train_df['genelist'].apply(convert_genes_string)

train_df['genelist'].head

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['genelist'] = train_df['genelist'].apply(convert_genes_string)


<bound method NDFrame.head of 0       AKT3 PPARGC1A POLG2 PARP1 DNA2 TYMP FLCN PRIMP...
1       XNDC1N ERCC8 PARP1 APLF ERCC6 SIRT1 LIG4 APTX ...
2       PARP3 ACTR2 RAD50 ALYREF MAD2L2 KAT5 RAD51AP1 ...
3              RAD50 ANKLE1 ZSCAN4 ERCC2 MLH1 MRE11 TERF2
5       RPL10L RPLP0P6 FASTKD2 DHX30 MDN1 RRS1 BOP1 MR...
                              ...                        
7369    PIBF1 ABCD1 FABP5 ABCD2 SIRT1 ANXA1 IL1B MIR13...
7370    ABCD1 ABCD2 ANXA1 IL1B PLA2G3 AVP AVPR1A PTGS2...
7371                    CLN3 PROM2 NEDD4L SRC CAV3 UNC119
7372                 MLYCD NUDT8 NUDT7 ACACA ACACB NUDT19
7373                    PTGR1 ALOX12 ALOX5 ALOX15 ALOX15B
Name: genelist, Length: 6571, dtype: object>

Formulate test_df

In [88]:
for idx, row in target_df.iterrows():
    description = row['gpt_4o_default description']
    description = pattern1.sub("", description).strip()
    description = pattern2.sub("", description).strip()
    description = "This process involves " + description
    target_df.at[idx, "rephrased"] = description

In [89]:
# 1. change column name
test_df.rename(columns={'Genes': 'genelist'}, inplace=True)

# 2. add matching rephrase
test_df["rephrased"] =None
for i, row in test_df.iterrows():
    test_df.at[i, "rephrased"] = target_df[target_df['Term_Description'] == row['Term_Description']]['rephrased'].iloc[0]
test_df.head

<bound method NDFrame.head of              GO                                           genelist  \
0    GO:0048627                           MEGF10 SDC1 WNT10B SOX15   
1    GO:1904888  CPLANE2 NEUROG1 GRHL2 TGFB3 EXT1 TGFBR2 TWIST1...   
2    GO:0019585  DCXR UGT1A9 UGT2B7 PRKCE UGT1A7 UGT2A3 SORD UG...   
3    GO:1902267                         AZIN1 OAZ2 OAZ1 AZIN2 OAZ3   
4    GO:0019748  BDH2 CYP2A7 AKR1C1 ACMSD ATP7A ASIP DDT CYP3A4...   
..          ...                                                ...   
985  GO:0044785               MOS CDC20 CHFR TTK MAPK15 KNL1 ZWINT   
986  GO:0045494  CLN8 ADGRV1 NXNL2 CDH23 MKKS BBS10 TUB LCA5 NX...   
987  GO:0031204  HSPA5 SEC61A2 GLP1R SEC63 SEC62 SEC61G SEC61A1...   
988  GO:0099179                           MDGA1 LRFN3 MAPK14 PCDH8   
989  GO:0045980  PPP2CA CBFA2T3 TP53 SIRT6 TIGAR IER3 PFKFB1 PI...   

     Gene_Count                                   Term_Description  \
0             4                               myoblast deve

Save train/test DF (optional)

In [90]:
train_df.to_csv("training_code/new_train_human.csv")
test_df.to_csv("training_code/new_test_human.csv")

make train json

In [12]:
json_list = []

for idx, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):

    gene_data = row['genelist']
    genes = gene_data.split(' ')
    prompt = make_train_prompt_gene(genes)

    json_object = {
        "sentence": [
            {"role": "system", "content": "You are a senior biologist."},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": row["rephrased"]}
        ]
    }

    json_list.append(json_object)

json_df = pd.DataFrame(json_list)

json_df.to_json('./training_code/new_train_human_wo_shot.json', orient='records', lines=True)

print(json_df.head(3).to_json(orient='records', lines=True))


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6571/6571 [1:48:08<00:00,  1.01it/s]

{"sentence":[{"role":"system","content":"You are a senior biologist."},{"role":"user","content":"Propose a brief description for the most prominent biological function performed by the given set of genes.\nBe concise, do not use unnecessary words.\nBe specific, avoid overly general statements such as \"the genes are involved in various cellular processes\".\nBe factual, do not editorialize.\n\nThe set of genes is: AKT3: AKT serine\/threonine kinase 3; PPARGC1A: PPARG coactivator 1 alpha; POLG2: DNA polymerase gamma 2, accessory subunit; PARP1: poly(ADP-ribose) polymerase 1; DNA2: DNA replication helicase\/nuclease 2; TYMP: thymidine phosphorylase; FLCN: folliculin; PRIMPOL: primase and DNA directed polymerase; ENDOG: endonuclease G; STOX1: storkhead box 1; SLC25A4: solute carrier family 25 member 4; LIG3: DNA ligase 3; MEF2A: myocyte enhancer factor 2A; MPV17: mitochondrial inner membrane protein MPV17; OPA1: OPA1 mitochondrial dynamin like GTPase; RRM2B: ribonucleotide reductase regul




make test json

In [13]:


inputlis = []
outputlis = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):

    gene_data = row['genelist']
    genes = gene_data.split(' ')
    prompt = make_train_prompt_gene(genes)

    input  = [
    {
        'role': 'system', 'content': 'You are a senior biologist.'
    }, 
    {
        'role': 'user', 'content': prompt
    }
    ]
    output = row["rephrased"]
    inputlis.append(input)
    outputlis.append(output)


df_test = pd.DataFrame(
        {'input': inputlis,
        'output': outputlis},
        columns = ['input', 'output'])


df_test.to_json('./training_code/new_test_human_wo_shot.json', orient='records',lines=True)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 990/990 [08:43<00:00,  1.89it/s]


In [None]:
output_series = pd.Series(outputlis)
missing_values = output_series.isna()

missing_values.sum()

0