In [16]:
# Standard library
import os

# Third-party
import numpy as np
import pandas as pd
from Bio import SeqIO
from dotenv import load_dotenv  # loads variables from a .env file into the environment
from openai import OpenAI  # OpenAI API client
from tqdm import tqdm

# Populate the process environment from .env, then read the API key from it.
# os.getenv returns None when the variable is not set.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Shared client object used by every generate_* helper in this notebook.
client = OpenAI(api_key=api_key)


In [9]:
fasta_dir = "sequences"

# Mapping of base categories to questions.
# Keys must match the FASTA file stem with any leading "non_" removed.
questions = {
    "protein_coding": "Does this nucleotide sequence encode a protein? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "enhancer": "Does this nucleotide sequence function as an enhancer in gene regulation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "promoter": "Does this nucleotide sequence act as a promoter for transcription initiation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "splice_site": "Does this nucleotide sequence contain a splice site for RNA processing? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "methylated": "Is this nucleotide sequence methylated as part of epigenetic regulation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'."
}

# Dictionary to hold sequence data: file stem -> list of per-sequence dicts
sequence_arrays = {}

# Loop through all FASTA files in the directory.
# os.listdir returns names in arbitrary order, so sort them to make the
# processing order (and therefore the order of downstream results) reproducible.
for file in sorted(os.listdir(fasta_dir)):
    if not file.endswith(".fasta"):  # Process only FASTA files
        continue

    file_path = os.path.join(fasta_dir, file)
    var_name = os.path.splitext(file)[0]  # Use file name (without extension) as key

    # Determine base category: "non_<category>" files are the negative set
    # of <category> and are asked the same question.
    base_category = var_name[4:] if var_name.startswith("non_") else var_name

    # A stray FASTA file with no matching question used to raise KeyError and
    # abort the whole load; report it and carry on with the remaining files.
    question = questions.get(base_category)
    if question is None:
        print(f"Skipping {file}: no question defined for category '{base_category}'.")
        continue

    # Read the FASTA file and create one dictionary per record
    gene_dicts = [
        {
            "file_name": var_name,
            "gene": str(record.seq),
            "question": question,  # Add the question key
        }
        for record in SeqIO.parse(file_path, "fasta")
    ]

    # Store the list of dictionaries in the dictionary
    sequence_arrays[var_name] = gene_dicts

    print(f"Stored {var_name} with {len(gene_dicts)} genes.")

Stored enhancer with 98 genes.
Stored methylated with 100 genes.
Stored non_enhancer with 98 genes.
Stored non_methylated with 100 genes.
Stored non_promoter with 100 genes.
Stored non_protein_coding with 88 genes.
Stored non_splice_site with 70 genes.
Stored promoter with 100 genes.
Stored protein_coding with 100 genes.
Stored splice_site with 70 genes.


In [10]:
def generate_hallucination(gene_dict, model):
    """Ask ``model`` for a natural-language description of a sequence.

    Args:
        gene_dict: Mapping whose ``'gene'`` entry is placed at the start of
            the user prompt.
        model: Model name forwarded to the chat-completions request.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_prompt = f"{gene_dict['gene']} Describe the gene in natural language:"
    conversation = [
        {"role": "system", "content": "You are an expert in genomics."},
        {"role": "user", "content": user_prompt},
    ]

    # Uses the module-level `client`; the request is capped at 256 tokens and
    # sampled at temperature 0.6.
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=256,
        temperature=0.6,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
def generate_classification(gene_dict, model):
    """Ask ``model`` the Yes/No question for a sequence, without a description.

    Args:
        gene_dict: Mapping providing ``'gene'`` and ``'question'``; the user
            prompt is the two joined by a single space.
        model: Model name forwarded to the chat-completions request.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_prompt = f"{gene_dict['gene']} {gene_dict['question']}"
    conversation = [
        {"role": "system", "content": "You are an expert in genomics."},
        {"role": "user", "content": user_prompt},
    ]

    # Uses the module-level `client`; the request is capped at 256 tokens and
    # sampled at temperature 0.6.
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=256,
        temperature=0.6,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [12]:
def generate_hallucination_classification(gene_dict, model):
    """Ask ``model`` the Yes/No question with the generated description in the prompt.

    The user prompt is ``"<gene> <hallucination> <question>"``.

    Args:
        gene_dict: Mapping providing ``'gene'``, ``'hallucination'`` and
            ``'question'``.
        model: Model name forwarded to the chat-completions request.

    Returns:
        The ``message.content`` of the first choice in the response.

    Raises:
        KeyError: If one of the required keys is absent from ``gene_dict``.
        ValueError: If ``gene_dict['hallucination']`` is ``None``. Formatting
            ``None`` into the f-string would otherwise send the literal text
            "None" to the model as if it were a description.
    """
    hallucination = gene_dict['hallucination']
    if hallucination is None:
        raise ValueError("gene_dict['hallucination'] is None; no description is available for the prompt")

    response = client.chat.completions.create(
        model=model,  # Model name supplied by the caller
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} {hallucination} {gene_dict['question']}"},
        ],  # The conversation context to send to the API
        max_tokens=256,  # Maximum length of the response
        temperature=0.6,  # Controls randomness/creativity of the response
    )
    return response.choices[0].message.content

In [13]:
# One dict per (model, sequence): the input fields plus the model name and
# the three generated answers (None where a request failed or was skipped).
all_results = []

MODELS = [
    "gpt-3.5-turbo",
    "gpt-4o-mini",

]
for model_idx, model in enumerate(MODELS):
    # The denominator follows len(MODELS); it was previously hard-coded to 6
    # and disagreed with the actual number of models.
    model_progress = tqdm(total=len(sequence_arrays), desc=f"Model [{model_idx+1}/{len(MODELS)}]: {model.split('/')[-1]}")
    
    for file_idx, (file, sequences) in enumerate(sequence_arrays.items()):
        file_progress = tqdm(total=len(sequences), desc=f"File [{file_idx+1}/{len(sequence_arrays)}]: {file}", leave=False)
        
        for gene_idx, gene in enumerate(sequences):
            file_progress.set_postfix_str(f"Gene {gene_idx+1}/{len(sequences)}")
            
            # Copy so the shared input dicts in sequence_arrays are not mutated
            # and can be reused for the next model.
            result = gene.copy()
            result['model'] = model
            
            # Step 1: free-text description of the sequence.
            try:
                result['hallucination'] = generate_hallucination(result, model)
            except Exception as e:
                print(f"Error generating hallucination for {file}, gene {gene_idx+1}: {str(e)}")
                result['hallucination'] = None
            
            # Step 2: Yes/No answer with the description in the prompt.
            # Without a description the prompt would contain the literal text
            # "None", so skip the request and record the answer as missing.
            if result['hallucination'] is None:
                print(f"Skipping hallucination classification for {file}, gene {gene_idx+1}: no hallucination available")
                result['hallucination_classification'] = None
            else:
                try:
                    result['hallucination_classification'] = generate_hallucination_classification(result, model)
                except Exception as e:
                    print(f"Error generating hallucination classification for {file}, gene {gene_idx+1}: {str(e)}")
                    result['hallucination_classification'] = None
            
            # Step 3: Yes/No answer from the sequence and question alone.
            try:
                result['classification'] = generate_classification(result, model)
            except Exception as e:
                print(f"Error generating classification for {file}, gene {gene_idx+1}: {str(e)}")
                result['classification'] = None
            
            all_results.append(result)
            
            file_progress.update(1)
        
        file_progress.close()
        model_progress.update(1)
    
    model_progress.close()


Model [1/6]: gpt-3.5-turbo:   0%|          | 0/10 [00:20<?, ?it/s]
                                                                                  
                                                                                         
                                                                                      
                                                                                            
                                                                                          
                                                                                            
                                                                                         
                                                                                      
                                                                                            
                                                                                      
Model [1/6]: gpt-3.5-turbo: 100%|█████

In [28]:
df = pd.DataFrame(all_results)

# Ground-truth label: files whose name starts with "non_" hold negative
# examples ("No"), every other file holds positives ("Yes"). A prefix test is
# used (matching how the loader derives the category) because a substring test
# for "non" would also match any category name that merely contains "non".
df['correct'] = df['file_name'].str.startswith("non_", na=False).map({True: "No", False: "Yes"})

# Remove every literal "." from the raw answers (regex=False, so "." is not a
# wildcard) so that e.g. "Yes." and "Yes" are treated alike.
df['hallucination_classification'] = df['hallucination_classification'].str.replace(".", "", regex=False)
df['classification'] = df['classification'].str.replace(".", "", regex=False)

In [29]:
def _parse_yes_no(answer):
    """Reduce a raw model answer to 'Yes', 'No' or NaN.

    Only the first whitespace-separated token is inspected, case-insensitively:
    a token starting with "yes" gives 'Yes', one starting with "no" gives 'No'.
    Non-string values (e.g. None from a failed request) and empty or
    whitespace-only strings give NaN instead of raising IndexError.

    NOTE(review): matching is by prefix, so a first token such as "Not" or
    "None" is also read as 'No' - confirm this is intended.
    """
    if not isinstance(answer, str):
        return np.nan
    tokens = answer.split()
    if not tokens:
        # e.g. an answer that was just "." before the periods were stripped
        return np.nan
    first_token = tokens[0].lower()
    if first_token[:3] == 'yes':
        return 'Yes'
    if first_token[:2] == 'no':
        return 'No'
    return np.nan


df['hallucination_classification'] = df['hallucination_classification'].apply(_parse_yes_no)
print(df['hallucination_classification'].values)

df['classification'] = df['classification'].apply(_parse_yes_no)
print(df['classification'].values)

['No' 'No' 'No' ... 'No' 'No' 'Yes']
['No' 'No' 'No' ... 'No' 'Yes' 'Yes']


In [30]:
# Count the missing values in each column of df.
df.isna().sum()

file_name                       0
gene                            0
question                        0
model                           0
hallucination                   0
hallucination_classification    1
classification                  0
correct                         0
dtype: int64

In [31]:
# Drop every row that has a missing value in any column. The explicit copy
# makes df an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.dropna().copy()

# Per-row correctness flags: True when the normalised answer equals the
# ground-truth label in 'correct'.
df['hallucination_correct'] = df['hallucination_classification'] == df['correct']
df['no_hallucination_correct'] = df['classification'] == df['correct']

# Persist the scored rows without the index column.
df.to_csv("trail_1_GPT.csv", index=False)

In [32]:
# Accuracy per model: the mean of each boolean correctness flag is the
# fraction of rows answered correctly, with and without the description.
accuracy_columns = ['hallucination_correct', 'no_hallucination_correct']
per_model = df.groupby('model')
grouped_results = per_model[accuracy_columns].mean()
grouped_results

Unnamed: 0_level_0,hallucination_correct,no_hallucination_correct
model,Unnamed: 1_level_1,Unnamed: 2_level_1
gpt-3.5-turbo,0.499458,0.486457
gpt-4o-mini,0.474026,0.508658


In [47]:
def _accuracy_by_model(feature, outcome_column):
    """Per-model mean of ``outcome_column`` over rows whose file_name contains ``feature``.

    The substring match selects both the positive file and its "non_" counterpart.
    """
    feature_rows = df[df['file_name'].str.contains(feature)]
    return feature_rows.groupby('model')[outcome_column].mean()


enhancer_hallucination_correct = _accuracy_by_model("enhancer", 'hallucination_correct')
enhancer_no_hallucination_correct = _accuracy_by_model("enhancer", 'no_hallucination_correct')

promoter_hallucination_correct = _accuracy_by_model("promoter", 'hallucination_correct')
promoter_no_hallucination_correct = _accuracy_by_model("promoter", 'no_hallucination_correct')

splice_site_hallucination_correct = _accuracy_by_model("splice_site", 'hallucination_correct')
splice_site_no_hallucination_correct = _accuracy_by_model("splice_site", 'no_hallucination_correct')

methylated_hallucination_correct = _accuracy_by_model("methylated", 'hallucination_correct')
methylated_no_hallucination_correct = _accuracy_by_model("methylated", 'no_hallucination_correct')

protein_coding_hallucination_correct = _accuracy_by_model("protein_coding", 'hallucination_correct')
protein_coding_no_hallucination_correct = _accuracy_by_model("protein_coding", 'no_hallucination_correct')

# Lookup of every per-feature series by "<feature>_<outcome column>" name.
data = {
    'enhancer_hallucination_correct': enhancer_hallucination_correct,
    'enhancer_no_hallucination_correct': enhancer_no_hallucination_correct,
    'promoter_hallucination_correct': promoter_hallucination_correct,
    'promoter_no_hallucination_correct': promoter_no_hallucination_correct,
    'splice_site_hallucination_correct': splice_site_hallucination_correct,
    'splice_site_no_hallucination_correct': splice_site_no_hallucination_correct,
    'methylated_hallucination_correct': methylated_hallucination_correct,
    'methylated_no_hallucination_correct': methylated_no_hallucination_correct,
    'protein_coding_hallucination_correct': protein_coding_hallucination_correct,
    'protein_coding_no_hallucination_correct': protein_coding_no_hallucination_correct
}

# Column order of the final table.
feature_order = ['enhancer', 'promoter', 'splice_site', 'methylated', 'protein_coding']

# Long-format records: one per (feature, model, prompt condition).
# Despite its name, 'correct_count' holds the mean accuracy, not a count.
pivot_data = []
for feature in feature_order:
    labelled_series = (
        ('Hallucination', data[f'{feature}_hallucination_correct']),
        ('No Hallucination', data[f'{feature}_no_hallucination_correct']),
    )
    # Models are taken from the with-description series' index.
    for model in labelled_series[0][1].index:
        for status_label, accuracy_series in labelled_series:
            pivot_data.append({
                'model': model,
                'hallucination_status': status_label,
                'feature': feature,
                'correct_count': accuracy_series[model]
            })

long_df = pd.DataFrame(pivot_data)

# Wide format: one row per (model, condition), one column per feature.
pivot_df = long_df.pivot_table(
    index=['model', 'hallucination_status'],
    columns='feature',
    values='correct_count'
)
pivot_df = pivot_df[feature_order]

# Unweighted mean across the feature columns, rounded to 4 decimals.
pivot_df['AVG'] = pivot_df.mean(axis=1).round(4)

pivot_df



Unnamed: 0_level_0,feature,enhancer,promoter,splice_site,methylated,protein_coding,AVG
model,hallucination_status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gpt-3.5-turbo,Hallucination,0.5,0.485,0.457143,0.512563,0.531915,0.4973
gpt-3.5-turbo,No Hallucination,0.5,0.495,0.521429,0.552764,0.367021,0.4872
gpt-4o-mini,Hallucination,0.494898,0.505,0.492857,0.5,0.37766,0.4741
gpt-4o-mini,No Hallucination,0.5,0.5,0.514286,0.5,0.531915,0.5092
