In [1]:
import os
from openai import OpenAI  #OpenAI API client
from dotenv import load_dotenv  #For loading environment variables from .env file
load_dotenv()  #This loads variables from a .env file into environment variables
api_key = os.getenv("OPENAI_API_KEY")  #Get the OpenAI API key from environment variables
client = OpenAI(api_key=api_key)  #Initialize the OpenAI client with the API key
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
import numpy as np


In [2]:
fasta_dir = "sequences"

# Mapping of base categories to questions
questions = {
    "protein_coding": "Does this nucleotide sequence encode a protein? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "enhancer": "Does this nucleotide sequence function as an enhancer in gene regulation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "promoter": "Does this nucleotide sequence act as a promoter for transcription initiation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "splice_site": "Does this nucleotide sequence contain a splice site for RNA processing? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "methylated": "Is this nucleotide sequence methylated as part of epigenetic regulation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'."
}

# Dictionary to hold sequence data
sequence_arrays = {}

# Loop through all FASTA files in the directory
for file in os.listdir(fasta_dir):
    if file.endswith(".fasta"):  # Process only FASTA files
        file_path = os.path.join(fasta_dir, file)
        var_name = os.path.splitext(file)[0]  # Use file name (without extension) as key

        # Determine base category
        if var_name.startswith("non_"):
            base_category = var_name[4:]  # Remove "non_" prefix
        else:
            base_category = var_name
        question = questions[base_category]  # Get the corresponding question

        # Read the FASTA file and create a list of dictionaries
        gene_dicts = []
        for record in SeqIO.parse(file_path, "fasta"):
            gene_dict = {
                "file_name": var_name,
                "gene": str(record.seq),
                "question": question,  # Add the question key
            }
            gene_dicts.append(gene_dict)
        
        # Store the list of dictionaries in the dictionary
        sequence_arrays[var_name] = gene_dicts

        print(f"Stored {var_name} with {len(gene_dicts)} genes.")


Stored enhancer with 98 genes.
Stored methylated with 100 genes.
Stored non_enhancer with 98 genes.
Stored non_methylated with 100 genes.
Stored non_promoter with 100 genes.
Stored non_protein_coding with 88 genes.
Stored non_splice_site with 70 genes.
Stored promoter with 100 genes.
Stored protein_coding with 100 genes.
Stored splice_site with 70 genes.


In [3]:
def generate_hallucination(gene_dict, model):
    """Generate a hallucination description for a gene sequence"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} Describe the gene in natural language:"}
        ],
        max_tokens=256,
        temperature=0.6
    )
    return response.choices[0].message.content


In [4]:
def generate_classification(gene_dict, model):
    """Generate a classification (Yes/No) for a gene sequence"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} {gene_dict['question']}"}
        ],
        max_tokens=256,
        temperature=0.6
    )
    return response.choices[0].message.content

In [5]:
def generate_hallucination_classification(gene_dict, model, hallucination):
    """Generate a classification with hallucination included"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} {hallucination} {gene_dict['question']}"}
        ],
        max_tokens=256,
        temperature=0.6
    )
    return response.choices[0].message.content

In [6]:
# Define the models to use
MODELS = [
    "gpt-3.5-turbo",
    "gpt-4o-mini",
    # Add other models here as needed
]

# Dictionary to store hallucinations for reuse
hallucinations = {}

# First, generate hallucinations with each model
print("Generating hallucinations...")
for hallucination_model in MODELS:
    hallucinations[hallucination_model] = {}
    for file_name, sequences in tqdm(sequence_arrays.items(), desc=f"Hallucinations for {hallucination_model}"):
        hallucinations[hallucination_model][file_name] = {}
        for i, gene_dict in enumerate(sequences):
            try:
                hallucination = generate_hallucination(gene_dict, hallucination_model)
                hallucinations[hallucination_model][file_name][i] = hallucination
            except Exception as e:
                print(f"Error generating hallucination for {file_name}, gene {i}: {str(e)}")
                hallucinations[hallucination_model][file_name][i] = None

# Results will be stored here
all_results = []

# Now, for each combination of hallucination model and classification model
progress = tqdm(total=len(MODELS) * len(MODELS))
for hallucination_model in MODELS:
    for classification_model in MODELS:
        progress.set_description(f"Hall: {hallucination_model} / Class: {classification_model}")
        
        # For each file (category of sequences)
        for file_name, sequences in sequence_arrays.items():
            for i, gene_dict in enumerate(sequences):
                result = gene_dict.copy()
                
                # Add model info
                result['hallucination_model'] = hallucination_model
                result['classification_model'] = classification_model
                
                # Get the pre-generated hallucination
                hallucination = hallucinations[hallucination_model][file_name].get(i)
                result['hallucination'] = hallucination
                
                # Skip if hallucination was not generated
                if hallucination is None:
                    continue
                
                # Generate direct classification (without hallucination)
                try:
                    result['classification'] = generate_classification(result, classification_model)
                except Exception as e:
                    print(f"Error generating classification: {str(e)}")
                    result['classification'] = None
                
                # Generate classification with hallucination
                try:
                    result['hallucination_classification'] = generate_hallucination_classification(
                        result, classification_model, hallucination
                    )
                except Exception as e:
                    print(f"Error generating hallucination classification: {str(e)}")
                    result['hallucination_classification'] = None
                
                all_results.append(result)
        
        progress.update(1)
progress.close()

Generating hallucinations...


Hallucinations for gpt-3.5-turbo: 100%|██████████| 10/10 [19:01<00:00, 114.13s/it]
Hallucinations for gpt-4o-mini: 100%|██████████| 10/10 [1:07:28<00:00, 404.87s/it]
Hall: gpt-4o-mini / Class: gpt-4o-mini: 100%|██████████| 4/4 [1:31:06<00:00, 1366.65s/it]  


In [7]:
df = pd.DataFrame(all_results)

# Add expected correct answer column
df['correct'] = df['file_name'].str.contains("non", case=True, na=False).map({True: "No", False: "Yes"})

# Clean the classification outputs
df['hallucination_classification'] = df['hallucination_classification'].str.replace(".", "", regex=False)
df['classification'] = df['classification'].str.replace(".", "", regex=False)


In [8]:
df['hallucination_classification'] = df['hallucination_classification'].apply(
    lambda x: 'Yes' if isinstance(x, str) and x.split()[0].lower()[:3] == 'yes' else
              'No' if isinstance(x, str) and x.split()[0].lower()[:2] == 'no' else np.nan
)
print(df['hallucination_classification'].values)

df['classification'] = df['classification'].apply(
    lambda x: 'Yes' if isinstance(x, str) and x.split()[0].lower()[:3] == 'yes' else
              'No' if isinstance(x, str) and x.split()[0].lower()[:2] == 'no' else np.nan
)
print(df['classification'].values)

['No' 'No' 'No' ... 'No' 'No' 'Yes']
['No' 'No' 'No' ... 'No' 'Yes' 'Yes']


In [9]:
df.isna().sum()

file_name                       0
gene                            0
question                        0
hallucination_model             0
classification_model            0
hallucination                   0
classification                  0
hallucination_classification    6
correct                         0
dtype: int64

In [10]:
df = df.dropna(subset=['hallucination_classification', 'classification', 'correct'])

# Add correctness columns
df['hallucination_correct'] = df['hallucination_classification'] == df['correct']
df['no_hallucination_correct'] = df['classification'] == df['correct']

# Save full results
df.to_csv("cross_model_hallucination_results.csv", index=False)


In [20]:
df

Unnamed: 0,file_name,gene,question,hallucination_model,classification_model,hallucination,classification,hallucination_classification,correct,hallucination_correct,no_hallucination_correct
0,enhancer,ACACTTCGTCTCCAGCTCTCTGCTCGCTCGCCTCGCAGTCACAGAC...,Does this nucleotide sequence function as an e...,gpt-3.5-turbo,gpt-3.5-turbo,The gene described is a long sequence of nucle...,No,No,Yes,False,False
1,enhancer,ATCTCACCGCTTGACGATCAAGGGGGCAAAGCTTCGGTGTTCATAG...,Does this nucleotide sequence function as an e...,gpt-3.5-turbo,gpt-3.5-turbo,The gene described in the sequence is a segmen...,No,No,Yes,False,False
2,enhancer,ACACTTCGTCTCCAGCTCTCTGCTCGCTCGCCTCGCAGTCACAGAC...,Does this nucleotide sequence function as an e...,gpt-3.5-turbo,gpt-3.5-turbo,The gene described in the provided sequence is...,No,No,Yes,False,False
3,enhancer,ATACATTCTGGGCGGGCAGGAAGCGCACGCTGGGATCGAGGCTTGC...,Does this nucleotide sequence function as an e...,gpt-3.5-turbo,gpt-3.5-turbo,The gene described is a long sequence of DNA c...,No,No,Yes,False,False
4,enhancer,ACACTTCGTCTCCAGCTCTCTGCTCGCTCGCCTCGCAGTCACAGAC...,Does this nucleotide sequence function as an e...,gpt-3.5-turbo,gpt-3.5-turbo,The gene described is a long sequence of DNA c...,No,No,Yes,False,False
...,...,...,...,...,...,...,...,...,...,...,...
3691,splice_site,CTGGACCTCAGCTGCTGAACAGGCTACAAGAGGCTGGGGAGACGTG...,Does this nucleotide sequence contain a splice...,gpt-4o-mini,gpt-4o-mini,The provided sequence appears to be a nucleoti...,Yes,Yes,Yes,True,True
3692,splice_site,CTGGACCTGAGATGCTGAACAGGCTCCAAGAGGCTGGGGAGACATG...,Does this nucleotide sequence contain a splice...,gpt-4o-mini,gpt-4o-mini,The provided sequence is a nucleotide sequence...,Yes,Yes,Yes,True,True
3693,splice_site,CGACCAGCCGAATCGCTCCTGCAGCAGAGGCCGATACAAAGCAGGG...,Does this nucleotide sequence contain a splice...,gpt-4o-mini,gpt-4o-mini,The sequence you provided is a string of nucle...,No,No,Yes,False,False
3694,splice_site,GCCTGGGTGTTGGCTGCTGTTGCTTGCACAGATTGGTCCCCTTGTC...,Does this nucleotide sequence contain a splice...,gpt-4o-mini,gpt-4o-mini,The sequence you've provided appears to be a n...,Yes,No,Yes,False,True


In [23]:
features = ['enhancer', 'promoter', 'splice_site', 'methylated', 'protein_coding']
summary_data = []

for hallucination_model in MODELS:
    for classification_model in MODELS:
        model_combo = f"{hallucination_model}-{classification_model}"
        combo_df = df[(df['hallucination_model'] == hallucination_model) & 
                       (df['classification_model'] == classification_model)]
        
        row_data = {
            'hallucination_model': hallucination_model,
            'classification_model': classification_model,
            'combo': model_combo
        }
        
        # Calculate accuracy for each feature
        for feature in features:
            feature_df = combo_df[combo_df['file_name'].str.contains(feature)]
            
            hall_acc = feature_df['hallucination_correct'].mean() if len(feature_df) > 0 else np.nan
            no_hall_acc = feature_df['no_hallucination_correct'].mean() if len(feature_df) > 0 else np.nan
            
            row_data[f'{feature}_hall_acc'] = hall_acc
            row_data[f'{feature}_no_hall_acc'] = no_hall_acc
        
        # Calculate overall accuracy
        row_data['overall_hall_acc'] = combo_df['hallucination_correct'].mean()
        row_data['overall_no_hall_acc'] = combo_df['no_hallucination_correct'].mean()
        
        summary_data.append(row_data)

summary_df = pd.DataFrame(summary_data)

# Create pivot tables for easier comparison
hall_pivot = pd.pivot_table(
    summary_df, 
    values=['overall_hall_acc'] + [f'{f}_hall_acc' for f in features],
    index='hallucination_model',
    columns='classification_model'
)

no_hall_pivot = pd.pivot_table(
    summary_df, 
    values=['overall_no_hall_acc'] + [f'{f}_no_hall_acc' for f in features],
    index='hallucination_model',
    columns='classification_model'
)

# Save summary results
summary_df.to_csv("cross_model_summary.csv", index=False)
hall_pivot.to_csv("cross_model_hall_pivot.csv")
no_hall_pivot.to_csv("cross_model_no_hall_pivot.csv")

# Generate difference analysis (how much hallucination affects accuracy)
diff_data = []
for _, row in summary_df.iterrows():
    diff_row = {
        'hallucination_model': row['hallucination_model'],
        'classification_model': row['classification_model'],
        'combo': row['combo'],
        'overall_diff': row['overall_no_hall_acc'] - row['overall_hall_acc']
    }
    
    for feature in features:
        diff_row[f'{feature}_diff'] = row[f'{feature}_no_hall_acc'] - row[f'{feature}_hall_acc']
    
    diff_data.append(diff_row)

diff_df = pd.DataFrame(diff_data)
diff_pivot = pd.pivot_table(
    diff_df,
    values=['overall_diff'] + [f'{f}_diff' for f in features],
    index='hallucination_model',
    columns='classification_model'
)

diff_df.to_csv("cross_model_diff.csv", index=False)
diff_pivot.to_csv("cross_model_diff_pivot.csv")

print("Analysis complete. Results saved to CSV files.")

# Return the summary dataframe for quick inspection
summary_df

Analysis complete. Results saved to CSV files.


Unnamed: 0,hallucination_model,classification_model,combo,enhancer_hall_acc,enhancer_no_hall_acc,promoter_hall_acc,promoter_no_hall_acc,splice_site_hall_acc,splice_site_no_hall_acc,methylated_hall_acc,methylated_no_hall_acc,protein_coding_hall_acc,protein_coding_no_hall_acc,overall_hall_acc,overall_no_hall_acc
0,gpt-3.5-turbo,gpt-3.5-turbo,gpt-3.5-turbo-gpt-3.5-turbo,0.5,0.5,0.5,0.49,0.507143,0.507143,0.510101,0.555556,0.489362,0.404255,0.501085,0.491323
1,gpt-3.5-turbo,gpt-4o-mini,gpt-3.5-turbo-gpt-4o-mini,0.5,0.5,0.5,0.5,0.485714,0.492857,0.5,0.5,0.281915,0.5,0.453463,0.498918
2,gpt-4o-mini,gpt-3.5-turbo,gpt-4o-mini-gpt-3.5-turbo,0.52551,0.489796,0.492462,0.512563,0.5,0.514286,0.517766,0.441624,0.351064,0.37234,0.477174,0.46413
3,gpt-4o-mini,gpt-4o-mini,gpt-4o-mini-gpt-4o-mini,0.510204,0.5,0.5,0.5,0.464286,0.514286,0.5,0.5,0.356383,0.510638,0.467532,0.504329
