In [9]:
from dotenv import load_dotenv
import os
from openai import OpenAI
import pandas as pd
from Bio import SeqIO
load_dotenv()
nvda = os.getenv("nvda")
client = OpenAI(
  base_url = "https://integrate.api.nvidia.com/v1",
  api_key = nvda
)
from tqdm import tqdm
import numpy as np

In [10]:
fasta_dir = "sequences"

# Mapping of base categories to questions
questions = {
    "protein_coding": "Does this nucleotide sequence encode a protein? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "enhancer": "Does this nucleotide sequence function as an enhancer in gene regulation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "promoter": "Does this nucleotide sequence act as a promoter for transcription initiation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "splice_site": "Does this nucleotide sequence contain a splice site for RNA processing? Only answer Yes or No. You must start your answer with 'Yes' or 'No'.",
    "methylated": "Is this nucleotide sequence methylated as part of epigenetic regulation? Only answer Yes or No. You must start your answer with 'Yes' or 'No'."
}

# Dictionary to hold sequence data
sequence_arrays = {}

# Loop through all FASTA files in the directory
for file in os.listdir(fasta_dir):
    if file.endswith(".fasta"):  # Process only FASTA files
        file_path = os.path.join(fasta_dir, file)
        var_name = os.path.splitext(file)[0]  # Use file name (without extension) as key

        # Determine base category
        if var_name.startswith("non_"):
            base_category = var_name[4:]  # Remove "non_" prefix
        else:
            base_category = var_name
        question = questions[base_category]  # Get the corresponding question

        # Read the FASTA file and create a list of dictionaries
        gene_dicts = []
        for record in SeqIO.parse(file_path, "fasta"):
            gene_dict = {
                "file_name": var_name,
                "gene": str(record.seq),
                "question": question,  # Add the question key
            }
            gene_dicts.append(gene_dict)
        
        # Store the list of dictionaries in the dictionary
        sequence_arrays[var_name] = gene_dicts

        print(f"Stored {var_name} with {len(gene_dicts)} genes.")


Stored enhancer with 98 genes.
Stored methylated with 100 genes.
Stored non_enhancer with 98 genes.
Stored non_methylated with 100 genes.
Stored non_promoter with 100 genes.
Stored non_protein_coding with 88 genes.
Stored non_splice_site with 70 genes.
Stored promoter with 100 genes.
Stored protein_coding with 100 genes.
Stored splice_site with 70 genes.


In [11]:
def generate_hallucination(gene_dict, model):
    """Generate a hallucination description for a gene sequence"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} Describe the gene in natural language:"}
        ],
        max_tokens=256,
        temperature=0.6
    )
    return response.choices[0].message.content


In [12]:
def generate_classification(gene_dict, model):
    """Generate a classification (Yes/No) for a gene sequence"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} {gene_dict['question']}"}
        ],
        max_tokens=256,
        temperature=0.6
    )
    return response.choices[0].message.content

In [13]:
def generate_hallucination_classification(gene_dict, model, hallucination):
    """Generate a classification with hallucination included"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in genomics."},
            {"role": "user", "content": f"{gene_dict['gene']} {hallucination} {gene_dict['question']}"}
        ],
        max_tokens=256,
        temperature=0.6
    )
    return response.choices[0].message.content

In [14]:
all_results = []

MODELS = [
    "meta/llama3-8b-instruct",
    "meta/llama-3.1-8b-instruct",
    "meta/llama-3.3-70b-instruct",
    "tiiuae/falcon3-7b-instruct",
    "qwen/qwen2.5-7b-instruct",
    ]

# Dictionary to store hallucinations for reuse
# Dictionary to store hallucinations for reuse
hallucinations = {}

# First, generate hallucinations with each model
print("Generating hallucinations...")
for hallucination_model in MODELS:
    hallucinations[hallucination_model] = {}
    for file_name, sequences in tqdm(sequence_arrays.items(), desc=f"Hallucinations for {hallucination_model}"):
        hallucinations[hallucination_model][file_name] = {}
        for i, gene_dict in enumerate(sequences):
            try:
                hallucination = generate_hallucination(gene_dict, hallucination_model)
                hallucinations[hallucination_model][file_name][i] = hallucination
            except Exception as e:
                print(f"Error generating hallucination for {file_name}, gene {i}: {str(e)}")
                hallucinations[hallucination_model][file_name][i] = None

# Results will be stored here
all_results = []

# Now, for each combination of hallucination model and classification model
progress = tqdm(total=len(MODELS) * len(MODELS))
for hallucination_model in MODELS:
    for classification_model in MODELS:
        progress.set_description(f"Hall: {hallucination_model} / Class: {classification_model}")
        
        # For each file (category of sequences)
        for file_name, sequences in sequence_arrays.items():
            for i, gene_dict in enumerate(sequences):
                result = gene_dict.copy()
                
                # Add model info
                result['hallucination_model'] = hallucination_model
                result['classification_model'] = classification_model
                
                # Get the pre-generated hallucination
                hallucination = hallucinations[hallucination_model][file_name].get(i)
                result['hallucination'] = hallucination
                
                # Skip if hallucination was not generated
                if hallucination is None:
                    continue
                
                # Generate direct classification (without hallucination)
                try:
                    result['classification'] = generate_classification(result, classification_model)
                except Exception as e:
                    print(f"Error generating classification: {str(e)}")
                    result['classification'] = None
                
                # Generate classification with hallucination
                try:
                    result['hallucination_classification'] = generate_hallucination_classification(
                        result, classification_model, hallucination
                    )
                except Exception as e:
                    print(f"Error generating hallucination classification: {str(e)}")
                    result['hallucination_classification'] = None
                
                all_results.append(result)
        
        progress.update(1)
progress.close()

Generating hallucinations...


Hallucinations for meta/llama3-8b-instruct: 100%|██████████| 10/10 [28:35<00:00, 171.58s/it]
Hallucinations for meta/llama-3.1-8b-instruct: 100%|██████████| 10/10 [55:05<00:00, 330.50s/it]
Hallucinations for meta/llama-3.3-70b-instruct: 100%|██████████| 10/10 [56:00<00:00, 336.04s/it]
Hallucinations for tiiuae/falcon3-7b-instruct: 100%|██████████| 10/10 [53:15<00:00, 319.60s/it]
Hallucinations for qwen/qwen2.5-7b-instruct: 100%|██████████| 10/10 [1:28:55<00:00, 533.51s/it]
Hall: qwen/qwen2.5-7b-instruct / Class: qwen/qwen2.5-7b-instruct: 100%|██████████| 25/25 [4:09:49<00:00, 599.57s/it]        


In [None]:
df = pd.DataFrame(all_results)

df['correct'] = df['file_name'].str.contains("non", case=True, na=False).map({True: "No", False: "Yes"})

df['hallucination_classification'] = df['hallucination_classification'].str.replace(".", "", regex=False)
df['classification'] = df['classification'].str.replace(".", "", regex=False)


In [17]:
def classify_value(x):
    if isinstance(x, str):
        parts = x.split()
        if parts:  # only process non-empty strings
            first = parts[0].lower()
            if first.startswith('yes'):
                return 'Yes'
            elif first.startswith('no'):
                return 'No'
    return np.nan

df['hallucination_classification'] = df['hallucination_classification'].apply(classify_value)
print(df['hallucination_classification'].values)

df['classification'] = df['classification'].apply(classify_value)
print(df['classification'].values)


['Yes' 'Yes' 'Yes' ... 'No' 'No' 'No']
['No' 'No' 'No' ... 'No' 'No' 'No']


In [20]:
df[df['hallucination_classification'].isna()]['hallucination_model'].value_counts()

hallucination_model
meta/llama-3.1-8b-instruct     6
qwen/qwen2.5-7b-instruct       4
meta/llama-3.3-70b-instruct    2
Name: count, dtype: int64

In [21]:
df[df['classification'].isna()]['hallucination_model'].value_counts()

hallucination_model
meta/llama-3.3-70b-instruct    1
qwen/qwen2.5-7b-instruct       1
Name: count, dtype: int64

In [22]:
df = df.dropna(subset=['hallucination_classification', 'classification', 'correct'])

# Add correctness columns
df['hallucination_correct'] = df['hallucination_classification'] == df['correct']
df['no_hallucination_correct'] = df['classification'] == df['correct']

# Save full results
df.to_csv("cross_model_hallucination_OPEN_results.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hallucination_correct'] = df['hallucination_classification'] == df['correct']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['no_hallucination_correct'] = df['classification'] == df['correct']


In [23]:
features = ['enhancer', 'promoter', 'splice_site', 'methylated', 'protein_coding']
summary_data = []

for hallucination_model in MODELS:
    for classification_model in MODELS:
        model_combo = f"{hallucination_model}-{classification_model}"
        combo_df = df[(df['hallucination_model'] == hallucination_model) & 
                       (df['classification_model'] == classification_model)]
        
        row_data = {
            'hallucination_model': hallucination_model,
            'classification_model': classification_model,
            'combo': model_combo
        }
        
        # Calculate accuracy for each feature
        for feature in features:
            feature_df = combo_df[combo_df['file_name'].str.contains(feature)]
            
            hall_acc = feature_df['hallucination_correct'].mean() if len(feature_df) > 0 else np.nan
            no_hall_acc = feature_df['no_hallucination_correct'].mean() if len(feature_df) > 0 else np.nan
            
            row_data[f'{feature}_hall_acc'] = hall_acc
            row_data[f'{feature}_no_hall_acc'] = no_hall_acc
        
        # Calculate overall accuracy
        row_data['overall_hall_acc'] = combo_df['hallucination_correct'].mean()
        row_data['overall_no_hall_acc'] = combo_df['no_hallucination_correct'].mean()
        
        summary_data.append(row_data)

summary_df = pd.DataFrame(summary_data)

# Create pivot tables for easier comparison
hall_pivot = pd.pivot_table(
    summary_df, 
    values=['overall_hall_acc'] + [f'{f}_hall_acc' for f in features],
    index='hallucination_model',
    columns='classification_model'
)

no_hall_pivot = pd.pivot_table(
    summary_df, 
    values=['overall_no_hall_acc'] + [f'{f}_no_hall_acc' for f in features],
    index='hallucination_model',
    columns='classification_model'
)

# Save summary results
summary_df.to_csv("cross_model_OPEN_summary.csv", index=False)
hall_pivot.to_csv("cross_model_hall_OPEN_pivot.csv")
no_hall_pivot.to_csv("cross_model_OPEN_no_hall_pivot.csv")

# Generate difference analysis (how much hallucination affects accuracy)
diff_data = []
for _, row in summary_df.iterrows():
    diff_row = {
        'hallucination_model': row['hallucination_model'],
        'classification_model': row['classification_model'],
        'combo': row['combo'],
        'overall_diff': row['overall_no_hall_acc'] - row['overall_hall_acc']
    }
    
    for feature in features:
        diff_row[f'{feature}_diff'] = row[f'{feature}_no_hall_acc'] - row[f'{feature}_hall_acc']
    
    diff_data.append(diff_row)

diff_df = pd.DataFrame(diff_data)
diff_pivot = pd.pivot_table(
    diff_df,
    values=['overall_diff'] + [f'{f}_diff' for f in features],
    index='hallucination_model',
    columns='classification_model'
)

diff_df.to_csv("cross_model_OPEN_diff.csv", index=False)
diff_pivot.to_csv("cross_model_OPEN_diff_pivot.csv")

print("Analysis complete. Results saved to CSV files.")

# Return the summary dataframe for quick inspection
summary_df

Analysis complete. Results saved to CSV files.


Unnamed: 0,hallucination_model,classification_model,combo,enhancer_hall_acc,enhancer_no_hall_acc,promoter_hall_acc,promoter_no_hall_acc,splice_site_hall_acc,splice_site_no_hall_acc,methylated_hall_acc,methylated_no_hall_acc,protein_coding_hall_acc,protein_coding_no_hall_acc,overall_hall_acc,overall_no_hall_acc
0,meta/llama3-8b-instruct,meta/llama3-8b-instruct,meta/llama3-8b-instruct-meta/llama3-8b-instruct,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.56383,0.202128,0.512987,0.439394
1,meta/llama3-8b-instruct,meta/llama-3.1-8b-instruct,meta/llama3-8b-instruct-meta/llama-3.1-8b-inst...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.558511,0.398936,0.511905,0.479437
2,meta/llama3-8b-instruct,meta/llama-3.3-70b-instruct,meta/llama3-8b-instruct-meta/llama-3.3-70b-ins...,0.5,0.5,0.46,0.5,0.5,0.507143,0.5,0.5,0.569149,0.505319,0.505411,0.502165
3,meta/llama3-8b-instruct,tiiuae/falcon3-7b-instruct,meta/llama3-8b-instruct-tiiuae/falcon3-7b-inst...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.574468,0.457447,0.515152,0.491342
4,meta/llama3-8b-instruct,qwen/qwen2.5-7b-instruct,meta/llama3-8b-instruct-qwen/qwen2.5-7b-instruct,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.56383,0.468085,0.512987,0.493506
5,meta/llama-3.1-8b-instruct,meta/llama3-8b-instruct,meta/llama-3.1-8b-instruct-meta/llama3-8b-inst...,0.5,0.5,0.505,0.5,0.5,0.5,0.5,0.5,0.569149,0.202128,0.515152,0.439394
6,meta/llama-3.1-8b-instruct,meta/llama-3.1-8b-instruct,meta/llama-3.1-8b-instruct-meta/llama-3.1-8b-i...,0.5,0.5,0.505,0.5,0.5,0.5,0.505,0.5,0.585106,0.398936,0.519565,0.479348
7,meta/llama-3.1-8b-instruct,meta/llama-3.3-70b-instruct,meta/llama-3.1-8b-instruct-meta/llama-3.3-70b-...,0.5,0.5,0.5,0.5,0.5,0.492857,0.5,0.5,0.606383,0.505319,0.521645,0.5
8,meta/llama-3.1-8b-instruct,tiiuae/falcon3-7b-instruct,meta/llama-3.1-8b-instruct-tiiuae/falcon3-7b-i...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.62234,0.457447,0.524946,0.491323
9,meta/llama-3.1-8b-instruct,qwen/qwen2.5-7b-instruct,meta/llama-3.1-8b-instruct-qwen/qwen2.5-7b-ins...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.606383,0.468085,0.521645,0.493506
