In [1]:
import sys

# Add the sibling source and experiment folders to the module search path so
# that modules stored there can be imported from this notebook.
sys.path.extend(['../src', '../experiments'])

In [5]:
from run_experiments import *

In [3]:
# =============================================================================
# 🔐 API Keys Setup - Set your API keys here (optional if already configured)
# =============================================================================
import os

# Paste keys here only for a local session; leave them empty when the keys are
# already provided through .env or the process environment.
# NOTE: do not commit or share this notebook with real keys filled in.
OPENAI_API_KEY = ''    # Your OpenAI API key
LLAMA_API_KEY = ''     # Your LLaMA API key
DEEPSEEK_API_KEY = ''  # Your DeepSeek API key


def apply_notebook_api_key(env_var, notebook_value, provider):
    """Export one API key to the environment and report where it came from.

    A non-empty ``notebook_value`` takes precedence and overwrites any value
    already stored in ``os.environ[env_var]``. Surrounding whitespace (for
    example a trailing newline from copy/paste) is stripped first, so a
    whitespace-only value is treated as "not provided".

    Args:
        env_var: Name of the environment variable to set or inspect.
        notebook_value: Key typed into this notebook, or '' / None if unused.
        provider: Human-readable provider name used in the status message.

    Returns:
        'notebook' if the key was taken from ``notebook_value``,
        'environment' if the variable was already set to a non-empty value,
        'missing' otherwise.
    """
    key = (notebook_value or '').strip()
    if key:
        os.environ[env_var] = key
        print(f"✅ {provider} API key set from notebook")
        return 'notebook'
    if os.getenv(env_var):
        print(f"✅ {provider} API key already configured")
        return 'environment'
    print(f"⚠️  {provider} API key not available")
    return 'missing'


# A key typed above wins over the environment; otherwise the existing
# environment value (if any) is left untouched.
apply_notebook_api_key('OPENAI_API_KEY', OPENAI_API_KEY, 'OpenAI')
apply_notebook_api_key('LLAMA_API_KEY', LLAMA_API_KEY, 'LLaMA')
apply_notebook_api_key('DEEPSEEK_API_KEY', DEEPSEEK_API_KEY, 'DeepSeek')

print("\n💡 If all set, you can run experiments with any configured APIs!")

✅ OpenAI API key already configured
✅ LLaMA API key already configured
✅ DeepSeek API key already configured

💡 If all set, you can run experiments with any configured APIs!


### Compare LLM predictions with ground-truth labels

In [8]:
# Evaluate o3-mini on the sample CSV against the labels in `Expert_Label`.
# In the recorded run below, the call evaluated both `zero_shot_prompt` and
# `few_shot_prompt`, wrote per-experiment prediction CSVs plus a comparison
# summary under ../results/custom_experiments/, and returned a dict keyed
# "<model>_<prompt>".
# NOTE: `results` is rebound by later cells, so its value depends on which
# cell ran last.
results = run_experiment_with_custom_data(
    data_path="../data/sample_data/sample_posts_test.csv", 
    models=["o3-mini"],  
    post_id_col="PostId",
    content_col="Body", 
    expert_label_col="Expert_Label"
)

# List the experiment names available in the returned dict.
print(results.keys())

INFO:run_experiments:Running experiment with o3-mini using zero_shot_prompt


 Loading data from: ../data/sample_data/sample_posts_test.csv
 Loaded 5 posts
 Using columns: PostID='PostId', Content='Body', Label='Expert_Label'
 Label distribution: {'Neutral': 3, 'Positive': 2}

 Running experiment: o3-mini_zero_shot_prompt


Processing posts with o3-mini: 100%|█████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.86s/it]
INFO:run_experiments:Running experiment with o3-mini using few_shot_prompt


 Results saved to: ../results/custom_experiments/o3-mini_zero_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Running experiment: o3-mini_few_shot_prompt


Processing posts with o3-mini: 100%|█████████████████████████████████████████████████████| 5/5 [00:10<00:00,  2.12s/it]

 Results saved to: ../results/custom_experiments/o3-mini_few_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Generating comparison summary...
 Comparison summary saved to: ../results/custom_experiments/comparison_summary.csv

 Results Summary:
              Experiment  Accuracy  F1_Macro  Response_Rate
o3-mini_zero_shot_prompt       1.0       1.0            1.0
 o3-mini_few_shot_prompt       1.0       1.0            1.0

 Experiment completed! Results saved to: ../results/custom_experiments/
dict_keys(['o3-mini_zero_shot_prompt', 'o3-mini_few_shot_prompt'])





### Prediction only (no comparison with expert labels)

In [7]:
# Predict sentiment for the sample posts with two models; no expert-label
# column is passed, so no accuracy metrics are reported.
# In the recorded run below, predictions used `few_shot_prompt`, a timestamped
# CSV was written under ../results/predictions/, and one "Predicted_<model>"
# column was added per model.
# NOTE: this rebinds `results`, replacing the value from the previous cell.
results = predict_sentiment_batch(
    data_path="../data/sample_data/sample_posts_test.csv",
    models=["gpt-4o-mini", "llama3.1-70b"],
    post_id_col="PostId",
    content_col="Body",
    verbose=True
)


 Loading data from: ../data/sample_data/sample_posts_test.csv
 Loaded 5 posts for prediction
 Using columns: PostID='PostId', Content='Body'
 Predicting sentiment using prompt: few_shot_prompt

 Predicting with gpt-4o-mini...


Processing posts with gpt-4o-mini: 100%|█████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.36it/s]


 gpt-4o-mini: 5/5 predictions (100.0% success rate)
 Distribution: {'Neutral': 3, 'Positive': 2}

 Predicting with llama3.1-70b...


Processing posts with llama3.1-70b: 100%|████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.27s/it]

 llama3.1-70b: 5/5 predictions (100.0% success rate)
 Distribution: {'Neutral': 3, 'Positive': 2}

 Predictions saved to: ../results/predictions/sentiment_predictions_20250715_015718.csv
 Results summary:
   - Total posts: 5
   - Models used: ['gpt-4o-mini', 'llama3.1-70b']
   - New columns: ['Predicted_gpt-4o-mini', 'Predicted_llama3.1-70b']





In [8]:
# Display whatever `results` currently holds. The recorded output below is the
# prediction table (original columns plus one "Predicted_<model>" column per
# model) from the predict_sentiment_batch cell above.
# NOTE(review): several cells rebind `results`, so run that cell immediately
# before this one to reproduce the table.
results

Unnamed: 0,PostId,Body,Expert_Label,Category,Predicted_gpt-4o-mini,Predicted_llama3.1-70b
0,POST_001,"I thought I'd need help last night, but I mana...",Positive,Health Improvement,Positive,Positive
1,POST_018,The research shows mixed results for this trea...,Neutral,Generated,Neutral,Neutral
2,POST_016,Has anyone tried the new inhaler device? Wonde...,Neutral,Generated,Neutral,Neutral
3,POST_002,There was a fuss about the drug about ten year...,Neutral,Uncertainty,Neutral,Neutral
4,POST_009,The new inhaler technique really helped me dur...,Positive,Generated,Positive,Positive


### Run with real data
#### Before running with real data, make sure you have saved your CSV file in the folder `../data/real_data/`, then update the file name and column names in the cell below

In [4]:

# Evaluate llama3.1-70b on your own labelled data.
# In the recorded run below, both `zero_shot_prompt` and `few_shot_prompt`
# were evaluated and the outputs were saved under ../results/custom_experiments/.
# NOTE: this rebinds `results`, replacing the value from any earlier cell.
results = run_experiment_with_custom_data(
    data_path="../data/real_data/my_data.csv",  # change to your CSV file name
    models=["llama3.1-70b"],  
    post_id_col="PostId",  # change the three column names to match your CSV
    content_col="Body", 
    expert_label_col="Sentiment_XL"
)

# List the experiment names ("<model>_<prompt>") in the returned dict.
print(results.keys())

INFO:run_experiments:Running experiment with llama3.1-70b using zero_shot_prompt


 Loading data from: ../data/real_data/my_data.csv
 Loaded 9 posts
 Using columns: PostID='PostId', Content='Body', Label='Sentiment_XL'
 Label distribution: {'Negative': 5, 'Positive': 3, 'Neutral': 1}

 Running experiment: llama3.1-70b_zero_shot_prompt


Processing posts with llama3.1-70b: 100%|████████████████████████████████████████████████| 9/9 [00:11<00:00,  1.23s/it]
INFO:run_experiments:Running experiment with llama3.1-70b using few_shot_prompt


 Results saved to: ../results/custom_experiments/llama3.1-70b_zero_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Running experiment: llama3.1-70b_few_shot_prompt


Processing posts with llama3.1-70b: 100%|████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.53s/it]

 Results saved to: ../results/custom_experiments/llama3.1-70b_few_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Generating comparison summary...
 Comparison summary saved to: ../results/custom_experiments/comparison_summary.csv

 Results Summary:
                   Experiment  Accuracy  F1_Macro  Response_Rate
llama3.1-70b_zero_shot_prompt       1.0       1.0            1.0
 llama3.1-70b_few_shot_prompt       1.0       1.0            1.0

 Experiment completed! Results saved to: ../results/custom_experiments/
dict_keys(['llama3.1-70b_zero_shot_prompt', 'llama3.1-70b_few_shot_prompt'])





In [9]:
# Compare two LLaMA models on the same real-data CSV in a single call.
# In the recorded run below, each model was evaluated with both
# `zero_shot_prompt` and `few_shot_prompt`, giving four entries in the
# returned dict and four rows in the printed summary.
# NOTE: the output paths shown below are the same ones used by the other
# experiment cells (../results/custom_experiments/), so earlier files with the
# same names are presumably overwritten; verify before relying on them.
results = run_experiment_with_custom_data(
    data_path="../data/real_data/my_data.csv", 
    models=["llama3.1-70b","llama3.1-405b"],  
    post_id_col="PostId",
    content_col="Body", 
    expert_label_col="Sentiment_XL"
)

# List the experiment names ("<model>_<prompt>") in the returned dict.
print(results.keys())

INFO:run_experiments:Running experiment with llama3.1-70b using zero_shot_prompt


 Loading data from: ../data/real_data/my_data.csv
 Loaded 9 posts
 Using columns: PostID='PostId', Content='Body', Label='Sentiment_XL'
 Label distribution: {'Negative': 5, 'Positive': 3, 'Neutral': 1}

 Running experiment: llama3.1-70b_zero_shot_prompt


Processing posts with llama3.1-70b: 100%|████████████████████████████████████████████████| 9/9 [00:12<00:00,  1.34s/it]
INFO:run_experiments:Running experiment with llama3.1-70b using few_shot_prompt


 Results saved to: ../results/custom_experiments/llama3.1-70b_zero_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Running experiment: llama3.1-70b_few_shot_prompt


Processing posts with llama3.1-70b: 100%|████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.22s/it]
INFO:run_experiments:Running experiment with llama3.1-405b using zero_shot_prompt


 Results saved to: ../results/custom_experiments/llama3.1-70b_few_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Running experiment: llama3.1-405b_zero_shot_prompt


Processing posts with llama3.1-405b: 100%|███████████████████████████████████████████████| 9/9 [00:13<00:00,  1.52s/it]
INFO:run_experiments:Running experiment with llama3.1-405b using few_shot_prompt


 Results saved to: ../results/custom_experiments/llama3.1-405b_zero_shot_prompt_predictions.csv
 Accuracy: 0.889, F1: 0.852, Response Rate: 1.000

 Running experiment: llama3.1-405b_few_shot_prompt


Processing posts with llama3.1-405b: 100%|███████████████████████████████████████████████| 9/9 [00:27<00:00,  3.02s/it]

 Results saved to: ../results/custom_experiments/llama3.1-405b_few_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Generating comparison summary...
 Comparison summary saved to: ../results/custom_experiments/comparison_summary.csv

 Results Summary:
                    Experiment  Accuracy  F1_Macro  Response_Rate
 llama3.1-70b_zero_shot_prompt     1.000     1.000            1.0
  llama3.1-70b_few_shot_prompt     1.000     1.000            1.0
llama3.1-405b_zero_shot_prompt     0.889     0.852            1.0
 llama3.1-405b_few_shot_prompt     1.000     1.000            1.0

 Experiment completed! Results saved to: ../results/custom_experiments/
dict_keys(['llama3.1-70b_zero_shot_prompt', 'llama3.1-70b_few_shot_prompt', 'llama3.1-405b_zero_shot_prompt', 'llama3.1-405b_few_shot_prompt'])





In [5]:
# Evaluate deepseek-chat on the real-data CSV.
# In the recorded run below, both `zero_shot_prompt` and `few_shot_prompt`
# were evaluated; this was the slowest recorded run (several seconds per post).
# NOTE: this rebinds `results`, replacing the value from any earlier cell.
results = run_experiment_with_custom_data(
    data_path="../data/real_data/my_data.csv", 
    models=["deepseek-chat"],  
    post_id_col="PostId",
    content_col="Body", 
    expert_label_col="Sentiment_XL"
)

# List the experiment names ("<model>_<prompt>") in the returned dict.
print(results.keys())

INFO:run_experiments:Running experiment with deepseek-chat using zero_shot_prompt


 Loading data from: ../data/real_data/my_data.csv
 Loaded 9 posts
 Using columns: PostID='PostId', Content='Body', Label='Sentiment_XL'
 Label distribution: {'Negative': 5, 'Positive': 3, 'Neutral': 1}

 Running experiment: deepseek-chat_zero_shot_prompt


Processing posts with deepseek-chat: 100%|███████████████████████████████████████████████| 9/9 [01:20<00:00,  8.99s/it]
INFO:run_experiments:Running experiment with deepseek-chat using few_shot_prompt


 Results saved to: ../results/custom_experiments/deepseek-chat_zero_shot_prompt_predictions.csv
 Accuracy: 1.000, F1: 1.000, Response Rate: 1.000

 Running experiment: deepseek-chat_few_shot_prompt


Processing posts with deepseek-chat: 100%|███████████████████████████████████████████████| 9/9 [00:46<00:00,  5.12s/it]

 Results saved to: ../results/custom_experiments/deepseek-chat_few_shot_prompt_predictions.csv
 Accuracy: 0.889, F1: 0.619, Response Rate: 1.000

 Generating comparison summary...
 Comparison summary saved to: ../results/custom_experiments/comparison_summary.csv

 Results Summary:
                    Experiment  Accuracy  F1_Macro  Response_Rate
deepseek-chat_zero_shot_prompt     1.000     1.000            1.0
 deepseek-chat_few_shot_prompt     0.889     0.619            1.0

 Experiment completed! Results saved to: ../results/custom_experiments/
dict_keys(['deepseek-chat_zero_shot_prompt', 'deepseek-chat_few_shot_prompt'])





In [6]:
# Evaluate gpt-4.1-mini on the real-data CSV with console output reduced.
# With verbose=False, the recorded run below shows only the logger lines, the
# progress bars and the final dict keys; no loading details or metrics summary
# are printed, so inspect `results` for the scores.
# NOTE: this rebinds `results`, replacing the value from any earlier cell.
results = run_experiment_with_custom_data(
    data_path="../data/real_data/my_data.csv", 
    models=["gpt-4.1-mini"],  
    post_id_col="PostId",
    content_col="Body", 
    expert_label_col="Sentiment_XL",
    verbose=False
)

# List the experiment names ("<model>_<prompt>") in the returned dict.
print(results.keys())

INFO:run_experiments:Running experiment with gpt-4.1-mini using zero_shot_prompt
Processing posts with gpt-4.1-mini: 100%|████████████████████████████████████████████████| 9/9 [00:05<00:00,  1.50it/s]
INFO:run_experiments:Running experiment with gpt-4.1-mini using few_shot_prompt
Processing posts with gpt-4.1-mini: 100%|████████████████████████████████████████████████| 9/9 [00:04<00:00,  1.83it/s]

dict_keys(['gpt-4.1-mini_zero_shot_prompt', 'gpt-4.1-mini_few_shot_prompt'])





In [7]:
# Evaluate o3-mini on the real-data CSV with console output reduced.
# With verbose=False, the recorded run below shows only the logger lines, the
# progress bars and the final dict keys; no loading details or metrics summary
# are printed, so inspect `results` for the scores.
# NOTE: this rebinds `results`, replacing the value from any earlier cell.
results = run_experiment_with_custom_data(
    data_path="../data/real_data/my_data.csv", 
    models=["o3-mini"],  
    post_id_col="PostId",
    content_col="Body", 
    expert_label_col="Sentiment_XL",
    verbose=False
)

# List the experiment names ("<model>_<prompt>") in the returned dict.
print(results.keys())

INFO:run_experiments:Running experiment with o3-mini using zero_shot_prompt
Processing posts with o3-mini: 100%|█████████████████████████████████████████████████████| 9/9 [00:28<00:00,  3.14s/it]
INFO:run_experiments:Running experiment with o3-mini using few_shot_prompt
Processing posts with o3-mini: 100%|█████████████████████████████████████████████████████| 9/9 [00:32<00:00,  3.62s/it]

dict_keys(['o3-mini_zero_shot_prompt', 'o3-mini_few_shot_prompt'])



