In [97]:
import csv
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.qa import QAEvalChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import os
from src.data import *

In [82]:
# Experiment configuration: everything a reader might tune lives in this cell.
train, test = get_datasets()  # presumably provided by the `from src.data import *` star import -- confirm
model = "gpt-3.5-turbo"       # OpenAI chat model name used for both answering and grading
cot_bool = False              # True -> append a "think step by step" cue to the prompt
authority = ""                # optional text placed directly before the first variable in the statement
verb = "causes"               # causal verb placed between the two variables

# SECURITY: never hardcode API keys in a notebook; they leak through version
# control and shared outputs. The key must be supplied by the environment
# (e.g. `export OPENAI_API_KEY=...` before launching Jupyter). Any key that was
# ever committed in this cell should be treated as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before "
        "starting the kernel instead of hardcoding it in the notebook."
    )

In [91]:
# System message: fixes the assistant's role for every request.
system_template = "You are an AI assistant for helping humans build causal diagrams. Assess the validity of the given causal relationship between two variables."

# Human message: the statement under test. The placeholders are filled per
# example: {ans_format} and {cot} carry the answer-format instruction and the
# optional chain-of-thought cue; {authority}{var1} {verb} {var2} form the
# causal statement itself.
human_template = (
    "The following statement expresses a causal relationship between two variables.{ans_format}\n"
    "----------------\n"
    "STATEMENT: {authority}{var1} {verb} {var2}.\n"
    "ANSWER:{cot}"
)

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(human_template),
]
prompt = ChatPromptTemplate.from_messages(messages)

# temperature=0 is requested for the answering model.
chat = LLMChain(
    llm=ChatOpenAI(model_name=model, temperature=0),
    prompt=prompt,
)

In [92]:
#Generate answers to prompts

# Pick the answer-format instruction and the optional chain-of-thought cue.
# Both strings start with a newline because they are spliced directly after
# other text in the human prompt template.
if cot_bool:
    cot = "\nLet's think step by step."
    ans_format = "\nYour answer should determine whether the causal relationship is true or false."
else:
    cot = ""
    ans_format = "\nYour answer should just be 'TRUE' if the causal relationship is true or 'FALSE' if the causal relationship is false."


def _ask_model(pair, expected_answer):
    """Query the chat chain about one variable pair and package the outcome.

    Args:
        pair: indexable whose first two items are the cause and effect
            variables of the statement.
        expected_answer: ground-truth label recorded with the example
            ("TRUE" or "FALSE").

    Returns:
        A dict with the keys "query" (the statement text), "answer" (the
        ground-truth label) and "result" (the model's raw reply).
    """
    var1, var2 = pair[0], pair[1]
    result = chat.run(
        cot=cot,
        ans_format=ans_format,
        authority=authority,
        var1=var1,
        verb=verb,
        var2=var2,
    )
    # Use a descriptive record instead of a variable named `dict`, which would
    # shadow the builtin for the rest of the kernel session.
    return {
        "query": "{}{} {} {}".format(authority, var1, verb, var2),
        "answer": expected_answer,
        "result": result,
    }


# Only a small sample is evaluated; raise the cap (up to the dataset size) for
# a full run. The min() guards against datasets shorter than the cap.
# test[0] entries are recorded as TRUE statements and test[1] entries as FALSE.
num_pairs = min(2, len(test[0]), len(test[1]))

examples = []
for i in range(num_pairs):
    examples.append(_ask_model(test[0][i], "TRUE"))
    examples.append(_ask_model(test[1][i], "FALSE"))

# Each record already carries the model output under "result", so a shallow
# copy of the examples list doubles as the predictions list.
predictions = examples[:]


System: You are an AI assistant for helping humans build causal diagrams. Assess the validity of the given causal relationship between two variables.
Human: The following statement expresses a causal relationship between two variables.
Your answer should just be 'TRUE' if the causal relationship is true or 'FALSE' if the causal relationship is false.
----------------
STATEMENT: EIF3A causes Lung Neoplasms.
ANSWER:
System: You are an AI assistant for helping humans build causal diagrams. Assess the validity of the given causal relationship between two variables.
Human: The following statement expresses a causal relationship between two variables.
Your answer should just be 'TRUE' if the causal relationship is true or 'FALSE' if the causal relationship is false.
----------------
STATEMENT: acetaminophen glucuronide causes rs3130690.
ANSWER:
System: You are an AI assistant for helping humans build causal diagrams. Assess the validity of the given causal relationship between two variables.

In [88]:
#Set up evaluation framework and evaluate results
llm = ChatOpenAI(model_name=model, temperature=0)

# Grading prompt: the grader LLM sees the statement, the model's reply and the
# ground-truth label, and is asked to emit CORRECT or INCORRECT.
template = """You are a teacher grading a quiz.
You are given a question, the student's answer, and the true answer, and are asked to score it as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Please remember to grade them based on being factually accurate. If the STUDENT ANSWER has any conflicting answers, then the GRADE should be INCORRECT. If the STUDENT ANSWER has more than one answer, then the GRADE should be INCORRECT. If the STUDENT ANSWER does not clearly have just one correct answer, then the GRADE should be INCORRECT. Begin!

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:"""

# Named `eval_prompt` rather than `prompt` so this cell does not overwrite the
# chat prompt object created for answer generation (a different kind of
# object under the same name is a hidden-state hazard in notebooks).
eval_prompt = PromptTemplate(
    input_variables=["query", "result", "answer"],
    template=template,
)

eval_chain = QAEvalChain.from_llm(llm, prompt=eval_prompt)
# `examples` and `predictions` hold the same records; each one carries the
# "query", "answer" and "result" fields the grading template refers to.
graded_outputs = eval_chain.evaluate(examples, predictions)

In [110]:
def _is_correct(grade_text):
    """Return True when the grader's reply reads as CORRECT.

    Tolerates surrounding whitespace, letter case, a leading "GRADE:" label and
    trailing punctuation. "INCORRECT" never matches because the check is a
    prefix test on the normalized verdict.
    """
    verdict = grade_text.strip().upper()
    if verdict.startswith("GRADE:"):
        verdict = verdict[len("GRADE:"):].strip()
    return verdict.startswith("CORRECT")


def _safe_ratio(numerator, denominator):
    """Divide, returning NaN instead of raising when the denominator is zero.

    A metric is undefined when its denominator is empty (e.g. precision when
    the model never answered TRUE); NaN records that without crashing the cell.
    """
    if denominator == 0:
        return float("nan")
    return numerator / denominator


# Confusion-matrix counts. "Positive" means the ground-truth label is TRUE.
tp = 0
tn = 0
fp = 0
fn = 0

for example, graded in zip(examples, graded_outputs):
    correct = _is_correct(graded['text'])
    if example['answer'] == 'TRUE':
        if correct:
            tp += 1
        else:
            fn += 1  # a TRUE statement the model got wrong
    else:
        if correct:
            tn += 1
        else:
            fp += 1  # a FALSE statement the model got wrong

# One summary row per experiment run; the keys become the CSV column names.
results = [
    {
        "Model": model,
        "Number of Examples": len(examples),
        "Verb": verb,
        "Authority": authority,
        "Uses CoT": cot_bool,
        "Overall Answer Accuracy": _safe_ratio(tp + tn, tp + tn + fp + fn),
        "Precision": _safe_ratio(tp, tp + fp),
        "Recall": _safe_ratio(tp, tp + fn),
        "Specificity": _safe_ratio(tn, tn + fp),
    }
]

def export_to_csv(data, file_name):
    """Append a list of row dicts to a CSV file, writing a header when needed.

    Args:
        data: list of dicts; the keys of the first dict define the columns.
            An empty list is a no-op (no file is created or touched).
        file_name: path of the CSV file to append to.

    The header row is written when the file does not exist yet or exists but
    is empty. Columns of an existing non-empty file are not checked against
    the new rows, so keep the row schema stable between runs.
    """
    if not data:
        return

    fieldnames = list(data[0].keys())
    # Decide before opening: opening in append mode creates the file.
    needs_header = (not os.path.isfile(file_name)) or os.path.getsize(file_name) == 0

    # newline="" lets the csv module control line endings itself.
    with open(file_name, "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(data)

# Persist this run's summary row alongside earlier runs.
export_to_csv(results, "experiment_results.csv")

# Echo the summary, one "name : value" line per field. Iterating items()
# avoids rebuilding the key and value lists on every loop iteration.
for field_name, field_value in results[0].items():
    print(field_name, ':', field_value)


Model : gpt-3.5-turbo
Number of Examples : 4
Verb : causes
Authority : 
Uses CoT : False
Overall Answer Accuracy : 0.5
Recall : 0.0
Specificity : 1.0
