# Evaluate and test the model pipeline
#### The purpose of this notebook is to assess and test the model pipeline as standalone functions
#### It also runs the politifact data.
#### Author: Michael Denton
#### Published Date: April 15, 2024

In [1]:
import pandas as pd
import os
import operator
from langgraph.graph import StateGraph, END
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser, CommaSeparatedListOutputParser
from langchain_community.llms import Ollama
from typing import TypedDict, Annotated, Sequence
from langchain_core.messages import BaseMessage
from langchain.tools import Tool
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain.agents import AgentType, initialize_agent, load_tools

from typing import List
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import json


def run_claim_judge_pipeline(text, search_version=0, one_claim = False, llm_to_use = 'mistral', baseline = False, study_mode = False):
    """
    Runs the judgement pipeline end to end

    Search API versions for Google: 
    - 0 = Entire web (v0)
    - 1 = politifact plus dictionaries (v1)
    - 2 = Just dictionaries (v2)
    - 3 = News sources (new york times, la times, washington post, economist, hbr)
    - 4 = .gov websites plus news sources (new york times, washington post, la times, reuters, bloomberg)
    - 5 = Entire web excluding politifact
    """

    # text = "Jupter is the smallest planet in the solar system."
    # search_version = 0
    # llm_to_use = 'mistral'
    
    # Keys removed for privacy reasons
    search_dict_for_keys = {
        0: "",
        1: "",
        2: "",
        3: "",
        4: "",
        5: "",
    }

    if search_version not in list(search_dict_for_keys.keys()):
        raise Exception('API search version not supported. Try one of 0,1,2,3.')


    # Set API key and CSE ID
    os.environ["GOOGLE_CSE_ID"] = search_dict_for_keys[search_version]
    os.environ["GOOGLE_API_KEY"] = ""
    # os.environ["TAVILY_API_KEY"] = ""

    if llm_to_use == 'llama2':
        llm = Ollama(model="llama2")
    elif llm_to_use == 'mistral':
        llm = Ollama(model="mistral-openorca")
    else:
        raise Exception('Selected LLM not supported')

    # Define the output parser
    string_parser = StrOutputParser()
    # json_parser = JsonOutputParser()
    list_parser = CommaSeparatedListOutputParser()

    claims_agent_messages = [
        ("system", "[INST]You are tasked with extracting the verifiable claims from the user query."),
        ("system", "Each claim should exist as a unique immutable string in a python list without any extras, like this:\n['first claim, 'second claim', 'third claim']\n"),
        ("system", "The domain focus is politics."),
        ("system", "One statement per claim."),
        ("user", "The user query:\n{input}\n"),
        ("system", "[/INST]")
    ]

    judge_agent_messages = [
        ("system", "[INST]You are tasked with judging whether the following claims are contained within the text."),
        ("system", "If these Claims, Facts, or Opinions are not in the original text, you will answer 'no'. If they are, answer 'yes'"),
        ("system", "The claims are:\n{extracted_claims}\n"),
        ("system", "These were claims from the following text:\n{input}\n"),
        ("system", "Are the claims are contained within the text? Answer only 'yes' or 'no' with no other text.[/INST]")
    ]

    feedback_agent_messages = [
        ("system", "Agents have assessed text for verifiable claims"),
        ("system", "A judge has compared these to the original text and decided that the extracted claims are not sufficient based on the original text."),
        ("system", "Given the analysis and the original text, provide instructions to improve the output."),
        ("user", "The verifiable claims are:\n{extracted_claims}\n"),
        ("user", "The origin text is:\n{input}\n"),
        ("user", "What changes should the agents implement?"),
    ]


    feedback_prompt = "You have previously done this before, and a judge has suggested the following notes to improve your answer: {reinforcer_notes}"


    # claims_agent = claims_agent_messages | llm | list_parser
    # judge_agent = judge_agent_messages | llm | list_parser

    # Define the agent state
    class AgentState(TypedDict):
        input_text: str
        claims_to_check: list[str]
        baseline: bool
        apiversion: int
        reinforcer_notes: str
        judge_output: str
        iteration: int
        max_iterations: int
        final_output: list
        responses: list

    def input(state):
        input_text = state["input_text"]
        
        return {"input_text": input_text}

    def is_one_sentence(state):
        """
        Checks if one sentence, if it is output go to search.
        Otherwise we'll parse out facts in each sentence
        """

        input_text = state["input_text"]
        if (len([val for val in str(input_text).replace('"','').replace("'",'').replace('\n','').replace('\t','').split('.') if val != '']) == 1) or one_claim:
            return 'go_to_search'
        return 'extract_claims_from_text'


    def extract_claims(state):
        input_text = state["input_text"]
        if state.get("reinforcer_notes"):
            claims_agent_messages.append(("system", feedback_prompt))
        reinforcer_notes = state.get("reinforcer_notes", "")

        claims_prompt = ChatPromptTemplate.from_messages(claims_agent_messages)
        claims_agent = claims_prompt | llm | list_parser
        response = claims_agent.invoke({"input": input_text, "reinforcer_notes": reinforcer_notes})
        print(response)
        # The format comes back as a list within a list
        if 'list' in str(type(response)).lower():
            
            if len(response) == 1:
                new_response = response[0]
            else:
                new_response = response

            if 'list' in str(type(new_response)).lower():
                if len(new_response) > 1:
                    formatted_responses = []
                    for val in new_response:
                        val = val.replace('[','').replace(']','').replace('<|im_end|>','')
                        if '\n' in val:
                            vals = val.split('\n')
                            formatted_responses = formatted_responses + vals
                        else:
                            formatted_responses.append(val)
                    response = formatted_responses
                

        return {"claims_to_check": response}

    def extract_judgement(state):
        formatted_extraction = state.get("claims_to_check")
        input = state.get("input_text")

        judge_prompt = ChatPromptTemplate.from_messages(judge_agent_messages)
        judge_agent = judge_prompt | llm | string_parser
        verdict = judge_agent.invoke({"input": input, "extracted_claims": formatted_extraction})
        print(verdict)
        if 'yes' in str(verdict).lower():
            verdict = 'yes'
        else:
            verdict = 'no'

        return {"judge_output": verdict}

    def feedback_on_incorrect_extractions(state):
        formatted_extraction = state.get("claims_to_check")
        input = state.get("input_text")
        instructor_prompt = ChatPromptTemplate.from_messages(feedback_agent_messages)
        instructor_agent = instructor_prompt | llm | string_parser
        instructions = instructor_agent.invoke({"input": input, "extracted_claims":formatted_extraction})
        return {"reinforcer_notes": instructions}                                                   

    def judge_happy(state):
        judges_verdict = state.get("judge_output")
        judges_verdict = judges_verdict.strip().replace("\n", "").lower()
        if judges_verdict == 'yes':
            return 'continue'
        return 'feedback'


    def google_search_agent(state):
        """
        AgentType
        """
        if state["claims_to_check"] is not None:
            claims_to_check = state["claims_to_check"] 
        else:
            claims_to_check = state["input_text"]

        if 'list' not in str(type(claims_to_check)).lower():
            claims_to_check = [claims_to_check]

        responses = []
        for clm in claims_to_check:
            print('checking',clm)
            responses.append(google_search_agent_call(stuff=clm, llm=llm, baseline=baseline))

        return {"responses": responses} 

    def print_output(state):
        responses = state.get("responses")

        return {"final_output": responses}

    graph = StateGraph(AgentState)

    # Define nodes in our graph

    graph.add_node("input", input)
    graph.add_node("claims_agent", extract_claims)
    graph.add_node("judge_agent", extract_judgement)
    graph.add_node("feedback_agent", feedback_on_incorrect_extractions)
    graph.add_node("search_agent", google_search_agent)
    graph.add_node("output", print_output)

    graph.add_conditional_edges(
        "input", 
        is_one_sentence, 
        {
            "go_to_search": "search_agent",
            "extract_claims_from_text": "claims_agent"
        }
    )
    graph.add_edge('claims_agent', 'judge_agent')
    graph.add_conditional_edges(
        "judge_agent",
        judge_happy,
        {
            "feedback": "feedback_agent",
            "continue": "search_agent"
        }
    )
    graph.add_edge('feedback_agent', 'claims_agent')
    graph.add_edge("search_agent", "output")
    graph.add_edge('output', END)

    graph.set_entry_point("input")

    verify = graph.compile()

    inputs = {"input_text": text}

    # for output in verify.stream(inputs):
    # # stream() yields dictionaries with output keyed by node name
    #     for key, value in output.items():
    #         print(f"Output from node '{key}':")
    #         print("---")
    #         print(value)
    #     print("\n---\n")

    return verify.invoke(inputs)


def google_search_agent_call(stuff, llm, study_mode=False, baseline=False):
    """
    Google search agent
    control_ouputs = None or a list of user defined outputs [example: user wants the llm to judge based on [yes, no, maybe, sometimes]]
    """

    examples = """
        The user query is : "The sky is blue". The output JSON is : 
        {
        "statement": "The sky is blue.",
        "judgement": "True",
        "justification": "The sky appears blue to the human eye, as it reflects the blue light of the atmosphere.",
        }, 

        The user query is : "Joseph Biden is the 46th president of the United States of America." The output JSON is :
        {
        "statement": "Joseph Biden is the 46th president of the United States of America.",
        "judgement": "True",
        "justification": "The White House website lists Joseph R. Biden Jr as the 46th president of the United States.",
        }, 

        The user query is : "Donald Trump won the 2020 United States Presidential Election." The output JSON is :
        {
        "statement": "Donald Trump won the 2020 United States Presidential Election.",
        "judgement": "False",
        "justification": "Donald Trump lost to Joe Biden in the 2020 presidential election by a count of 232 electoral votes to 306 for Joe Biden",
        },
    """
    
    class Judgement(BaseModel):
            
            statement: str = Field(description="The question or statement we are verifying.")
            judgement: str = Field(description="The judgement of the veracity of the question or statement.")
            justification: str = Field(description="The justification of the answer we have given.")

    json_parser = JsonOutputParser(pydantic_object=Judgement)
    string_parser = StrOutputParser()

    # Start timer
    import time
    start_time = time.time()

    if baseline:

        info = ''
        tool = None

    else:

        # set up google search
        info = ''
        tool = None
    
        search = GoogleSearchAPIWrapper()
        
        def top10_results(query):
            return search.results(query, 10)

        tool = Tool(
            name="google_search",
            description="Search Google for top 10 links.",
            func=top10_results,
        )

        # try:
        info = tool.invoke(stuff)
        # except:
        #     info = ''
    
    
    prompt = ChatPromptTemplate.from_messages([
            ("system","To answer the user query, provide a judgement on the truthfulness."),
            ("system","Use as few words as possible."),
            ("system","The correct format is a JSON: {format_instructions}"),
            ("system","The JSON output requires keys to include 'statement', 'judgement', 'justification'."),
            ("system","Here are some examples of the correct output: {example_output}"),
            ("system","Information from google search for context: {information}"),
            ("user","The user query is: {query}"),
            ("system","The output JSON is :")
    ])


    # f = open("temp.txt", "w")
    # f.writelines(str(info))
    # f.close()

    # print(info)
    try:
        search_agent = prompt | llm | json_parser
        res = search_agent.invoke(
            {"query":stuff, "information":info, "format_instructions":json_parser.get_format_instructions(), "example_output": examples}
        )

        if study_mode:
            print('First JSON output parser\n',res)

    except:

        if study_mode:
            print('JSON output parser failed.')

        search_agent = prompt | llm | string_parser
        res = search_agent.invoke(
            {"query":stuff, "information":info, "format_instructions":json_parser.get_format_instructions(), "example_output": examples}
        )

        if study_mode:
            print('First string output parser\n',res)

    # We have the result, now we need to convert into a dictionary object
    old_res = res
    if 'dict' not in str(type(res)).lower():
        if study_mode:
            print('Type was not a dictionary')
        try:
            res = eval('{'+res[res.find("{")+1:res.find("}")]+'}')
            
        except:
            if study_mode:
                print('Failed 1')   
            # Need to also consider the case of a nested dictionary, in which case this won't work.
            try:
                res = eval('{'+res[res.find("{")+1:res.find("}")]+'}'+'}') 
            except:
                
                try:
                    res = eval('{'+res[res.find("{")+1:res.find("}")]+'}]}')    
                except:
                    res = {}
    if study_mode:
        print(type(res))
    if 'dict' not in str(type(res)).lower():
        res = {}


    if ('statement' not in res.keys() or 'judgement' not in res.keys() or 'justification' not in res.keys()):

        max_tries = 6
        m=1
        new_res = None
        while (m < max_tries) and ('statement' not in res.keys() or 'judgement' not in res.keys() or 'justification' not in res.keys()):
            
            if study_mode:
                print(m)
            
            if len(res) == 0:
                last_res = old_res
            else:
                last_res = res


            new_prompt = ChatPromptTemplate.from_messages([
                ("system","Your original answer is missing key information required in the JSON output"),
                ("system","The JSON output requires keys to include 'statement', 'judgement', 'justification'"),
                ("system","The previous output was {last_result}"),
                ("system","Here are some examples of the correct output: {example_output}"),
                ("system","The correct format is a JSON: {format_instructions}"),
                ("user","The query to judge validity is:{query}"),
                ("user","The search information is:{info}"),
            ])

            try:
                search_agent = new_prompt | llm | json_parser
                new_res = search_agent.invoke(
                    {"query":stuff, "info":info, "format_instructions":json_parser.get_format_instructions(), "example_output": examples, "last_result":last_res}
                    )

                if study_mode:
                    print(new_res)

            except:
                search_agent = new_prompt | llm | string_parser
                new_res = search_agent.invoke(
                    {"query":stuff, "info":info, "format_instructions":json_parser.get_format_instructions(), "example_output": examples, "last_result":last_res}
                )

                if study_mode:
                    print(new_res)


            if 'dict' not in str(type(new_res)).lower():
                try:
                    new_res = eval('{'+new_res[new_res.find("{")+1:new_res.find("}")]+'}')
                except:
                    try:
                        new_res = eval('{'+new_res[new_res.find("{")+1:new_res.find("}")]+'}'+'}')
                    except:
                        old_res = new_res
                        new_res = {}

            if study_mode:
                print(new_res)

            if 'statement' in new_res.keys():
                if new_res['statement'] == '':
                    new_res.pop('statement')

            if 'justification' in new_res.keys():
                if new_res['justification'] == '':
                    new_res.pop('justification')

            if 'judgement' in new_res.keys():
                if new_res['judgement'] == '':
                    new_res.pop('judgement')

            if 'source' in new_res.keys():
                if new_res['source'] == '':
                    new_res.pop('source')

            if len(new_res) > 0:
                old_res = new_res
                res = new_res

            m+=1
        
    time_taken = time.time() - start_time

    res['process_time'] = time_taken
    res['information'] = str(info)
    res['message'] = "Successfully returned judgement"

    return res
        


In [2]:
text = """

The economic news in 2023 was almost miraculously good. Not only did America’s economy defy widespread predictions of recession, it also defied claims that only a significant rise in unemployment could bring inflation under control. Instead, we got a combination of strong growth, unemployment near a 50-year low and plunging inflation.

But last week, the Bureau of Labor Statistics reported that both the Consumer Price Index and the Producer Price Index rose 0.3 percent in January, more than most analysts expected. And the usual suspects — inflation perma-bears, political enemies of the Biden administration and economists who wrongly predicted that disinflation would require mass unemployment — jumped on the data as if it were a fumbled football.

So, are the good times over?

No. Everything we know suggests that those disappointing numbers were mostly a statistical blip rather than marking a significant worsening in inflation trends.

Before I explain how such blips can happen, let me tell you what indicators I was looking at after the inflation reports.

First, I was looking at financial markets, where instruments like inflation swaps and index bonds tell you what inflation rates investors putting real money on the line expect. The pricing on these instruments is still pointing to low inflation, around 2 percent or a bit more.

Second, I was waiting to see what happened in the Atlanta Federal Reserve’s survey of business inflation expectations, which asks businesses how much they expect costs to rise over the next year. If inflation were suddenly surging, you’d expect businesses to notice. But their inflation expectations rose to 2.3 percent in February from … 2.2 percent in January.

But if nothing much has changed, where did those slightly scary B.L.S. numbers come from?

In principle, the government estimates overall consumer prices the same way the American Farm Bureau Federation estimates the price of a classic Thanksgiving dinner (which was, by the way, down 4.5 percent in 2023): it calculates the cost of buying a fixed basket of goods and services.

In practice, our economy is a lot more complicated than a standardized holiday dinner menu, and estimating inflation involves a lot of fancy statistical footwork. The B.L.S. is extremely competent and professional — in fact, one rarely heralded policy advantage the United States has over other countries is that we generally have better data. But while I have nothing but praise for the bureau, its reports can still sometimes be misleading, for several reasons.

One reason is that to make sense of monthly data, you need to adjust for seasonal factors. Some of these factors are obvious: fresh vegetables get more expensive in the winter, cheaper in the summer. Others are less obvious. Goldman Sachs, which correctly predicted a bump in official inflation, points out that there is a “January effect” on prices, because many companies raise their prices at the beginning of the year. And Goldman argued, in advance, that the official numbers wouldn’t be sufficiently adjusted to reflect this effect, leading to a spurious bump in measured inflation — a bump that will vanish in the months ahead.

Goldman also pointed out that the single largest component in the Consumer Price Index — 27 percent of the basket! — is a price nobody actually pays: owners’ equivalent rent, an estimate of what homeowners would be paying if they rented their houses. There are reasons the bureau measures housing costs this way, but there are also reasons to believe that in recent years that number has become misleading, distorting and exaggerating estimates of overall inflation. As it happens, the B.L.S. also produces an estimate of prices excluding owners’ equivalent rent, roughly matching the way European countries measure inflation. This “harmonized” index is up only 2.3 percent over the past year.

If you find all of this a bit mind-numbing, let me tell you a secret — so do I, even though this is supposed to be my field. But the bottom line is important: Despite some disappointing numbers last week, the basic narrative hasn’t changed. The U.S. economy continues to look like an amazing success story.

Saying this leads, of course, to pushback from Republicans who’ve claimed ad nauseam that Biden’s “socialist” policies would be a disaster — and as I recently wrote, for such people believing is seeing, so they continue to insist that the economy is terrible even when by all objective measures, it’s doing pretty well. You also get some pushback from people on the left, who apparently believe that a progressive president shouldn’t be allowed to tout policy successes until he has completely eliminated poverty and insecurity — that is, never.

The fact, however, is that Biden has put in place a very ambitious agenda — major enhancements of Obamacare, student debt relief, big infrastructure spending, large-scale promotion of semiconductors and green energy that have led to a surge in manufacturing investment. Many voices warned that he was overreaching, that the economy would pay a big price.

But it hasn’t. It turns out that we can, in fact, afford to do a lot to improve Americans’ lives and invest in the future.


"""

In [1]:
# text = 'COVID-19 misinformation cost at least 2,800 lives and $300M, new reports says.'


out = run_claim_judge_pipeline(
    text=text, search_version=0, one_claim=False, llm_to_use = 'mistral', baseline=False, study_mode=True
    )

# print(out)

In [4]:
type(out)

dict

In [10]:
out.keys()

dict_keys(['input_text', 'claims_to_check', 'judge_output', 'responses', 'final_output'])

In [11]:
out['claims_to_check']

['1. The economic news in 2023 was almost miraculously good with strong growth',
 'unemployment near a 50-year low',
 'and plunging inflation.',
 '2. Inflation figures for January were higher than expected',
 'but this is mostly a statistical blip rather than a significant worsening of inflation trends.',
 '3. Financial markets indicate that investors still expect low inflation rates around 2 percent or slightly more.',
 "4. The Atlanta Federal Reserve's survey of business inflation expectations rose to 2.3 percent in February from 2.2 percent in January.",
 '5. Seasonal factors and the way housing costs are estimated can sometimes make inflation figures misleading',
 'leading to temporary spikes in measured inflation.',
 '6. Despite some disappointing numbers',
 "the U.S. economy continues to look like an amazing success story with Biden's progressive agenda not causing the economic harm critics predicted."]

In [8]:
import json
with open('result.json', 'w') as fp:
    json.dump(out, fp)

In [3]:
import pprint

pprint.pp(out['final_output'])

[{'statement': 'COVID-19 misinformation cost at least 2,800 lives and $300M, '
               'new report says.',
  'judgement': 'True',
  'justification': 'Recent reports have indicated that the spread of COVID-19 '
                   'misinformation in Canada has cost at least 2800 lives and '
                   '$300 million in hospital expenses over nine months.',
  'process_time': 23.405393838882446,
  'information': "[{'title': 'COVID-19 misinformation cost at least 2,800 "
                 "lives and $300M, new ...', 'link': "
                 "'https://www.cbc.ca/news/politics/cost-of-covid-19-misinformation-study-1.6726356', "
                 "'snippet': 'Jan 27, 2023 ... The spread of COVID-19 "
                 'misinformation in Canada cost at least 2800 lives and $300 '
                 'million in hospital expenses over nine months of '
                 "the\\xa0...'}, {'title': 'COVID misinformation led to at "
                 "least 2800 deaths in Canada, $300M ...', 

### Assess Accuracy against the politifact dataset

In [2]:
# Data has been previously web-scraped and is available in JSON format

import json
import pandas as pd

# path = ''
# fname = 'politifact_3000.json'

# f = open(path+fname)
# data = json.load(f)

# politifact_scraped_data = pd.DataFrame()

# j=0
# for i in data:
#     temp = {}
#     # if j==0:
#     for key in i.keys():
#         temp[key] = str(i[key])

#     politifact_scraped_data = pd.concat([politifact_scraped_data, pd.DataFrame(data=temp, index=[j])])
#     j+=1
 
# f.close()

# politifact_scraped_data.to_excel('politifact_scraped_data.xlsx')

politifact_scraped_data = pd.read_excel('politifact_scraped_data.xlsx')
# sampled_data = pd.read_excel('sampled_subset_of_poltifact_data.xlsx')
previous_judged = pd.read_excel('sampled_data_merged.xlsx')

In [5]:
politifact_scraped_data.head(2)

Unnamed: 0,index,claim,claim_source,review_date,review_author,veracity,review_tags,review_points,review_article,review_url
0,0,Michael Moore is supporting Trump in the 2024 ...,Instagram posts,"March 4, 2024",Sofia Ahmed,false,"['Elections', 'Pop Culture', 'Instagram posts']","['In a longer video of the 2016 documentary ""M...","Michael Moore, a liberal filmmaker, did not an...",https://www.politifact.com/factchecks/2024/mar...
1,1,The 2022 CHIPS and Science Act “attracted $640...,Joe Biden,"February 29, 2024",Louis Jacobson,barely-true,"['National', 'Science', 'Technology', 'Joe Bid...",['A Semiconductor Industry Association analysi...,"Ahead of his 2024 State of the Union address, ...",https://www.politifact.com/factchecks/2024/feb...


In [6]:
politifact_scraped_data['veracity'].value_counts()

veracity
false          1781
pants-fire      539
barely-true     261
half-true       173
mostly-true     157
true             71
full-flop        12
half-flip         6
Name: count, dtype: int64

In [7]:
politifact_scraped_data = politifact_scraped_data[~politifact_scraped_data['veracity'].isin(['full-flop','half-flip'])]

In [8]:
# replace later
saved_dataframe = previous_judged

max_n = int(100 / 6)

samples = []
for veracity, veracity_data in politifact_scraped_data.groupby('veracity'):

    veracity_data = veracity_data[~veracity_data['index'].isin(saved_dataframe['index'])]

    if len(veracity_data) > 0:
        samples.append(veracity_data.sample(n=max_n, replace=True))

In [9]:
sampled_data = pd.concat(samples)
sampled_data.describe()

Unnamed: 0,index
count,96.0
mean,1649.166667
std,778.053613
min,89.0
25%,1110.5
50%,1740.5
75%,2239.75
max,2992.0


In [11]:
sampled_data.sort_values(by='index').head()

Unnamed: 0,index,claim,claim_source,review_date,review_author,veracity,review_tags,review_points,review_article,review_url
89,89,"“Wisconsin had over 1,400 opioid overdose deat...",Tammy Baldwin,"February 6, 2024",Laura Schulte,true,"['Drugs', 'Public Health', 'Wisconsin', 'Tammy...",['The number of people who died from opioids i...,The number of opioid deaths has been steadily ...,https://www.politifact.com/factchecks/2024/feb...
165,165,"Gov. Tony Evers and Democrats ""rejected our (I...",Robin Vos,"December 22, 2023",Vanessa Swales,mostly-true,"['Redistricting', 'States', 'Iowa', 'Wisconsin...",['Wisconsin Legislative Reference Bureau says ...,The State of Wisconsin’s redistricting process...,https://www.politifact.com/factchecks/2023/dec...
195,195,Wisconsin Republicans are backing “a nonpartis...,Ron Tusler,"December 8, 2023",D.L. Davis,half-true,"['Redistricting', 'States', 'Iowa', 'Wisconsin...",['The Iowa model refers to a redistricting met...,Iowa has played an outsized role in presidenti...,https://www.politifact.com/factchecks/2023/dec...
225,225,"In Florida, “any woman who has an abortion aft...",Gavin Newsom,"November 29, 2023",Amy Sherman,half-true,"['Abortion', 'Crime', 'California', 'Florida',...",['Florida Gov. Ron DeSantis signed a bill in A...,California’s Democratic Gov. Gavin Newsom and ...,https://www.politifact.com/factchecks/2023/nov...
252,252,U.S. Rep. Mike Johnson is the 45th speaker of ...,Facebook posts,"November 3, 2023",Ciara O'Rourke,false,"['Facebook Fact-checks', 'Facebook posts']",['U.S. Rep. Mike Johnson is the 56th speaker o...,"U.S. Rep. Mike Johnson, R-La., was elected sp...",https://www.politifact.com/factchecks/2023/nov...


In [2]:
# for i,row in sampled_data.iterrows():
#     print(row['index'])
#     print('>',row['claim'])
#     print('')

In [12]:
previous_judged.sort_values(by='index').head()

Unnamed: 0.2,Unnamed: 0.1,index,claim,claim_source,review_date,review_author,veracity,review_tags,review_points,review_article,review_url,Unnamed: 0,judgement,API,judgement_answer,sources
0,0,30,“Marijuana is currently classified in the same...,Kirsten Gillibrand,"February 20, 2024",Conor Amendola,true,"['Drugs', 'Public Health', 'New York', 'Kirste...","['Sen. Kirsten Gillibrand, D-N.Y., is correct ...","When Sen. Kirsten Gillibrand, D-N.Y., shared a...",https://www.politifact.com/factchecks/2024/feb...,27,[{'statement': 'Marijuana is currently classif...,0,True,"[{'title': 'Yes, the federal government still ..."
1,1,44,President Joe Biden “directed New York AG Witc...,Donald Trump,"February 20, 2024",Louis Jacobson,false,"['National', 'Legal Issues', 'New York', 'Dona...",['Letitia James said during her 2018 campaign ...,Following a $355 million court ruling against ...,https://www.politifact.com/factchecks/2024/feb...,23,[{'statement': 'President Joe Biden “directed ...,0,False,"[{'title': ""Fact-check: Trump's baseless claim..."
2,2,65,"""(The) top issue for college students is the e...",Scott Walker,"February 2, 2024",Hope Karnopp,true,"['Abortion', 'Economy', 'Education', 'Wisconsi...",['Polling commissioned by Young America’s Foun...,"In the lead-up to the 2024 election, president...",https://www.politifact.com/factchecks/2024/feb...,25,[{'statement': '(The) top issue for college st...,0,True,"[{'title': ""PolitiFact: Most college voters' N..."
3,3,105,Donald Trump “proposed when he was president” ...,Nikki Haley,"January 19, 2024",Louis Jacobson,barely-true,"['New Hampshire', 'Infrastructure', 'Taxes', '...","['While president, Donald Trump expressed luke...","ROCHESTER, N.H. — In the Republican presidenti...",https://www.politifact.com/factchecks/2024/jan...,0,[{'statement': 'Donald Trump “proposed when he...,0,False,[{'title': 'Nikki Haley said Donald Trump want...
4,4,156,"Jimmy Kimmel was on the Jeffrey Epstein list, ...",X posts,"January 4, 2024",Jeff Cercone,pants-fire,"['National', 'Criminal Justice', 'Facebook Fac...",['Comedian and TV host Jimmy Kimmel was not me...,The first 40 of more than 200 court documents ...,https://www.politifact.com/factchecks/2024/jan...,7,[{'statement': 'Jimmy Kimmel was on the Jeffre...,0,False,"[{'title': 'Jimmy Kimmel on X: ""Dear Aasshole:..."


In [19]:
sampled_data = sampled_data.drop_duplicates(subset=['claim']).reset_index(drop=True)

In [20]:
sampled_data['veracity'].value_counts()

veracity
false          16
pants-fire     16
barely-true    15
mostly-true    15
true           15
half-true      14
Name: count, dtype: int64

In [22]:
# # Provide judgement
# sampled_data = pd.read_csv('politifact_judged_v1.csv')

# def run_function_wrapper(x, apiv):
#     try:
#         out = run_claim_judge_pipeline(
#             text=x['claim'], one_claim=True, search_version=apiv, llm_to_use = 'mistral', outputs = None, baseline=False, study_mode=True
#         )
#         return out
#     except Exception as e:
#         print(e)
#         return None

# # sampled_data['jugdement_api_v4'] = sampled_data.apply(lambda x: run_function_wrapper(x, apiv=4), axis=1)
# sampled_data['jugdement_api_v0'] = sampled_data.apply(lambda x: run_function_wrapper(x, apiv=0), axis=1)

In [27]:

#sampled_data_merged = pd.read_excel('sampled_data_merged.xlsx')
# sampled_data[(~sampled_data['index'].isin(indxs)) & (~sampled_data['index'].isin(spanish))][['index','claim']].to_excel('test.xlsx')

In [14]:
spanish = [126, 85, 2983, 2504, 2600]
indxs = []
res_v0= []

for i,row in sampled_data[(~sampled_data['index'].isin(indxs)) & (~sampled_data['index'].isin(spanish))][['index','claim']].iterrows():

    try:
        out_v0 = run_claim_judge_pipeline(
                text=row['claim'], one_claim = True, search_version=0, llm_to_use = 'mistral', study_mode=False
        )
        indxs.append(row['index'])
        res_v0.append(out_v0['final_output'])
    except Exception as e:
        print(e)
        indxs.append(row['index'])
        res_v0.append(None)

judged_politifact_v0 = pd.DataFrame(data={'index': indxs, 'judgement': res_v0, 'API': 0})
judged_politifact_v0.head()

judged_politifact_v0.to_excel('judged_subset_of_politifact_data_part4.xlsx')

checking Says Tim Michels' family foundation "funded an organization that tracks women when they get near abortion clinics … he wants to treat women like they're the criminals."
checking Raphael Warnock "believes in no cash bail" and "letting people out of prison that are going to make us prisoners in our own home."
checking Cheri Beasley “backs tax hikes — even on families making under $75,000.”
checking Says Gov. Tony Evers would, “allow the government to use red flag laws to confiscate your firearms without due process.”
checking “Mandela Barnes came out in favor of abortion up until the moment of birth.”
checking "38,000 Texans had their license to carry denied, revoked, or suspended over the last five years because law enforcement deemed them too dangerous to carry a loaded gun in public. But thanks to Greg Abbott's new law, they don't need a license to carry anymore."
checking "Kari Lake has been appearing at rallies with neo-Nazis," while "two federal juries have found" Katie Ho

### Evaluate the models
- Must run and save to a list so that when the google API runs out of credits, the information is saved

In [7]:
sampled_data = pd.read_excel('sampled_data_merged.xlsx')
print(len(sampled_data.drop_duplicates(subset='claim')))

206


In [8]:
indxs_apiv5 = []
res_apiv5 = []

spanish = [126, 85, 2983, 2504, 2600]
for i,row in sampled_data[(~sampled_data['index'].isin(indxs_apiv5)) & (~sampled_data['index'].isin(spanish))][['index','claim']].iterrows():
    
    print(i)
    try:
        out_v0 = run_claim_judge_pipeline(
                text=row['claim'], one_claim = True, search_version=5, llm_to_use = 'mistral', study_mode=False, baseline=False
        )
        indxs_apiv5.append(row['index'])
        res_apiv5.append(out_v0['final_output'])
    except Exception as e:
        print(e)
        indxs_apiv5.append(row['index'])
        res_apiv5.append(None)

judged_politifact_apiv5 = pd.DataFrame(data={'index': indxs_apiv5, 'judgement': res_apiv5, 'API': 5})
judged_politifact_apiv5.head()

judged_politifact_apiv5.to_excel('judged_subset_of_politifact_data_apiv5.xlsx')

0
checking “Marijuana is currently classified in the same category of drugs as heroin and a more dangerous category than fentanyl or cocaine.”
1
checking President Joe Biden “directed New York AG Witch Hunt” into Donald Trump’s real estate.
2
checking "(The) top issue for college students is the economy."
3
checking “Wisconsin had over 1,400 opioid overdose deaths in 2022.”
4
checking Donald Trump “proposed when he was president” that “he wanted to raise the gas tax up to 25 cents."
5
checking Jimmy Kimmel was on the Jeffrey Epstein list, documents show.
6
checking “Right now, over 70% of federal employees are still working from home three years after COVID."
7
checking Gov. Tony Evers and Democrats "rejected our (Iowa model redistricting) proposal to enact the very plan they originally endorsed."
8
checking Wisconsin Republicans are backing “a nonpartisan redistricting plan based off the Iowa Model. ... Republicans, Democrats, and the Governor pushed this plan last time redistricting 

In [None]:
# judged_politifact_apiv5 = pd.DataFrame(data={'index': indxs_apiv5, 'judgement': res_apiv5, 'API': 'baseline-mistral7b'})
# judged_politifact_apiv5.head()

# judged_politifact_apiv5.to_excel('judged_subset_of_politifact_data_apiv5.xlsx')

In [21]:
indxs_baseline_m7 = []
res_baseline_m7 = []

for i,row in sampled_data.iterrows():
    
    print(i)
    try:
        out_v0 = run_claim_judge_pipeline(
                text=row['claim'], one_claim = True, search_version=0, llm_to_use = 'mistral', study_mode=False, baseline=True
        )
        indxs_baseline_m7.append(row['index'])
        res_baseline_m7.append(out_v0['final_output'])
    except Exception as e:
        print(e)
        indxs_baseline_m7.append(row['index'])
        res_baseline_m7.append(None)

judged_politifact_baseline_m7 = pd.DataFrame(data={'index': indxs_baseline_m7, 'judgement': res_baseline_m7, 'API': 'baseline-mistral7b'})
judged_politifact_baseline_m7.head()

judged_politifact_baseline_m7.to_excel('judged_subset_of_politifact_data_mistral7b_baseline.xlsx')

0
checking “Marijuana is currently classified in the same category of drugs as heroin and a more dangerous category than fentanyl or cocaine.”
1
checking President Joe Biden “directed New York AG Witch Hunt” into Donald Trump’s real estate.
2
checking "(The) top issue for college students is the economy."
3
checking “Wisconsin had over 1,400 opioid overdose deaths in 2022.”
4
checking Donald Trump “proposed when he was president” that “he wanted to raise the gas tax up to 25 cents."
5
checking Jimmy Kimmel was on the Jeffrey Epstein list, documents show.
6
checking “Right now, over 70% of federal employees are still working from home three years after COVID."
7
checking Gov. Tony Evers and Democrats "rejected our (Iowa model redistricting) proposal to enact the very plan they originally endorsed."
8
checking Wisconsin Republicans are backing “a nonpartisan redistricting plan based off the Iowa Model. ... Republicans, Democrats, and the Governor pushed this plan last time redistricting 

In [22]:
indxs_baseline_llama2 = []
res_baseline_llama2 = []

for i,row in sampled_data.iterrows():
    
    print(i)
    try:
        out_v0 = run_claim_judge_pipeline(
                text=row['claim'], one_claim = True, search_version=0, llm_to_use = 'llama2', study_mode=False, baseline=True
        )
        indxs_baseline_llama2.append(row['index'])
        res_baseline_llama2.append(out_v0['final_output'])
    except Exception as e:
        print(e)
        indxs_baseline_llama2.append(row['index'])
        res_baseline_llama2.append(None)

judged_politifact_baseline_llama2 = pd.DataFrame(data={'index': indxs_baseline_llama2, 'judgement': res_baseline_llama2, 'API': 'baseline-llama2'})
judged_politifact_baseline_llama2.head()

judged_politifact_baseline_llama2.to_excel('judged_subset_of_politifact_data_llama2_baseline.xlsx')

0
checking “Marijuana is currently classified in the same category of drugs as heroin and a more dangerous category than fentanyl or cocaine.”
1
checking President Joe Biden “directed New York AG Witch Hunt” into Donald Trump’s real estate.
2
checking "(The) top issue for college students is the economy."
3
checking “Wisconsin had over 1,400 opioid overdose deaths in 2022.”
4
checking Donald Trump “proposed when he was president” that “he wanted to raise the gas tax up to 25 cents."
5
checking Jimmy Kimmel was on the Jeffrey Epstein list, documents show.
6
checking “Right now, over 70% of federal employees are still working from home three years after COVID."
7
checking Gov. Tony Evers and Democrats "rejected our (Iowa model redistricting) proposal to enact the very plan they originally endorsed."
8
checking Wisconsin Republicans are backing “a nonpartisan redistricting plan based off the Iowa Model. ... Republicans, Democrats, and the Governor pushed this plan last time redistricting 