In [11]:
# Data Processing
import pandas as pd
import os
import csv
import json
from tqdm import tqdm
from openai import OpenAI
import tiktoken


import dotenv

dotenv.load_dotenv()

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")

client = OpenAI()


In [5]:
election_emails_df = pd.read_csv("../data/2020_Election_Emails_with_common_sense.csv")

news_articles_df = pd.read_csv("../inputs/samar_common_sense.tsv", sep="\t")

In [32]:
news_articles_df.tail()

Unnamed: 0.1,Unnamed: 0,href,source,min(a.target_time),trafilatura
2879,456378,/world/scotland-police-blasted-report-describi...,foxnews,2022-12-31 21:14:38,Scotland police blasted for report describing ...
2880,456544,/world/south-korea-worries-about-the-return-of...,washingtonpost,2018-03-23 14:03:35,BRUSSELS — President Trump’s decision to make ...
2881,456828,/world/talks-between-taliban-insurgents-and-pr...,washingtonpost,2019-07-09 11:22:42,KABUL — A marathon two-day meeting between Tal...
2882,457381,/world/the_americas/mexico-prepares-to-absorb-...,washingtonpost,2017-03-04 02:07:18,MEXICO CITY — The deportees stepped off their ...
2883,457546,/world/the_americas/us-canada-border-hyder-ste...,washingtonpost,2020-09-18 00:20:10,The mining towns of Hyder and Stewart form one...


In [17]:
def extract_commonsense(text):
    system_prompt = (
        f"You are a bot tasked with extracting population level common sense statements from text. "
        f"The statements you extract, should be considered common sense by general public. So in other words, most people should agree with such statement being common sense. "
        f"If the statements you find, are not considered common sense by a portion of the population, do not include them in your answer. "
        f"Try to return only one statement from the provided text. You return the statements you find in a JSON list with only the statements. "
        f"Do not include the tags for JSON formatting in your output."


        'when the auther uses the word "common sense" in the text, what are they implying is commonsensical.'
    )

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        temperature=0,
        top_p=1,
        # response_format={"type": "json_object"},
    )

    return response

In [18]:
news_articles_df.trafilatura.values[:5]

array(['PARIS — Immediately upon unveiling its new cover — a depiction of Muhammad — the French satirical newspaper Charlie Hebdo on Tuesday reignited the debate pitting free speech against religious sensitivities that has embroiled Europe since 12 people were killed during an attack on its Paris offices by Muslim extremists a week ago.\\nThe cover shows the bearded prophet shedding a tear and holding up a sign saying, “I am Charlie,” the rallying cry that has become synonymous with support of the newspaper and free expression. Above the cartoon on a green background is the headline “All is forgiven.”\\nWhile surviving staff members, at an emotional news conference, described their choice of cover as a show of forgiveness, most Muslims consider any depiction of their prophet to be blasphemous. Moreover, interpretations quickly swirled around the Internet that the cartoon also contained disguised crudity.\\nOne of Egypt’s highest Islamic authorities warned that the cartoon would exacerb

In [23]:
enc = tiktoken.get_encoding("cl100k_base")


answers = []

for emails in tqdm(news_articles_df.trafilatura.values[:100]):
    # print(len(enc.encode(emails)))
    gpt_res = extract_commonsense(emails)
    answers.append(gpt_res)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [03:37<00:00,  2.17s/it]


In [33]:
data = []
for i, a in enumerate(answers):
    email_text = news_articles_df.trafilatura.values[i]
    extracted_content = json.loads(a.choices[0].message.content)

    # Calculate cost
    prompt_tokens = a.usage.prompt_tokens
    completion_tokens = a.usage.completion_tokens
    cost_of_prompt = (prompt_tokens / 1000) * 0.01
    cost_of_completion = (completion_tokens / 1000) * 0.03
    total_cost = cost_of_prompt + cost_of_completion

    data.append(
        [
            email_text,
            json.dumps(extracted_content),
            prompt_tokens,
            completion_tokens,
            cost_of_prompt,
            cost_of_completion,
            total_cost,
        ]
    )


columns = [
    "news",
    "statements",
    "prompt_tokens",
    "completion_tokens",
    "prompt_cost",
    "completion_cost",
    "total_cost",
]


df = pd.DataFrame(data, columns=columns)

df#.to_csv( "../extracted_statements/news_articles/extracted_meta_data.csv", index=False)

Unnamed: 0,news,statements,prompt_tokens,completion_tokens,prompt_cost,completion_cost,total_cost
0,PARIS — Immediately upon unveiling its new cov...,"[""Most people consider any depiction of their ...",2025,14,0.02025,0.00042,0.02067
1,Republican governors across the nation are pro...,"[""There is little evidence that cutting the in...",1978,19,0.01978,0.00057,0.02035
2,"KHAN YOUNIS, Gaza Strip — He said he fell for ...","[""Romance is not among the humanitarian reason...",1714,23,0.01714,0.00069,0.01783
3,The Palestinian Authority and the Palestine Li...,"[""Money is oxygen for terrorism.""]",1359,7,0.01359,0.00021,0.01380
4,"On Friday evening, Boris Nemtsov, a Russian op...","[""Too much power in the hands of one man can l...",2090,15,0.02090,0.00045,0.02135
...,...,...,...,...,...,...,...
95,During his Sunday night interview with Charlie...,[],1466,1,0.01466,0.00003,0.01469
96,"Over the past decade, advertisers have spent u...","[""Teenagers being teenagers, the room was full...",3716,15,0.03716,0.00045,0.03761
97,"WASHINGTON — For the final time, President Oba...","[""Most people would agree that a president's S...",1649,26,0.01649,0.00078,0.01727
98,"DENVER — Ken Ivory, a Republican state represe...","[""Many states require that their land be used ...",1613,15,0.01613,0.00045,0.01658


In [34]:
df.total_cost.sum()

2.37098

In [28]:
extracted_statements = []

for i, a in enumerate(answers):
    if len(json.loads(a.choices[0].message.content)) > 0:
        extracted_content = json.loads(a.choices[0].message.content)
        extracted_statements.extend(extracted_content)

with open(
    "../extracted_statements/news_articles/extracted_statements.csv",
    "w",
    newline="",
    encoding="utf-8",
) as file:
    writer = csv.writer(file)
    writer.writerow(["statements"])  # Header for the column
    for statement in extracted_statements:
        writer.writerow([statement])

In [30]:

markdown_output = "## Extracted Statements:\n\n"
for i, a in enumerate(answers):
    if len(json.loads(a.choices[0].message.content)) > 0:
        markdown_output += f"### News {i + 1}\n"
        markdown_output += f"{news_articles_df.trafilatura.values[i]}\n\n"
        markdown_output += "Extracted statements:\n"
        extracted_content = json.loads(a.choices[0].message.content)
        for statement in extracted_content:
            markdown_output += f"- **{statement}**\n"
        markdown_output += "\n"

with open("../extracted_statements/news_articles/extracted_answers.md", "w") as file:
    file.write(markdown_output)

In [14]:

statements_df = pd.read_csv('../data/statements.csv')
statements_list = []

for index, row in statements_df.head(50).iterrows():
    statement_dict = {'index': index, 'statement': row['fixed statement']}
    statements_list.append(statement_dict)

    print('|', row['fixed statement'], '|', ss[index]['normative_statement'], '|')




| 1 plus 1 equals 2. | It is commonly accepted that 1 plus 1 should equal 2. |
| 5 is significantly larger than 1. | In general perspectives, 5 is expected to be significantly larger than 1. |
| To maintain good health, one needs a balanced diet and regular exercise. | One should maintain a balanced diet and regular exercise for good health. |
| A ball is round. | A ball should be round according to conventional definitions. |
| A baton twirler wouldn't want to suffer a broken finger. | A baton twirler should be cautious to avoid suffering a broken finger. |
| A battery cannot provide power indefinitely. | A battery is expected not to provide power indefinitely; it should be replaced or recharged periodically. |
| No one usually wants a bill. | People generally should not desire to receive a bill. |
| A birthday might inspire someone to bake a cake. | One might conventionally bake a cake to celebrate a birthday. |
| A birthday could motivate someone to purchase a gift for a loved one. 

In [13]:
s = """
[{"index": 0, "normative_statement": "It is commonly accepted that 1 plus 1 should equal 2."},
 {"index": 1, "normative_statement": "In general perspectives, 5 is expected to be significantly larger than 1."},
 {"index": 2, "normative_statement": "One should maintain a balanced diet and regular exercise for good health."},
 {"index": 3, "normative_statement": "A ball should be round according to conventional definitions."},
 {"index": 4, "normative_statement": "A baton twirler should be cautious to avoid suffering a broken finger."},
 {"index": 5, "normative_statement": "A battery is expected not to provide power indefinitely; it should be replaced or recharged periodically."},
 {"index": 6, "normative_statement": "People generally should not desire to receive a bill."},
 {"index": 7, "normative_statement": "One might conventionally bake a cake to celebrate a birthday."},
 {"index": 8, "normative_statement": "It is typical to purchase a gift for a loved one on their birthday."},
 {"index": 9, "normative_statement": "Christians are conventionally expected to worship in a building called a church."},
 {"index": 10, "normative_statement": "A cat should typically dislike taking a bath, as per common perception of feline behavior."},
 {"index": 11, "normative_statement": "A cat should not appreciate having its tail pulled, according to standard behavior expectations of pets."},
 {"index": 12, "normative_statement": "A cat is expected not to like to be wet."},
 {"index": 13, "normative_statement": "A cat should prefer not to get wet, aligning with general feline behavior."},
 {"index": 14, "normative_statement": "A cat is classified as a species of feline; this categorization is a norm."},
 {"index": 15, "normative_statement": "It is often said that a celebrity should wear sunglasses to avoid being recognized, having worked hard to become famous."},
 {"index": 16, "normative_statement": "A child would not typically laugh at its mother's ugliness, due to inherent familial bonds."},
 {"index": 17, "normative_statement": "A circle is expected to be round based on its definition."},
 {"index": 18, "normative_statement": "To turn on a computer, one should press its power button."},
 {"index": 19, "normative_statement": "According to scientific understanding, a computer virus should be considered non-living."},
 {"index": 20, "normative_statement": "A couch potato is often encouraged to develop a healthier lifestyle with good physical fitness."},
 {"index": 21, "normative_statement": "A crime is generally understood to prompt a legal trial."},
 {"index": 22, "normative_statement": "It is a saying that a deaf husband and a blind wife may make a harmonious couple."},
 {"index": 23, "normative_statement": "In a constitutional republic, a well-armed lamb should have the right to contest the vote in a decision on dinner, unlike in a direct democracy."},
 {"index": 24, "normative_statement": "If one wishes to share knowledge, teaching others is a commonly endorsed method."},
 {"index": 25, "normative_statement": "Learning a different language is often recommended to gain a new perspective on life."},
 {"index": 26, "normative_statement": "It makes practical sense that a doctor and a farmer would have more combined knowledge than a doctor alone."},
 {"index": 27, "normative_statement": "A dog should not desire fleas, as they are unwanted parasites."},
 {"index": 28, "normative_statement": "It is often asserted that a fanatic is someone who should not be entrenched in their thinking, though often they are."},
 {"index": 29, "normative_statement": "It's commonly thought that some people speak their minds, while others should think before speaking."},
 {"index": 30, "normative_statement": "It is generally acknowledged that a few vices can overshadow many virtues."},
 {"index": 31, "normative_statement": "It is wise not to interfere in others' affairs, as a fool does."},
 {"index": 32, "normative_statement": "Whether a ban on fracking should be imposed or not is subject to public policy debate."},
 {"index": 33, "normative_statement": "It is not uncommon for some freshmen in college to consider dropping out."},
 {"index": 34, "normative_statement": "Having a full bladder typically necessitates finding a way to urinate."},
 {"index": 35, "normative_statement": "A gardener's dog should neither eat the cabbage nor allow anyone else to do so."},
 {"index": 36, "normative_statement": "Homophobia should be considered undesirable for a gay person."},
 {"index": 37, "normative_statement": "The validity of a popular opinion should not necessarily be determined by its popularity."},
 {"index": 38, "normative_statement": "Setting a goal should involve selecting a challenge that also invokes excitement."},
 {"index": 39, "normative_statement": "Many doctors advocate that a good laugh and ample sleep should be considered among the best remedies."},
 {"index": 40, "normative_statement": "A successful logistics corridor should ideally include excellent access to key transportation and customer bases."},
 {"index": 41, "normative_statement": "A good marriage could be thought of as one where partners complement each other's senses, such as between a blind wife and a deaf husband."},
 {"index": 42, "normative_statement": "The value of a good night's sleep should not be undervalued by those eager to maintain health and fitness."},
 {"index": 43, "normative_statement": "Regular exercise is widely recommended as a good method for maintaining health."},
 {"index": 44, "normative_statement": "A grain of sand is recognized as being extremely small."},
 {"index": 45, "normative_statement": "A good boss should not require employees to work overtime without proper compensation."},
 {"index": 46, "normative_statement": "A sociable person typically should not enjoy extended periods of solitude."},
 {"index": 47, "normative_statement": "A hallmark of a happy marriage could be considered a long, ongoing conversation."},
 {"index": 48, "normative_statement": "One cannot persuade a heart to love, just as one should not expect to persuade a stomach to digest food."},
 {"index": 49, "normative_statement": "A hexagon is commonly understood to always have six sides."}]
"""

ss = json.loads(s)