In [1]:
import openai
import json
import numpy as np
from numpy.linalg import norm
from scipy import spatial
import requests

# Read the OpenAI key from the local "api_key" file. The context manager closes
# the handle, and strip() drops a trailing newline that editors commonly add,
# which would otherwise become part of the key.
with open("api_key") as key_file:
    api_key = key_file.read().strip()
openai.api_key = api_key

In [2]:
def save_json(data, filepath: str = r'new_data.json') -> None:
    """Serialize ``data`` as 4-space-indented JSON into ``filepath``.

    The target file is opened in text write mode, so existing content is
    replaced. Nothing is returned.
    """
    with open(filepath, 'w') as out_file:
        json.dump(data, out_file, indent=4)

In [38]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    db_embeddings: list,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
) -> list[tuple[str, float]]:
    """Rank stored document summaries by relatedness to ``query``.

    Args:
        query: Text to embed with the "text-embedding-ada-002" model.
        db_embeddings: Iterable of dict-like records; each must provide a
            "summary" entry and an "embedding" entry.
        relatedness_fn: Callable ``(query_embedding, doc_embedding) -> score``.
            Defaults to cosine similarity (1 - cosine distance).

    Returns:
        A list of ``(summary, relatedness)`` tuples, sorted from most related
        to least. Records with equal scores keep their input order.
    """
    query_embedding_response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    # sorted() is stable even with reverse=True, so this matches an in-place
    # list.sort(reverse=True) on the same pairs.
    return sorted(
        (
            (doc_data["summary"], relatedness_fn(query_embedding, doc_data["embedding"]))
            for doc_data in db_embeddings
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

In [30]:
# Load the precomputed article embeddings; the context manager closes the file
# handle that a bare json.load(open(...)) would leave open.
with open('data/raw/AllTheNews/embeddings/2016_10p.json') as embeddings_file:
    article_embeddings = json.load(embeddings_file)

# Re-key each record ("id" -> "doc_id") and keep only the four fields written
# to the server-side embeddings file.
res = [
    {
        "doc_id": article_data['id'],
        'content': article_data['content'],
        "summary": article_data['summary'],
        "embedding": article_data['embedding'],
    }
    for article_data in article_embeddings
]
save_json(res, 'data/result/AllTheNews/network/server/embeddings.json')

In [5]:
def request_chatgpt_gpt4(messages, url="http://127.0.0.1:5000/event_hgraph", timeout=None):
    """POST chat ``messages`` to the completion endpoint and return the reply text.

    Args:
        messages: Chat messages; sent as ``{"messages": messages}`` in the JSON body.
        url: Endpoint to call. Defaults to the local ``event_hgraph`` route.
        timeout: Passed to ``requests.post``. ``None`` (the default) waits
            indefinitely, matching a call without a timeout.

    Returns:
        ``choices[0].message.content`` of the JSON response, with surrounding
        whitespace stripped.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    body = {"messages": messages}
    response = requests.post(url, json=body, timeout=timeout)
    # Fail with the HTTP status rather than a confusing KeyError / JSON decode
    # error when the server returns an error page.
    response.raise_for_status()
    payload = response.json()
    return payload['choices'][0]['message']['content'].strip()


In [35]:
def evaluate_fitness(doc, query, use_examples=False):
    """Ask the chat model whether ``doc`` is about ``query``.

    Args:
        doc: Article text (or summary) to classify.
        query: Topic the article is checked against.
        use_examples: When True, two few-shot example exchanges are inserted
            between the system prompt and the real question, in the message
            format originally drafted for this function. Defaults to False,
            which sends only the system prompt and the question.

    Returns:
        'Relevant' if the reply contains "relevant" (any letter case) and does
        not contain "irrelevant"; otherwise 'Irrelevant'. Unparseable replies
        therefore count as 'Irrelevant'.
    """
    example_news_1 = """
    The article discussed the severe flooding in Missouri and Illinois, which has resulted in numerous deaths and extensive damage to homes and communities. Gov. Jay Nixon declared a state of emergency and activated the Missouri National Guard to assist with evacuations and traffic control. The rising flood waters have also prompted the U.S. Army Corps of Engineers to work on strengthening levees to protect local areas.
    """
    example_query_1 = "Missouri flooding"
    example_answer_1 = "Relevant"
    example_news_2 = """
    The article discussed Vladimir Putin's long-standing support for Donald Trump, which was evident through Twitter accounts and Putin's recent comments calling Trump a \"brilliant and talented person.\" Putin's embrace of Trump is seen as a darker move, similar to his endorsements of other insurgent parties in Europe, as a way to chip away at countries and institutions that he believes have conspired against Russia. Putin supports Trump because of the chaos and destruction that Trump's antics bring to the U.S.
    """
    example_query_2 = "Hillary Clinton"
    example_answer_2 = "Irrelevant"
    template = """
    Article: {news} \n\n\n
    Do you think the article is talking about: {topic}? \n\n\n
    """
    messages = [
        {
            "role": "system",
            "content": """
                You are a news article filtering system.
                The user will provide you with an article and a topic, your job is to decide whether the article is talking about the topic.
                Reply with 'Relevant' or 'Irrelevant'.
            """,
        },
    ]
    if use_examples:
        # Few-shot demonstrations: each example question is followed by its expected answer.
        messages += [
            { "role": "user", "content": template.format(news=example_news_1, topic=example_query_1)},
            { "role": "system", "name": "example_system", "content": example_answer_1},
            { "role": "user", "content": template.format(news=example_news_2, topic=example_query_2)},
            { "role": "system", "name": "example_system", "content": example_answer_2},
        ]
    # user input
    messages.append({ "role": "user", "content": template.format(news=doc, topic=query)})
    response = request_chatgpt_gpt4(messages)
    # Compare case-insensitively so replies such as "relevant." are understood.
    # "irrelevant" contains "relevant", so the negative form must be ruled out.
    verdict = response.lower()
    if 'relevant' in verdict and 'irrelevant' not in verdict:
        return 'Relevant'
    return 'Irrelevant'


In [40]:
def binary_search_threshold(docs, query, window=5, verbose=True):
    """Binary-search a ranked document list for a relatedness cutoff.

    The search assumes ``docs`` is ordered from most to least related: a
    'Relevant' verdict at the probe index moves the search toward higher
    indices, anything else moves it toward lower ones.

    Args:
        docs: Sequence of ``(text, relatedness)`` pairs.
        query: Topic passed to ``evaluate_fitness`` for every probed document.
        window: Stop as soon as the next probe index is within this many
            positions of either end of the remaining interval. Defaults to 5.
        verbose: When True (default), print the index, verdict and text of
            each probed document.

    Returns:
        The relatedness value (element 1) of the entry at the final probe
        index. That entry itself is not evaluated.

    Raises:
        ValueError: If ``docs`` is empty or ``window`` is negative.
    """
    if len(docs) == 0:
        raise ValueError("docs must contain at least one (text, relatedness) pair")
    if window < 0:
        # A negative window can never be satisfied and would loop forever.
        raise ValueError("window must be non-negative")

    start = 0
    end = len(docs)
    mid = end // 2
    while True:
        quality = evaluate_fitness(docs[mid][0], query) # returns 'Relevant' or 'Irrelevant'
        if verbose:
            print(mid, quality, docs[mid][0])
        if quality == 'Relevant':
            start = mid
            mid = (mid + end) // 2
        else:
            end = mid
            mid = (mid + start) // 2

        if mid - start <= window or end - mid <= window:
            break
    return docs[mid][1]

In [41]:
query = "donald trump presidential election"
# Rank every article summary against the query, then probe that ranking for a relatedness cutoff.
strings_and_relatednesses = strings_ranked_by_relatedness(query, article_embeddings)
# Bare last expression: the notebook displays the returned relatedness value as the cell output.
binary_search_threshold(strings_and_relatednesses, query)
# Optional inspection of the full ranking (left disabled):
# for string, relatedness in strings_and_relatednesses:
#     print(f"{relatedness=:.3f}")
#     display(string)

3774 Irrelevant The article discussed how former NBA star Dennis Rodman has been charged with four misdemeanors, including driving his car in the wrong direction on a freeway in Southern California, causing another driver to crash. The incident occurred in Santa Ana, California, on July 20 when Rodman was driving an SUV north in a southbound carpool lane on Interstate 5. Rodman allegedly drove directly at another car, forcing the driver to swerve and crash into a concrete dividing wall, and then fled the scene.
1887 Relevant The article discussed Donald Trump's plan to help families afford child care, which includes allowing families to fully deduct the average cost of child care. However, this deduction would mostly benefit richer families. Trump's plan has fewer components compared to Clinton's plan, which aims to limit childcare expenses at 10% of household income and increase spending on early Head Start. Trump's proposal represents an evolution from his previous dismissive stance 

0.7908691993456348

In [44]:
# Spot-check: print the summary text (tuple element 0) at index 2154 of the ranked list.
print(strings_and_relatednesses[2154][0])

The article discussed the controversy surrounding the headline "Bill Kristol: Republican Spoiler, Renegade Jew" and the backlash it received on social media. It also highlighted Kristol's plan to mount a third-party run, which the author argues would split the Republican vote and ultimately benefit Obama, Hillary, and Huma Abedin, who are seen as enablers of a planned Holocaust in the Middle East.


In [2]:
def cleanSpans(all_spans):
    """Return a subset of spans in which no kept span touches an earlier one.

    Spans are ``(start, end, payload)`` triples. They are processed in order of
    ascending start; when several spans share a start, the longest comes first
    so that the containing span is kept rather than one nested inside it.
    A span is kept only if its start is strictly greater than the end of the
    last kept span. Nested spans, partially overlapping spans, and spans that
    begin exactly where the last kept span ends are all dropped.

    The input list is not modified.

    Args:
        all_spans: Iterable of ``(start, end, payload)`` triples with numeric
            ``start`` and ``end``.

    Returns:
        A new list of the kept spans, ordered by start.
    """
    # sorted() leaves the caller's list untouched.
    ordered_spans = sorted(all_spans, key=lambda span: (span[0], -span[1]))
    cleaned_spans = []
    current_max_end = float('-inf')  # nothing kept yet, so any start qualifies

    for span in ordered_spans:
        start, end, _ = span
        # Keep the span only if it begins after the last kept span has ended.
        if start > current_max_end:
            cleaned_spans.append(span)
            current_max_end = end
    return cleaned_spans

# Demo: the nested spans (2, 4) and (8, 9) are dropped; each kept span retains its third element.
spans = [(1, 5, ""), (2, 4, ""), (6, 10, ""), (8, 9, "")]
result = cleanSpans(spans)
print(result)  # Output: [(1, 5, ''), (6, 10, '')]


[(1, 5, ''), (6, 10, '')]
