In [1]:
import requests
from bs4 import BeautifulSoup
import os
from openai import OpenAI
import re
import warnings
import ast

In [2]:
from IPython.display import display, HTML

# Inject a <style> element into the notebook so that elements with the
# `.output_area` class wrap long lines instead of overflowing.
# NOTE(review): `white-space` is declared twice inside the rule (`pre-wrap`,
# then `normal`); the later declaration presumably wins, which would make the
# `pre-wrap` line dead — confirm which one is intended.
display(HTML("""
<style>
.output_area {
    overflow-wrap: break-word;
    word-wrap: break-word;
    white-space: pre-wrap;
    word-break: break-all;
    white-space: normal;
}
</style>
"""))


In [3]:
def check_sub(small, large):
    """Report whether every sentence of ``small`` occurs verbatim in ``large``.

    ``small`` is split after each ``.``, ``!`` or ``?`` that is followed by one
    or more spaces. Each resulting piece is looked up in ``large`` with a plain
    substring test, and a warning is emitted for every piece that is missing.

    Parameters:
    - small: Text whose sentences should all appear in ``large``.
    - large: Reference text that is searched.

    Returns:
    True if every sentence of ``small`` was found in ``large``, False otherwise.
    """
    # Split the smaller string into sentences based on punctuation
    sentences = re.split(r'(?<=[.!?]) +', small)

    all_found = True

    # enumerate() yields each sentence's real position. list.index() would
    # return the position of the FIRST equal sentence, so a repeated sentence
    # was always reported under the wrong index (and cost an extra scan).
    for position, sentence in enumerate(sentences):
        if sentence not in large:
            warnings.warn(f"Sentence not in article: sentence {position}, '{sentence}'")
            all_found = False

    return all_found

In [4]:
# System-prompt prefix used by get_key_sentences(): it states the card-cutting
# rules (return whole sentences verbatim, only omit sentences), and the
# caller-supplied task text is appended after it.
assignment = "You are an AI model designed to assist with \'card-cutting\' for policy debate. You may NOT modify the text by deleting or adding any words to the chosen sentences. You may only leave out the sentences you deem unimportant. Within each sentence you deem important enough to return, you MAY NOT remove ANY words from the sentence, nor change punctuation or capitalization. In fact, your return, if broken into individual sentences, should ensure that each and every returned sentence matches EXACTLY with a given sentence in the initial article. You will fail at your task if you do not follow these instructions. Your goal is to take the text of an article, understand what the article's key arguments are, and return a response including ONLY the key sentences based on your reading. Pick out the key parts according to the following request:"

In [5]:
# Module-level OpenAI client shared by get_key_sentences() and highlight().
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment — confirm before running on a fresh kernel.
client = OpenAI()

In [6]:
def get_key_sentences(task, article, model="gpt-3.5-turbo"):
    """Ask the chat model for the sentences of ``article`` relevant to ``task``.

    The system message is the module-level ``assignment`` prompt followed by
    ``task``; the user message is the article text.

    Parameters:
    - task: Description of the argument the selected sentences should support.
    - article: Full article text to select sentences from.
    - model: Name of the chat model to call; defaults to the value that was
      previously hard-coded.

    Returns:
    The content of the first choice's message, exactly as returned by the API.
    """
    # `assignment` ends with "request:", so a separating space is needed to
    # keep the task from being glued directly onto the colon.
    system_prompt = f"{assignment} {task}"

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": article},
        ],
    )

    return completion.choices[0].message.content


In [16]:
#highlight
def highlight(key_sentences, task, model="gpt-3.5-turbo"):
    """
    Takes important lines extracted from an article and distills them into
    the most critical phrases or words to construct an argument.

    Parameters:
    - key_sentences: A string containing the important lines as extracted by the previous API call.
    - task: Description of the argument that guides the extraction of key phrases.
    - model: Name of the chat model to call; defaults to the value that was previously hard-coded.

    Returns:
    A list of strings, one per key phrase. The list is empty when the API
    returned no choices or an empty message.

    Raises:
    ValueError: If the model's reply cannot be parsed as a Python list.
    """
    # Construct the API call with a new prompt
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an assistant specialized in identifying and highlighting key phrases within specific sentences. Your goal is to extract and highlight the phrases that are most crucial for constructing a well-warranted argument based on specified topics. Highlight the essential phrases but ensure you do not exclude reasoning, argumentative warrants, and key examples in the key sentences. As a rule of thumb, try to highlight a third OR MORE of the article depending on how dense it is in arguments. The highlighting ought to be readable and meaningful in terms of conveying information but not overly wordy. Return the selected phrases IN A PYTHON LIST with each key phrase being an element of that list. You will fail at your task if there is anything outside the individual strings in the list. PLEASE make sure each string that is a list element is enclosed in DOUBLE quotation marks like \""},
            {"role": "user", "content": f"From the following important lines, extract and highlight the phrases most crucial for the argument based on the following: {task}. Here are the important lines: {key_sentences}"}
        ]
    )

    # Extracting the highlighted phrases from the response
    raw_reply = response.choices[0].message.content if response.choices else ""

    return _parse_phrase_list(raw_reply)


def _parse_phrase_list(raw_reply):
    """Convert the model's textual reply into a list of non-empty phrase strings."""
    # An empty reply used to be passed to ast.literal_eval(""), which raises
    # SyntaxError; treat it as "nothing to highlight" instead.
    if raw_reply is None or not raw_reply.strip():
        return []

    # The reply may carry a code fence or a short preamble around the list, so
    # only the outermost [...] span is handed to the literal parser.
    start = raw_reply.find("[")
    end = raw_reply.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"Model reply does not contain a Python list: {raw_reply!r}")

    try:
        parsed = ast.literal_eval(raw_reply[start:end + 1])
    except (ValueError, SyntaxError, TypeError) as exc:
        raise ValueError(f"Could not parse model reply as a Python list: {raw_reply!r}") from exc

    if not isinstance(parsed, list):
        raise ValueError(f"Model reply is not a single Python list: {raw_reply!r}")

    # Keep only real, non-blank strings so downstream matching never receives
    # an empty or non-text keyword.
    return [item for item in parsed if isinstance(item, str) and item.strip()]


In [41]:
from docx import Document
from docx.shared import RGBColor
from docx.enum.text import WD_COLOR_INDEX

def final_cut(article, keywords, doc_path, color=WD_COLOR_INDEX.YELLOW):
    """Append ``article`` to a Word document with every keyword highlighted.

    The document at ``doc_path`` is opened, one new paragraph holding the full
    article text is added at the end, and the document is saved back to the
    same path. Each case-insensitive occurrence of a keyword is written as its
    own run with the highlight colour and underline applied.

    Parameters:
    - article: Full article text to write.
    - keywords: Iterable of phrases to highlight; empty, blank or non-string
      entries are ignored.
    - doc_path: Path of the existing .docx file to load and overwrite.
    - color: Highlight colour applied to matching runs.

    Returns:
    None.
    """
    # Load the existing Word document
    doc = Document(doc_path)

    # Add a new paragraph at the end of the document
    paragraph = doc.add_paragraph()

    pattern = _keyword_pattern(keywords)

    # Walk the matches once, emitting the plain text between them as normal
    # runs and each match as a highlighted run.
    cursor = 0
    if pattern is not None:
        for match in pattern.finditer(article):
            if match.start() > cursor:
                paragraph.add_run(article[cursor:match.start()])
            run = paragraph.add_run(match.group(0))
            run.font.highlight_color = color
            run.font.underline = True
            cursor = match.end()

    # Trailing text after the last match (or the whole article if no match).
    if cursor < len(article):
        paragraph.add_run(article[cursor:])

    # Save the changes to the existing document
    doc.save(doc_path)


def _keyword_pattern(keywords):
    """Compile one case-insensitive regex matching any usable keyword, or return None."""
    # An empty alternative would match at every position, so blank entries
    # (and an entirely empty keyword list) must never reach the pattern.
    usable = {kw for kw in keywords if isinstance(kw, str) and kw.strip()}
    if not usable:
        return None

    # Regex alternation takes the first alternative that matches, so longer
    # phrases go first to avoid being pre-empted by a shorter prefix.
    ordered = sorted(usable, key=lambda kw: (-len(kw), kw))
    alternation = '|'.join(re.escape(kw) for kw in ordered)

    # Lookarounds are equivalent to \b for keywords that start and end with a
    # word character, but unlike \b they also work when a keyword begins or
    # ends with punctuation (e.g. a phrase ending in a period before a space).
    return re.compile(r'(?<!\w)(?:' + alternation + r')(?!\w)', flags=re.IGNORECASE)


In [65]:
def card_cutter(task, article, document):
    """Cut a card: select key sentences, pick key phrases, write the result.

    Parameters:
    - task: Description of the argument the card should support.
    - article: Full article text.
    - document: Path of the Word document that final_cut() appends to.

    Returns:
    None.
    """
    ## get key sentences
    key_sentences = get_key_sentences(task, article)

    # highlight() is declared as highlight(key_sentences, task). The previous
    # positional call passed (task, key_sentences), swapping the two strings
    # inside the prompt; keyword arguments make the pairing explicit.
    highlighted = highlight(key_sentences=key_sentences, task=task)
    final_cut(article, highlighted, document)