Written by Can Erozer

# Extracting the Relevant Text from the Articles:

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain.schema import StrOutputParser

In [None]:
api_key=""

In [4]:
directory="/Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/question_answering_experiement/"
test_files=["row28_article_text.txt","row63_article_text.txt","row124_article_text.txt","row126_article_text.txt","row127_article_text.txt","row201_article_text.txt","row202_article_text.txt","row208_article_text.txt","row209_article_text.txt","row267_article_text.txt","row452_article_text.txt","row463_article_text.txt"]


In [5]:
urls=["https://www.cbs58.com/news/34-wisconsin-men-were-arrested-by-immigration-and-customs-enforcement",\
     "https://abc7chicago.com/ice-raid-los-angeles-southern-california-four-day-record-244-criminal-immigrants/964411/",\
     "https://www.justice.gov/opa/pr/fourteen-alleged-gang-members-and-associates-indicted-charleston-south-carolina-federal",\
     "https://www.hjnews.com/education/cache-high-senior-works-stays-positive-despite-dads-deportation/article_96d1bd75-0ac0-5ed5-a885-cad032bee9ea.html",\
     "https://www.noozhawk.com/12_arrested_40_in_custody_after_santa_maria_police_led_sweep/",\
     "https://www.azcentral.com/story/noticias/2016/06/17/arrestan-39-inmigrantes-durante-redada-en-wisconsin/86054218/",\
     "https://www.ice.gov/news/releases/more-100-arrested-los-angeles-area-ice-operation-targeting-convicted-criminal-aliens",\
     "https://www.dnainfo.com/chicago/20160811/avondale/this-is-wrong-latino-day-laborer-declares-of-ice-raids-at-job-sites/",\
     "https://www.fox8live.com/story/33050506/undocumented-immigrants-arrested-for-operating-new-orleans-sex-brothel/",\
     "https://weartv.com/news/local/investigators-man-shared-child-pornography-on-kid-friendly-kik-app",\
     "https://www.justice.gov/usao-nh/pr/45-individuals-indicted-participating-fentanyl-trafficking-conspiracy",\
     "https://www.nj.com/news/2018/04/ice_arrests_60_in_nj_in_5-day_enforcement_operatio.html"]


In [6]:
prompt_extract_info="""I have scraped the text of an article, but it contains extra information like side titles, navigation menus, and other irrelevant content. I also have the title of the article. Please extract only the relevant parts of the text that match the main topic of the article based on its title. Ignore anything unrelated to the topic. 

Here is the contex: {context}

Here is the title: {title}

Instructions:

Focus only on the parts of the text that relate directly to the title and the main topic of the article.
Don't forget to include the important dates in the article. Also, include the publication date of the article if it is present in the text.
Remove any information that is clearly unrelated, like website headers, footers, links, or side titles.
Provide the cleaned-up and relevant text as the output.

Output format:

A cleaned and concise version of the article text, free of irrelevant information. Don't make any explanations."""


In [7]:
prompt_similarity_score="""I have two texts: the real text of an article and a cleaned version generated by a model. I want to assess how similar these two texts are. Please compare them based on content relevance, key topics, and information retention. Provide a similarity score on a scale of from 0 to 100, where:

0 means the texts are completely dissimilar.
100 means the texts are identical in terms of information and content.

Here is the real text of the article: {text_truth}
Here is the cleaned text generated by the model: {text_cleaned}

Instructions:

Compare the two texts based on:
Shared key ideas and topics.
Retention of the main points in the cleaned text.
Any notable missing or extra information in the cleaned text.
Provide a similarity score (0–100).
Don't make any explanations. Just give the score.
"""


In [8]:
#this returns a tuple(str, str). The first element of the tuple is the title of the article
#and the second element of the tuple is the all of the text present in the url
def get_all_texts_from_url(article_urls):
    
    all_texts=[]
    
    for url in article_urls:
    
        # Fetch the webpage content
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Get all visible text from the webpage
            article_text = soup.get_text(strip=True)
            
            # Extract text from h1, h2, and p tags
            headings_and_paragraphs = []
            for tag in soup.find_all(['h1']):
                headings_and_paragraphs.append(tag.get_text(strip=True))
                
            all_texts +=[(headings_and_paragraphs[0], article_text)]
                

        else:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")

    
    
    return all_texts
    
    

In [9]:
results=get_all_texts_from_url(urls)

In [10]:
def get_only_relevant_text(title_text_pairs, model_name):
    
    main_text=[]
    cost=0
    
    for title, context in title_text_pairs:
    
        PROMPT0 = PromptTemplate(template=prompt_extract_info, input_variables=["context", "title"])

        with get_openai_callback() as cb0:
            llm = LLMChain(
                llm = ChatOpenAI(openai_api_key=api_key,
                         temperature=0.01, model=model_name), prompt=PROMPT0)

            response0 = llm.predict(context=context, title=title)

            cost +=cb0.total_cost


        main_text +=[title + response0]
        
        
    return main_text, cost

        
    

In [11]:
def get_similarity(pred_text, true_text):
    
    cost=0
    
    PROMPT0 = PromptTemplate(template=prompt_similarity_score, input_variables=["text_truth", "text_cleaned"])

    with get_openai_callback() as cb0:
        llm = LLMChain(
            llm = ChatOpenAI(openai_api_key=api_key,
                     temperature=0.01, model="gpt-4o-mini"), prompt=PROMPT0)

        response0 = llm.predict(text_cleaned=pred_text, text_truth=true_text)

        cost +=cb0.total_cost

        
    return response0, cost
    
    

In [12]:
def write_pred_texts(pred_texts, out_file_names):
    
    directory_out="/Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/"
    
    file_names=[]
     
    for name in out_file_names:
        name = "pred_" + name
        file_names +=[name]
        
    if len(file_names)!=len(pred_texts):
        print("pred_text format is not good!")
        return 
        
    for i in range(len(file_names)):

        with open(directory_out+file_names[i], "w",encoding='utf-8') as file:

            file.write(pred_texts[i])
            print(f"{file_names[i]} written to {directory_out}")
        
    

In [20]:
def load_true_texts(test_files):
    
    true_texts=[]
    
    for file in test_files:
        info=""
        with open(directory+file,"r") as f:

            for line in f:

                info += line
                
        
        true_texts += [info]     
        
        
    return true_texts

        

In [24]:
def pipeline(urls, test_files):
    
    
    results=get_all_texts_from_url(urls)
    
    cleaned_texts, cost=get_only_relevant_text(results, "gpt-4o-mini")
    
    true_texts=load_true_texts(test_files)
    
    for cleaned_text, true_text in zip(cleaned_texts, true_texts):
        
        similarity, add_cost=get_similarity(cleaned_text, true_text)
        print(f"Similarity is %{similarity}")
        cost +=add_cost
        
        
    write_pred_texts(cleaned_texts, test_files)
    print(f"Total cost is {cost}")
    
    


In [25]:
pipeline(urls, test_files)

Similarity is %95
Similarity is %Similarity Score: 95
Similarity is %85
Similarity is %90
Similarity is %85
Similarity is %100
Similarity is %85
Similarity is %90
Similarity is %Similarity Score: 95
Similarity is %85
Similarity is %85
Similarity is %85
pred_row28_article_text.txt written to /Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/
pred_row63_article_text.txt written to /Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/
pred_row124_article_text.txt written to /Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/
pred_row126_article_text.txt written to /Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/
pred_row127_article_text.txt written to /Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/
pred_row201_article_text.txt written to /Users/canerozer/Desktop/BU/FALL2024/DS701/DS701_Proje/text_extraction_experiment/
pred_row202