In [1]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import requests 

In [2]:
def search_web(df, num_results):
    """
    Searches the web using DuckDuckGo and returns the top results.

    Args:
        df (pd.DataFrame): DataFrame with 'question' column.
        num_results (int): Number of top results to return.
        
    Returns:
        pd.DataFrame: Updated DataFrame with a new column "web_results" (list of URLs).
    """
    if "question" not in df.columns:
        raise ValueError("DataFrame must contain a 'question' column.")

    web_results = []

    for idx, question in enumerate(df["question"], start=1):
        print(f"Searching the web for question {idx}/{len(df)}: {question}")

        search_url = f"https://duckduckgo.com/html/?q={question}"
        headers = {"User-Agent": "Mozilla/5.0"}
        
        try:
            response = requests.get(search_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Downloading the first `num_results` results
            links = [a["href"] for a in soup.select(".result__url")][:num_results]
            print(f"Found {len(links)} results.")
        except requests.RequestException as e:
            print(f"Error fetching search results for '{question}': {e}")
            links = []
        
        web_results.append(links)
        time.sleep(2)  

    df["web_results"] = web_results
    return df

def fetch_page_content(url):
    """
    Fetches the content of a webpage.

    Args:
        url (str): URL of the webpage.

    Returns:
        str: Content of the webpage.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unnecessary sections
        for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            tag.decompose()

        # Extracting the main content of the page
        text = ' '.join(soup.stripped_strings)

        return text[:10000]  # Limit of 10,000 characters
    except requests.RequestException as e:
        print(f"Błąd pobierania strony {url}: {e}")
        return None
        
def extract_web_content(df):
    """"
    Retrieves article content from pages stored in 'web_results' and adds them to DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'web_results' column containing lists of URLs.

    Returns:
        pd.DataFrame: DataFrame with a new column “extracted_text” (list of article content).
    """
    if "web_results" not in df.columns:
        raise ValueError("DataFrame must contain a 'web_results' column.")

    extracted_texts = []

    for idx, urls in enumerate(df["web_results"], start=1):
        print(f"Pobieranie treści dla zapytania {idx}/{len(df)}...")
        page_texts = [fetch_page_content(url) for url in urls if url]  
        extracted_texts.append(page_texts)
        time.sleep(2)  

    df["extracted_text"] = extracted_texts
    return df

In [3]:
questions = [
    "Kiedy założono miasto Poznań?",
    "Jaka epoka nastąpiła po neolicie?",
    "Kto jest autorem tak zwanej Trzynastej Ksiegi Pana Tadeusza?"
    ]

df = pd.DataFrame({"question": questions})
num_results = 3

In [4]:
test_df = search_web(df, 3)

Searching the web for question 1/3: Kiedy założono miasto Poznań?
Found 3 results.
Searching the web for question 2/3: Jaka epoka nastąpiła po neolicie?
Found 3 results.
Searching the web for question 3/3: Kto jest autorem tak zwanej Trzynastej Ksiegi Pana Tadeusza?
Found 3 results.


In [5]:
test_df.head()

Unnamed: 0,question,web_results
0,Kiedy założono miasto Poznań?,[//duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wik...
1,Jaka epoka nastąpiła po neolicie?,[//duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wik...
2,Kto jest autorem tak zwanej Trzynastej Ksiegi ...,[//duckduckgo.com/l/?uddg=https%3A%2F%2Fcodzie...


In [6]:
test_df = extract_web_content(test_df)

Pobieranie treści dla zapytania 1/3...
Błąd pobierania strony //duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wikipedia.org%2Fwiki%2FPozna%25C5%2584&rut=e36f921c6edc541d6b8725426e0dc86f994bee10136042bdfc139647baa52513: Invalid URL '//duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wikipedia.org%2Fwiki%2FPozna%25C5%2584&rut=e36f921c6edc541d6b8725426e0dc86f994bee10136042bdfc139647baa52513': No scheme supplied. Perhaps you meant https:////duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wikipedia.org%2Fwiki%2FPozna%25C5%2584&rut=e36f921c6edc541d6b8725426e0dc86f994bee10136042bdfc139647baa52513?
Błąd pobierania strony //duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wikipedia.org%2Fwiki%2FHistoria_Poznania&rut=71078d6c7a7e8936ed79f9702b062b99f9af694cea3e5c26ec20805d141b76cf: Invalid URL '//duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wikipedia.org%2Fwiki%2FHistoria_Poznania&rut=71078d6c7a7e8936ed79f9702b062b99f9af694cea3e5c26ec20805d141b76cf': No scheme supplied. Perhaps you meant https:////duckduckgo.com/l/?uddg=https%3A%2F

In [7]:
df.head()

Unnamed: 0,question,web_results,extracted_text
0,Kiedy założono miasto Poznań?,[//duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wik...,"[None, None, None]"
1,Jaka epoka nastąpiła po neolicie?,[//duckduckgo.com/l/?uddg=https%3A%2F%2Fpl.wik...,"[None, None, None]"
2,Kto jest autorem tak zwanej Trzynastej Ksiegi ...,[//duckduckgo.com/l/?uddg=https%3A%2F%2Fcodzie...,"[None, None, None]"
