In [None]:
# necessary imports
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
import torch
import time
from bs4 import BeautifulSoup
import requests 
import urllib.parse

In [None]:
# Notebook-level settings; to be removed when this notebook is converted into a script.
# NOTE: the value below is still a placeholder ("insert the dataset link here"),
# it has to be replaced with a real CSV location before it is passed to load_dataset.
dataset_URL = "To Do - wstawić link do datasetu"
# device = "cuda"  if torch.cuda.is_available() else "cpu"
# Hugging Face identifier passed to load_model_and_tokenizer.
model_name = "speakleash/Bielik-11B-v2.2-Instruct"

In [None]:
# defining the main function
def main():
    """
    Entry point of the script.

    At the moment it only announces that the run has started; the remaining
    steps (starting with loading the dataset) are not implemented yet.
    """
    start_message = "Getting started..."
    print(start_message)
    # loading the dataset
   
    
    

In [4]:
# necessary definitions
def load_dataset(dataset_URL):
    """
    Reads a CSV file into a DataFrame and discards the 'Unnamed' columns.

    Args:
        dataset_URL (str): Location of the CSV file, handed to `pd.read_csv`.

    Returns:
        pd.DataFrame: The loaded data without any column whose name
        starts with 'Unnamed'.
    """
    loaded = pd.read_csv(dataset_URL)

    # Collect the labels that match the 'Unnamed' prefix ...
    is_unnamed = loaded.columns.str.contains('^Unnamed')
    redundant_columns = loaded.columns[is_unnamed]

    # ... and hand back a frame that no longer contains them.
    return loaded.drop(columns=redundant_columns)

def load_model_and_tokenizer(model_name):
    """
    Loads a causal language model with 4-bit quantization and the matching tokenizer.

    Args:
        model_name (str): The name of the model from Hugging Face.

    Returns:
        tuple: `(model, tokenizer)` where `model` is the quantized
        `AutoModelForCausalLM` and `tokenizer` is the `AutoTokenizer`
        loaded for the same `model_name`.
    """
    # BitsAndBytes settings: 4-bit NF4 weights, double quantization, float16 compute.
    quantization_settings = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    # device_map="auto" leaves the device placement to the library.
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(**quantization_settings),
        device_map="auto",
    )

    return quantized_model, AutoTokenizer.from_pretrained(model_name)

# def create_pipeline(model, tokenizer):
#     """
#     Creates a pipeline for the model and tokenizer.

#     Args:
#         model (transformers.PreTrainedModel): Model.
#         tokenizer (transformers.PreTrainedTokenizer): Tokenizer.

#     Returns:
#         transformers.Pipeline: Pipeline.
#     """
#     # Creating pipeline
#     pipeline = pipeline('text-generation', model=model, tokenizer=tokenizer)

#     return pipeline

In [5]:
def generate_text(model, tokenizer, prompt, max_new_tokens=256, return_full_text=False):
    """
    Generates text based on the given prompt using the quantized model.

    Args:
        model: The loaded model.
        tokenizer: The tokenizer for the model.
        prompt (str): The input text prompt.
        max_new_tokens (int, optional): Maximum number of tokens to generate.
        return_full_text (bool, optional): When True, the decoded prompt is kept
            in front of the generated continuation. Defaults to False, so only
            the newly generated text is returned.

    Returns:
        str: The generated text (prefixed by the prompt only when
        `return_full_text` is True).
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)

    sequence = output[0]
    if not return_full_text:
        # For a causal LM the generated sequence starts with the prompt tokens;
        # cut them off so the caller does not get the prompt echoed back.
        prompt_length = inputs["input_ids"].shape[-1]
        sequence = sequence[prompt_length:]

    return tokenizer.decode(sequence, skip_special_tokens=True)

In [6]:
def querry_llm(df, model, tokenizer):
    """
    Asks the model every question from the 'question' column and stores the replies.

    Args:
        df (pd.DataFrame): DataFrame with 'question' column.
        model: Quantized Bielik model.
        tokenizer: Tokenizer for the model.

    Returns:
        pd.DataFrame: The same DataFrame, extended with a new column "llm_answer".

    Raises:
        ValueError: If the DataFrame has no 'question' column.
    """
    # Guard clause: nothing can be done without the questions.
    if "question" not in df.columns:
        raise ValueError("DataFrame must contain a 'question' column.")

    total = len(df)
    llm_answers = []

    # One model call per question, with progress and the reply echoed to stdout.
    for position, question in zip(range(1, total + 1), df["question"]):
        print(f"Processing question {position}/{total}...")
        reply = generate_text(model, tokenizer, question)
        print(f"Response: {reply}")
        llm_answers.append(reply)

    df["llm_answer"] = llm_answers
    return df

In [7]:
def _clean_result_link(link):
    """Returns the target URL of a DuckDuckGo result href, or None if the href is unusable."""
    redirect_prefix = "//duckduckgo.com/l/?uddg="

    if link.startswith(redirect_prefix):
        # The target URL is percent-encoded in the `uddg` parameter. Cut off the
        # parameters that follow it *before* decoding; decoding first would turn
        # the target's own encoded '&' into a separator and truncate the URL.
        encoded_target = link[len(redirect_prefix):].split("&", 1)[0]
        return urllib.parse.unquote(encoded_target)

    if link.startswith("http"):
        # Direct link: as before, everything from the first '&' on is discarded.
        return link.split("&")[0]

    return None


def search_web(df, num_results):
    """
    Searches the web using DuckDuckGo and returns the top results.

    Args:
        df (pd.DataFrame): DataFrame with 'question' column.
        num_results (int): Number of top results to return.

    Returns:
        pd.DataFrame: Updated DataFrame with a new column "web_results" (list of URLs).
        A question whose search request fails gets an empty list.

    Raises:
        ValueError: If the DataFrame has no 'question' column.
    """
    if "question" not in df.columns:
        raise ValueError("DataFrame must contain a 'question' column.")

    search_endpoint = "https://duckduckgo.com/html/"
    headers = {"User-Agent": "Mozilla/5.0"}
    total = len(df)
    web_results = []

    for idx, question in enumerate(df["question"], start=1):
        print(f"Searching the web for question {idx}/{total}: {question}")

        links = []
        try:
            # `params` makes requests URL-encode the question, so characters such
            # as '&', '#' or '+' stay part of the query instead of breaking the URL.
            response = requests.get(
                search_endpoint,
                params={"q": question},
                headers=headers,
                timeout=10,
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Taking the first `num_results` results
            raw_links = [a.get("href", "") for a in soup.select(".result__url")][:num_results]
            print(f"Found {len(raw_links)} results.")

            # Resolve DuckDuckGo redirect links to the real target URLs
            for raw_link in raw_links:
                cleaned_link = _clean_result_link(raw_link)
                if cleaned_link:
                    links.append(cleaned_link)
        except requests.RequestException as e:
            print(f"Error fetching search results for '{question}': {e}")
            links = []

        web_results.append(links)
        # Pause between consecutive search requests.
        time.sleep(2)

    df["web_results"] = web_results
    return df

def fetch_page_content(url, max_chars=10000):
    """
    Fetches a webpage and returns its visible text.

    Args:
        url (str): URL of the webpage.
        max_chars (int, optional): Maximum number of characters of text to
            return. Defaults to 10000.

    Returns:
        str | None: Text of the page (scripts, styles, headers, footers,
        navigation and asides removed), cut to `max_chars` characters,
        or None when the request fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Błąd pobierania strony {url}: {e}")
        return None

    # Parse the raw bytes instead of `response.text`: without a charset in the
    # Content-Type header requests falls back to ISO-8859-1 for text responses,
    # which garbles non-Latin-1 pages. Given bytes, BeautifulSoup can use the
    # encoding declared inside the document; a charset announced in the HTTP
    # header is still passed along as the preferred encoding.
    content_type = response.headers.get("Content-Type", "")
    declared_encoding = response.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

    # Remove unnecessary sections
    for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        tag.decompose()

    # Extracting the remaining text of the page
    text = ' '.join(soup.stripped_strings)

    return text[:max_chars]
        
def extract_web_content(df):
    """
    Downloads the pages listed in 'web_results' and stores their text in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'web_results' column containing lists of URLs.

    Returns:
        pd.DataFrame: The same DataFrame with a new column "extracted_text"
        (for every row: a list with one `fetch_page_content` result per non-empty URL).

    Raises:
        ValueError: If the DataFrame has no 'web_results' column.
    """
    if "web_results" not in df.columns:
        raise ValueError("DataFrame must contain a 'web_results' column.")

    total = len(df)
    texts_per_question = []

    for position, urls in zip(range(1, total + 1), df["web_results"]):
        print(f"Pobieranie treści dla zapytania {position}/{total}...")

        # Empty URL entries are skipped; every other URL is fetched in order.
        fetched = []
        for url in urls:
            if url:
                fetched.append(fetch_page_content(url))
        texts_per_question.append(fetched)

        # Pause before moving on to the next row.
        time.sleep(2)

    df["extracted_text"] = texts_per_question
    return df

In [None]:
# querying the model with context obtained from web scraping
def answer_with_context(df, model, tokenizer, max_new_tokens=256, max_context_chars=5000):
    """
    Queries the LLM model based on the question and context from the network.

    Args:
        df (pd.DataFrame): DataFrame with 'question' and 'extracted_text' columns.
        model: loaded LLM model.
        tokenizer: Tokenizer for the model.
        max_new_tokens (int, optional): The maximum number of tokens to generate.
        max_context_chars (int, optional): Maximum number of characters of the
            combined page content placed into the prompt. Defaults to 5000.

    Returns:
        pd.DataFrame: Updated DataFrame with 'llm_answer_with_context' column.

    Raises:
        ValueError: If the 'question' or 'extracted_text' column is missing.
    """
    if "question" not in df.columns or "extracted_text" not in df.columns:
        raise ValueError("DataFrame must contain 'question' and 'extracted_text' columns.")

    fallback_context = "No web results found."
    total = len(df)
    answers = []

    for idx, (question, context_list) in enumerate(zip(df["question"], df["extracted_text"]), start=1):
        print(f"Processing question {idx}/{total}...")

        # Combining page content into a single string. None entries (pages that
        # could not be fetched) are skipped; when nothing usable remains - an
        # empty list or only failed pages - the fallback text is used instead of
        # an empty context.
        context_text = " ".join(filter(None, context_list or []))[:max_context_chars]
        if not context_text:
            context_text = fallback_context

        # creating prompt
        prompt = f""" Odpowiedz na pytanie na podstawie poniższego kontekstu:
        KONTEKST: {context_text}
        PYTANIE: {question}
        ODPOWIEDŹ:
        """
        # generating answer
        answer = generate_text(model, tokenizer, prompt, max_new_tokens=max_new_tokens)
        print(f"Response: {answer}")

        answers.append(answer)
        time.sleep(1)

    df["llm_answer_with_context"] = answers
    return df

# saving the results
def save_results(df, output_file):
    """
    Writes the DataFrame to disk as a CSV file, leaving out the index.

    Args:
        df (pd.DataFrame): DataFrame to save.
        output_file (str): Path to the output CSV file.
    """
    announcement = f"Saving results to a file: {output_file}"
    print(announcement)

    # index=False keeps the row index out of the written file.
    df.to_csv(path_or_buf=output_file, index=False)