In [None]:
# necessary imports
from time import sleep

import pandas as pandas
import pandas as pd  # the code below refers to pandas through the ``pd`` alias
import requests
import torch
from bs4 import BeautifulSoup
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

In [None]:
# Variables to be removed when this notebook is converted into a script.
# Both values are still placeholders and have to be filled in before running.
dataset_URL = "To Do - wstawić link do datasetu"
model_id = "To Do - wstawić id modelu"

In [None]:
#defining the main function
def main():
    """
    Entry point of the script.

    Prints a start-up message and reads the CSV found at the module-level
    ``dataset_URL`` into a DataFrame.
    """
    print("Getting started...")
    # Read the raw dataset from the configured location
    dataset = pd.read_csv(dataset_URL)
    

In [None]:
# necessary definitions
def load_dataset(dataset_URL):
    """
    Reads a CSV file and discards its 'Unnamed' columns.

    Args:
        dataset_URL: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The loaded data without the columns whose label starts
        with 'Unnamed'.
    """
    raw = pd.read_csv(dataset_URL)

    # Columns whose label starts with 'Unnamed' are treated as redundant
    is_unnamed = raw.columns.str.contains('^Unnamed')
    redundant = raw.columns[is_unnamed]

    return raw.drop(columns=redundant)

def load_model_and_tokenizer(model_name):
    """
    Loads a causal language model together with its tokenizer.

    Args:
        model_name: Model identifier, forwarded unchanged to both
            ``from_pretrained`` calls (the project notes name
            Bielik-11-B-v2.3-Instruct from Huggingface as the intended model).

    Returns:
        tuple: ``(model, tokenizer)``, in that order.
    """
    # The model is loaded first, then the tokenizer
    llm = AutoModelForCausalLM.from_pretrained(model_name)
    llm_tokenizer = AutoTokenizer.from_pretrained(model_name)

    return llm, llm_tokenizer

def create_pipeline(model, tokenizer):
    """
    Creates a text-generation pipeline for the model and tokenizer.

    Args:
        model (transformers.PreTrainedModel): Model.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer.

    Returns:
        transformers.Pipeline: Pipeline built for the 'text-generation' task.
    """
    # The result must not be bound to a local called ``pipeline``: assigning to
    # that name anywhere in the function makes it local for the whole body, so
    # the call itself would raise UnboundLocalError instead of reaching the
    # module-level ``pipeline`` factory.
    text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

    return text_generator

In [None]:
def querry_llm(df, pipeline):
    """
    Asks the language model every question and records its replies.

    Args:
        df (pd.DataFrame): DataFrame with 'question' column.
        pipeline: Callable invoked with one question at a time; the first
            element of its result must provide a "generated_text" entry.

    Returns:
        pd.DataFrame: The DataFrame that was passed in, extended with a new
        column "llm_answer".

    Raises:
        ValueError: If the DataFrame has no 'question' column.
    """
    has_questions = "question" in df.columns
    if not has_questions:
        raise ValueError("DataFrame must contain a 'question' column.")

    total = len(df)
    collected = []
    # Questions are processed one by one so that progress can be reported
    for position, prompt in enumerate(df["question"], start=1):
        print(f"Processing question {position}/{total}...")
        reply = pipeline(prompt)[0]["generated_text"]
        print(f"Response: {reply}")
        collected.append(reply)

    # Store the replies next to the questions they belong to
    df["llm_answer"] = collected
    return df

In [None]:
def search_web(df, num_results):
    """
    Searches DuckDuckGo for every question and stores the first links found.

    Args:
        df (pd.DataFrame): DataFrame with 'question' column.
        num_results (int): Maximum number of links to keep per question.

    Returns:
        pd.DataFrame: The same DataFrame, updated with a new column
        "web_results" (one list of URLs per row).

    Raises:
        ValueError: If the DataFrame has no 'question' column.
    """
    if "question" not in df.columns:
        raise ValueError("DataFrame must contain a 'question' column.")

    # Loop invariants
    search_url = "https://duckduckgo.com/html/"
    headers = {"User-Agent": "Mozilla/5.0"}
    request_timeout = 30  # seconds; without a timeout a stalled request blocks forever
    pause_between_requests = 2  # seconds
    total = len(df)

    web_results = []
    for idx, question in enumerate(df["question"], start=1):
        print(f"Searching the web for question {idx}/{total}...")
        # Passing the question through ``params`` lets requests percent-encode
        # it, so characters such as '&', '#' or '+' stay part of the query
        # instead of truncating or altering it.
        response = requests.get(
            search_url,
            params={"q": str(question)},
            headers=headers,
            timeout=request_timeout,
        )

        # Keep the first num_results double-quoted strings that start with "http".
        # NOTE(review): this heuristic accepts every quoted http(s) string in the
        # page, not only search hits - consider parsing the result anchors with
        # BeautifulSoup and confirm against the actual response markup.
        links = [
            chunk for chunk in response.text.split('"') if chunk.startswith("http")
        ][:num_results]
        print(f"Found {len(links)} results.")
        web_results.append(links)

        # Pause between consecutive requests only; nothing follows the last one
        if idx < total:
            sleep(pause_between_requests)

    # Adding web results to the DataFrame
    df["web_results"] = web_results

    return df