In [None]:
# necessary imports
import argparse
import time
import warnings

import gspread # chatGPT sugestion
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials # chatGPT sugestion

In [None]:
# Install a catch-all filter so that no warning of any category is reported.
warnings.simplefilter("ignore")

In [None]:
# definitions

def main():
    """Run the benchmark: load data, generate answers, store the results.

    Reads the run configuration via ``load_parameters``, downloads the
    dataset from Google Sheets, generates answers for the first
    ``samples_number`` rows with the selected model, and writes the result
    both back to the worksheet and to a local CSV file.
    """
    model_name, output_file, samples_number, batch_size, prompt_ver, sheet_url, sheet_name = load_parameters()

    # Download the dataset from Google Sheets.
    df = load_dataset(sheet_url, sheet_name)

    # Without an explicit limit, process every row.
    if samples_number is None:
        samples_number = df.shape[0]

    # Safety net only: load_parameters already supplies a default file name.
    if output_file is None:
        output_file = "answers_prompt.csv"

    # Generation settings forwarded to calculate_for_model.
    llm_params = {
        "max_new_tokens": 1024,
        "do_sample": False,
    }

    # Take an independent copy of the subset: calculate_for_model adds a
    # column to the frame it receives, and writing into a slice of `df` is
    # ambiguous chained assignment in pandas.
    df_subset = df.iloc[:samples_number].copy()
    df_with_answers = calculate_for_model(model_name, df_subset, llm_params, prompt_ver, batch_size)

    # Save the results to Google Sheets and to a local file.
    # NOTE(review): save_to_google_sheets clears the worksheet and writes only
    # df_with_answers, so rows beyond `samples_number` are removed from the
    # sheet -- confirm this is intended.
    save_to_google_sheets(df_with_answers, sheet_url, sheet_name)
    df_with_answers.to_csv(output_file, index=False)

def load_parameters(argv=None):
    """Parse the command-line options of the answer-generation script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            original behaviour; passing a list makes the function usable
            from a notebook or a test.

    Returns:
        tuple: ``(model_name, output_file, samples_number, batch_size,
        prompt_ver, sheet_url, sheet_name)``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing,
            a value has the wrong type, or ``--batch_size`` is below 1.
    """
    parser = argparse.ArgumentParser(description="Script that generates answers for the benchmark.")
    parser.add_argument('--model_name', type=str, required=True, help='Name of the model to calculate the answers')
    parser.add_argument('--samples_number', type=int, default=None, help='Number of samples to generate answers')
    parser.add_argument('--output_file', type=str, default="generated_answers.csv", help='Name of output file to save generated answers')
    parser.add_argument('--batch_size', type=int, default=1, help="Batch size")
    parser.add_argument('--prompt_ver', type=int, default=1, help="Versions of generation prompt")
    parser.add_argument('--sheet_url', type=str, required=True, help="URL to the Google Sheets document")
    parser.add_argument('--sheet_name', type=str, required=True, help="Name of the sheet within the Google Sheets document")
    args = parser.parse_args(argv)

    # The batch size is used as a range() step later on: 0 would crash there
    # and a negative value would silently produce no answers at all.
    if args.batch_size < 1:
        parser.error("--batch_size must be a positive integer")

    return (
        args.model_name,
        args.output_file,
        args.samples_number,
        args.batch_size,
        args.prompt_ver,
        args.sheet_url,
        args.sheet_name,
    )

def load_dataset(sheet_url, sheet_name):
    """Fetch one worksheet from Google Sheets and return it as a DataFrame.

    Args:
        sheet_url: URL of the Google Sheets document.
        sheet_name: Name of the worksheet inside that document.

    Returns:
        pd.DataFrame: Built from the records returned by the worksheet's
        ``get_all_records()``.
    """
    # Authorise with the service-account key stored in credentials.json.
    api_scopes = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/drive",
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", api_scopes)
    gs_client = gspread.authorize(creds)

    # Locate the worksheet and pull all of its records.
    spreadsheet = gs_client.open_by_url(sheet_url)
    worksheet = spreadsheet.worksheet(sheet_name)
    records = worksheet.get_all_records()

    return pd.DataFrame(records)

def calculate_for_model(model_name, df, llm_params, prompt_ver=1, batch_size=1):
    """Generate answers with a single LLM for every row of the dataset.

    Args:
        model_name (str): Name of the model to use. The part after the last
            "/" is used to name the answer column.
        df (pd.DataFrame): Input data with an 'Opinia' column holding the
            texts to process. The answer column is added to this frame
            in place.
        llm_params (dict): Generation parameters passed on to
            ``generate_answers_batch``.
        prompt_ver (int): Prompt version (default 1).
        batch_size (int): Number of rows processed in one batch; must be
            at least 1.

    Returns:
        pd.DataFrame: ``df`` with an added ``answer_<model>`` column
        containing the generated answers.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``df`` has no 'Opinia' column.
    """
    # Validate the inputs before the model is loaded, so that a bad call
    # fails immediately.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if 'Opinia' not in df.columns:
        raise KeyError('Opinia')

    print(f"Przetwarzanie modelu: {model_name}")

    # Initialise the tokenizer, the model and the generation pipeline.
    # NOTE(review): AutoTokenizer, load_model, load_pipe and
    # generate_answers_batch are not defined or imported in the visible part
    # of this file -- confirm they are in scope at run time.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_model(model_name)
    pipe = load_pipe(model, tokenizer)

    # Name of the resulting answer column.
    answer_column = f'answer_{model_name.split("/")[-1]}'

    # Measure processing time with a monotonic clock.
    start_time = time.perf_counter()

    opinions = df['Opinia']
    row_count = len(df)
    total_batches = (row_count - 1) // batch_size + 1

    # Generate the answers batch by batch.
    answers = []
    for batch_no, start in enumerate(range(0, row_count, batch_size), start=1):
        # Collect the texts that belong to this batch.
        batch = opinions.iloc[start:start + batch_size].tolist()

        # Generate the answers for this batch.
        batch_answers = generate_answers_batch(batch, tokenizer, pipe, llm_params, prompt_ver=prompt_ver)
        answers.extend(batch_answers)

        # Progress report.
        print(f"Przetworzono batch {batch_no}/{total_batches}")

    # Add the answer column to the DataFrame, aligned on the row index.
    df[answer_column] = pd.Series(answers, index=df.index[:len(answers)])

    end_time = time.perf_counter()
    print(f"Czas przetwarzania dla modelu {model_name}: {end_time - start_time:.2f} sekund")

    return df

def save_to_google_sheets(df, sheet_url, sheet_name):
    """Replace the contents of a worksheet with the given DataFrame.

    The worksheet is cleared and then filled with one header row (the
    column names) followed by one row per DataFrame row.

    Args:
        df (pd.DataFrame): Data to write.
        sheet_url: URL of the Google Sheets document.
        sheet_name: Name of the worksheet inside that document.
    """
    # Build the complete payload before touching the sheet, so the worksheet
    # is not cleared unless there is something ready to write back.
    # Missing values (NaN/None) are written as empty cells: NaN is not valid
    # JSON and would make the upload fail after the sheet was already cleared.
    header = df.columns.values.tolist()
    body_rows = df.astype(object).where(df.notna(), "").values.tolist()
    payload = [header] + body_rows

    # Authentication with the Google Sheets API.
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
    client = gspread.authorize(credentials)

    # Open the target worksheet.
    sheet = client.open_by_url(sheet_url).worksheet(sheet_name)

    # Replace the contents of the sheet with the new data.
    sheet.clear()
    sheet.update(payload)

In [None]:
# Run the script only when executed directly. The module name to compare
# against is "__main__" with double underscores; the previous '_main_'
# never matched, so main() was never called.
if __name__ == "__main__":
    main()