In [1]:
import os
import csv
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

In [2]:
# Root of the capstone working tree. Defaults to the original location but can
# be redirected by setting the CAPSTONE_DIR environment variable, so the
# notebook is not tied to one machine's home directory.
capstone_dir = os.environ.get('CAPSTONE_DIR', '/home/ubuntu/Desktop/capstone')

# The trailing '' component keeps the trailing path separator the original
# hard-coded strings had.
input_csv_folder = os.path.join(capstone_dir, 'answers_combined', '')    # CSVs of generated questions/answers to grade
chunked_documents_dir = os.path.join(capstone_dir, 'chunked_data', '')   # context documents, one sub-folder per chunk size
output_folder = os.path.join(capstone_dir, 'graded_answers', '')         # destination for the scored CSVs

In [3]:
# Create the output directory up front; exist_ok=True makes re-running this
# cell harmless when the directory already exists.
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Checkpoint used as the grader.
MODEL_NAME = 'google/gemma-2b-it'

# SECURITY: never commit an access token to the notebook. The token is read
# from the HF_TOKEN environment variable instead; when it is unset (None),
# transformers falls back to the credentials cached by `huggingface-cli login`.
# Any token that was previously embedded in this cell should be revoked.
hf_token = os.environ.get('HF_TOKEN')

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    device_map='auto',            # let accelerate decide where the weights live
    torch_dtype=torch.bfloat16,
)



`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def generate_response(context, evaluation_text, max_new_tokens=100):
    """Ask the model to grade ``evaluation_text`` against ``context`` on a 1-5 scale.

    NOTE: a later cell in this notebook defines ``generate_response`` again
    with a ``(context, question, answer)`` signature; once that cell has run,
    this version is shadowed.

    Args:
        context: Reference text the evaluation text should be judged against.
        evaluation_text: The text to grade.
        max_new_tokens: Upper bound on the number of tokens the model may
            generate after the prompt.

    Returns:
        str: The text the model generated after the prompt, stripped of
        surrounding whitespace. It is not validated to be a number.
    """
    # Build ONE prompt string. (Previously a trailing ", context,
    # evaluation_text" turned this into a 3-tuple, which the tokenizer treated
    # as a batch of three separate inputs, and "\E" left a stray backslash in
    # the prompt.)
    prompt = (
        "Read the context and grade the Evaluation Text. "
        "Only respond with a number 1 through 5. "
        "1 is terrible (illegible and doesn't answer the question) while "
        "5 is perfect in grammar and answers the question:"
        f"\n\nContext:{context}\n\nEvaluation Text:{evaluation_text}\n\nResponse:"
    )

    # Single sequence, so no padding is needed; truncation is left off because
    # the tokenizer has no max length configured (the flag was a no-op that
    # only produced a warning). Tensors follow the model's device rather than
    # a hard-coded 'cuda'.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    # Sampling is enabled, so the grade can differ between runs.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_p=0.95,
            top_k=25,
            temperature=1,
            do_sample=True,
        )

    # Decode only the newly generated tokens. Searching the full decoded text
    # for "Response:" breaks when the context itself contains that marker.
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [6]:
def read_csv(file_path):
    """Load the CSV at ``file_path`` into a pandas DataFrame and return it."""
    frame = pd.read_csv(file_path)
    return frame

In [7]:
def generate_response(context, question, answer, max_new_tokens=100):
    """Ask the model to grade ``answer`` to ``question`` against ``context`` (1-5).

    Args:
        context: Reference text the answer is supposed to be based on.
        question: The question being answered.
        answer: The answer to grade (may be an empty string).
        max_new_tokens: Upper bound on the number of tokens the model may
            generate after the prompt.

    Returns:
        str: The text the model generated after the prompt, stripped of
        surrounding whitespace. It is not validated to be a number.
    """
    # Prompt rebuilt with implicit string concatenation: the old version had a
    # stray leading ':', embedded the source indentation via line
    # continuations, and used "\n\Question" (invalid escape, missing colon).
    prompt = (
        "You are grading the answer provided. "
        "Only respond with a number 1 through 5. "
        "1 is poor quality based on the context or doesn't answer the question "
        "while 5 is a great answer. Answers are based on the Context"
        f"\n\nContext:{context}\n\nQuestion:{question}\nAnswer:{answer}\n\nResponse:"
    )

    # Single sequence, so no padding is needed; truncation is left off because
    # the tokenizer has no max length configured (the flag was a no-op that
    # only produced a warning). Tensors follow the model's device rather than
    # a hard-coded 'cuda'.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    # Sampling is enabled, so the grade can differ between runs.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_p=0.95,
            top_k=25,
            temperature=1,
            do_sample=True,
        )

    # Decode only the newly generated tokens. Searching the full decoded text
    # for "Response:" returns the wrong span when the context, question or
    # answer already contains that marker.
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [8]:
def save_results(row, output_path, header=False):
    """Append ``row`` (a DataFrame) to the CSV at ``output_path``.

    The column header line is written only when ``header`` is truthy; the
    index is never written.
    """
    with open(output_path, mode='a', encoding='utf-8', newline='') as csv_file:
        include_header = bool(header)
        row.to_csv(csv_file, index=False, header=include_header)

In [9]:
def _load_context_text(size_dir, context_filename):
    """Return the text of one chunked document, or "" when it cannot be read."""
    # A blank 'Filename' cell is parsed by pandas as NaN (a float), which
    # os.path.join would reject.
    if not isinstance(context_filename, str):
        return ""
    context_path = os.path.join(size_dir, context_filename)
    if not os.path.isfile(context_path):
        return ""
    with open(context_path, 'r', encoding='utf-8') as file:
        return file.read()


def _cell_text(value):
    """Return a CSV cell as prompt text, mapping missing values (NaN) to ""."""
    return "" if pd.isna(value) else str(value)


def process_csv_files(input_csv_folder, chunked_documents_dir, output_folder):
    """Grade every CSV in ``input_csv_folder`` and write ``scored_<name>`` copies.

    For each row the matching context document is looked up under
    ``<chunked_documents_dir>/<Chunk Size>/<Filename>``, the generated question
    is scored, and every column whose name ends in ``Docs`` is scored as an
    answer. Rows are appended to the output file one at a time so partial
    progress survives an interruption.

    Args:
        input_csv_folder: Directory containing the ``.csv`` files to grade.
            Each file is expected to have ``Chunk Size``, ``Filename`` and
            ``Generated Question`` columns.
        chunked_documents_dir: Directory holding one sub-folder per chunk size.
        output_folder: Directory the ``scored_*.csv`` files are written to.
            An existing output file of the same name is replaced.
    """
    # sorted() makes the processing order reproducible across runs.
    for filename in sorted(os.listdir(input_csv_folder)):
        if not filename.endswith('.csv'):
            continue

        input_csv_path = os.path.join(input_csv_folder, filename)
        output_csv_path = os.path.join(output_folder, f'scored_{filename}')

        # Fix the output column order ONCE and use it for both the header and
        # every data row. Previously the header listed the '<col> Score'
        # columns before 'Question Score' while the rows were written in the
        # opposite order, so scores ended up under the wrong header.
        source_columns = list(pd.read_csv(input_csv_path, nrows=0).columns)
        answer_columns = [col for col in source_columns if col.endswith('Docs')]
        output_columns = (
            source_columns
            + [f'{col} Score' for col in answer_columns]
            + ['Question Score']
        )

        # save_results appends, so start from a clean file; otherwise a re-run
        # would add a second header and duplicate rows to the previous output.
        if os.path.exists(output_csv_path):
            os.remove(output_csv_path)
        save_results(pd.DataFrame(columns=output_columns), output_csv_path, header=True)

        # The chunk size comes from the 'Chunk Size' column of the first data
        # row and selects which sub-folder holds the context documents.
        first_row = pd.read_csv(input_csv_path, nrows=1)
        if first_row.empty:
            print(f"Skipped (no data rows): {input_csv_path}")
            continue
        chunk_size = int(first_row['Chunk Size'].iloc[0])
        size_dir = os.path.join(chunked_documents_dir, str(chunk_size))

        # Stream the file one row at a time to keep memory use flat.
        for df_row in pd.read_csv(input_csv_path, chunksize=1):
            context_text = _load_context_text(size_dir, df_row['Filename'].values[0])
            question = _cell_text(df_row['Generated Question'].values[0])

            # The question itself is graded with an empty answer.
            df_row['Question Score'] = generate_response(
                context_text, question, "", max_new_tokens=50
            )
            for col in answer_columns:
                # Missing answers become "" instead of the literal text "nan".
                answer = _cell_text(df_row[col].values[0])
                df_row[f'{col} Score'] = generate_response(
                    context_text, question, answer, max_new_tokens=50
                )

            # Reorder to the header's column order before appending.
            save_results(df_row.reindex(columns=output_columns), output_csv_path)
            torch.cuda.empty_cache()  # release cached GPU memory between rows

        print(f"Processed and saved: {output_csv_path}")

In [10]:
def read_csv(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame.

    This repeats the ``read_csv`` helper defined in an earlier cell of this
    notebook; the two definitions behave the same.
    """
    loaded = pd.read_csv(file_path)
    return loaded

In [None]:
process_csv_files(input_csv_folder, chunked_documents_dir, output_folder)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
