In [8]:
import pandas as pd

def display_statistics_csv(input_file):
    """Print a field-coverage report for a multiple-choice question CSV.

    The file is read with ``pd.read_csv`` and the following counts are printed,
    one per line:

    * rows with a non-null ``Question``
    * rows where all four ``Option A``..``Option D`` cells are non-null
    * rows with a non-null ``Correct Answer``
    * rows with a non-null ``Explanation``
    * rows where every one of the columns above is non-null

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        None. The report is written to standard output.
    """
    table = pd.read_csv(input_file)

    option_columns = ['Option A', 'Option B', 'Option C', 'Option D']
    required_columns = ['Question'] + option_columns + ['Correct Answer', 'Explanation']

    def rows_fully_populated(columns):
        """Count the rows in which every one of ``columns`` is non-null."""
        return table[columns].notna().all(axis=1).sum()

    # Each entry pairs the output template with its count; the counts are
    # evaluated in the same order the report lists them.
    report = (
        ("Found {} questions", table['Question'].notna().sum()),
        ("Found {} complete sets of options", rows_fully_populated(option_columns)),
        ("Found {} correct answers", table['Correct Answer'].notna().sum()),
        ("Found {} explanations", table['Explanation'].notna().sum()),
        ("Valid entries count: {}", rows_fully_populated(required_columns)),
    )

    for template, count in report:
        print(template.format(count))

# Example usage: print the field-coverage report for one question CSV.
# NOTE(review): the path is absolute and looks like a Colab upload location
# ("/content/...") - confirm and point it at wherever the file actually lives.
input_file = '/content/qcm_output (1) - qcm_output (1).csv'  # Replace with the actual path
display_statistics_csv(input_file)


Found 430 questions
Found 427 complete sets of options
Found 421 correct answers
Found 165 explanations
Valid entries count: 159


In [13]:
import google.generativeai as genai
import csv
import time

def generate_answers(prompts, api_key):
    """Ask the Gemini model to answer a batch of prompts in a single request.

    All prompts are joined with blank lines into one request. The reply is
    split back into lines, and one non-empty line is expected per prompt.

    Args:
        prompts: Sequence of prompt strings to send together.
        api_key: API key handed to ``genai.configure``.

    Returns:
        A list with exactly ``len(prompts)`` entries, in prompt order. Each
        entry is a stripped, non-empty line of the model's reply, or ``None``
        when the reply contained no line for that position or the API call
        failed. An empty ``prompts`` yields ``[]`` without contacting the API.

    Raises:
        ValueError: If ``api_key`` is empty or ``None``.
    """
    if not api_key:
        raise ValueError("API key for Generative AI is not set.")

    # Nothing to ask: avoid sending an empty prompt to the API.
    if not prompts:
        return []

    # Configure the API
    genai.configure(api_key=api_key)

    # Configuration for the model
    generation_config = {
        "temperature": 0.7,
        "max_output_tokens": 50,  # Adjust this depending on the number of questions in the batch
    }

    # Join all prompts into one, separated by a blank line
    full_prompt = "\n\n".join(prompts)

    expected = len(prompts)

    # Best-effort call: any failure (network error, blocked response with no
    # candidates, missing text part, ...) is reported and mapped to None
    # answers so the caller can keep scoring the remaining batches.
    try:
        model = genai.GenerativeModel(model_name="gemini-1.0-pro", generation_config=generation_config)
        response = model.generate_content(full_prompt)
        raw_lines = response.candidates[0].content.parts[0].text.strip().splitlines()
    except Exception as e:
        print(f"Error during API call: {e}")
        return [None] * expected

    # Models often separate answers with blank lines; keeping those would
    # shift every later answer onto the wrong question.
    answers = [line.strip() for line in raw_lines if line.strip()]

    # Keep the same contract as the error path: one entry per prompt.
    answers = answers[:expected]
    answers.extend([None] * (expected - len(answers)))
    return answers

def _clean_field(row, key):
    """Return the stripped text of ``row[key]``, or "" when the cell is missing or empty."""
    return (row.get(key) or "").strip()


def _build_prompt(question, option_a, option_b, option_c, option_d):
    """Render the multiple-choice prompt sent to the model for one question."""
    return f"""You are a cybersecurity expert specializing in cyber threat intelligence. You are given a
multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to
choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.
Question: {question}
Options:
A) {option_a}
B) {option_b}
C) {option_c}
D) {option_d}
Important: The last line of your answer should contain only the single letter corresponding to the best option, with no additional text."""


def _score_batch(prompts, expected_answers, api_key):
    """Query the model for one batch, print each comparison, and return the number of correct answers."""
    generated_texts = list(generate_answers(prompts, api_key))
    # A short reply must not silently drop questions from the totals.
    generated_texts.extend([None] * (len(expected_answers) - len(generated_texts)))

    correct = 0
    for generated_text, expected in zip(generated_texts, expected_answers):
        is_correct = generated_text == expected
        if is_correct:
            correct += 1

        # Print the result
        print(f"Generated Answer: {generated_text}")
        print(f"Correct Answer: {expected}")
        print(f"{'Correct' if is_correct else 'Incorrect'}\n")
    return correct


# Main function to process the TSV file
def process_questions_file(filename, api_key, batch_size=2):
    """Score the model on every answered question of a tab-separated MCQ file.

    The file is read with ``csv.DictReader`` (tab delimiter, latin-1) using the
    columns ``Question``, ``Option A``..``Option D`` and ``Correct Answer``.
    Rows without a ``Correct Answer`` are skipped. Questions are sent to
    ``generate_answers`` in batches of ``batch_size``; a trailing partial batch
    is sent as well. Each comparison and the overall accuracy are printed.

    Args:
        filename: Path of the TSV file.
        api_key: API key forwarded to ``generate_answers``.
        batch_size: Number of questions per model request (at least 1).

    Returns:
        None. Results are written to standard output.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    correct_count = 0
    total_count = 0
    batch_prompts = []
    batch_answers = []

    # newline='' is what the csv module expects so it can handle line endings
    # (including ones embedded in quoted fields) itself.
    with open(filename, mode='r', encoding='latin-1', newline='') as file:
        tsv_reader = csv.DictReader(file, delimiter='\t')

        for row in tsv_reader:
            correct_answer = _clean_field(row, 'Correct Answer').upper()
            if not correct_answer:
                # Nothing to score against, so the question is not sent at all.
                continue

            batch_prompts.append(_build_prompt(
                _clean_field(row, 'Question'),
                _clean_field(row, 'Option A'),
                _clean_field(row, 'Option B'),
                _clean_field(row, 'Option C'),
                _clean_field(row, 'Option D'),
            ))
            batch_answers.append(correct_answer)

            # Flush on the number of queued questions, not on the row index,
            # so skipped rows never produce undersized or empty requests.
            if len(batch_prompts) >= batch_size:
                correct_count += _score_batch(batch_prompts, batch_answers, api_key)
                total_count += len(batch_answers)
                batch_prompts = []
                batch_answers = []

                # Wait for a short time before making the next request to avoid hitting rate limits
                time.sleep(4)

    # Questions left over when the row count is not a multiple of batch_size.
    if batch_prompts:
        correct_count += _score_batch(batch_prompts, batch_answers, api_key)
        total_count += len(batch_answers)

    # Print the overall accuracy
    print(f"Total Questions: {total_count}")
    print(f"Correct Answers: {correct_count}")
    if total_count:
        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
    else:
        print("Accuracy: N/A (no questions with a correct answer were found)")

# Example usage
if __name__ == "__main__":
    import os

    # Credentials must never be embedded in the notebook: the key is read from
    # the GOOGLE_API_KEY environment variable instead. Any key that was ever
    # hard-coded here should be treated as leaked and revoked.
    api_key = os.environ.get("GOOGLE_API_KEY", "")
    if not api_key:
        raise ValueError("Set the GOOGLE_API_KEY environment variable before running this cell.")

    process_questions_file("/content/qcm_output (1).tsv", api_key, batch_size=2)


Generated Answer: A
Correct Answer: B
Incorrect

Generated Answer: D
Correct Answer: A
Incorrect

Generated Answer: B
Correct Answer: D
Incorrect

Generated Answer: D
Correct Answer: A
Incorrect

Generated Answer: D
Correct Answer: D
Correct

Generated Answer: B
Correct Answer: B
Correct

Generated Answer: B
Correct Answer: A
Incorrect

Generated Answer: B
Correct Answer: B
Correct

Generated Answer: A
Correct Answer: A
Correct

Generated Answer: A
Correct Answer: A
Correct

Generated Answer: A
Correct Answer: A
Correct

Generated Answer: A
Correct Answer: A
Correct

Generated Answer: B
Correct Answer: B
Correct

Generated Answer: C
Correct Answer: C
Correct

Generated Answer: D
Correct Answer: D
Correct

Generated Answer: A
Correct Answer: A
Correct

Generated Answer: C
Correct Answer: A
Incorrect

Error during API call: list index (0) out of range
Generated Answer: None
Correct Answer: B
Incorrect

Generated Answer: None
Correct Answer: D
Incorrect

Generated Answer: A
Correct Answer