In [None]:
import os
import re

import openai
from docx import Document
from google.colab import files

# Take the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret does not have to be stored in the source. The placeholder is only a
# fallback for setting the key by hand.
openai.api_key = os.environ.get("OPENAI_API_KEY", "enter your key here")  # Or replace the placeholder with your actual key

def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file as a single string.

    Only the content exposed through ``Document(...).paragraphs`` is read.

    Args:
        file_path: Location of the .docx file, passed straight to ``Document``.

    Returns:
        str: The text of each paragraph, joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_case_numbers(text):
    """Find case-number-like references in ``text``.

    A match is a run of uppercase letters, an optional whitespace character,
    1-4 digits, an optional whitespace/slash/whitespace separator, and another
    1-4 digits, delimited by word boundaries (for example ``HCA 1234/2020``).

    Args:
        text (str): The text to scan.

    Returns:
        list[str]: Every non-overlapping match in order of appearance;
        repeated references are kept.
    """
    matcher = re.compile(r'\b([A-Z]+\s?\d{1,4}\s?\/?\s?\d{1,4})\b')
    return [hit.group(1) for hit in matcher.finditer(text)]

def generate_qa_pairs(text, instructions, model="gpt-4", max_tokens=2000):
    """
    Generates Q&A pairs using OpenAI's chat completion API, with concise and relevant answers for Hong Kong law.

    Works with both generations of the ``openai`` package: the 1.x interface
    (``openai.chat.completions.create``) is used when it is installed,
    otherwise the legacy ``openai.ChatCompletion.create`` call is made.

    Args:
        text (str): The input text (e.g., from a document).
        instructions (str): Instructions for GPT to generate Q&A pairs.
        model (str): The GPT model to use (default: gpt-4).
        max_tokens (int): Maximum tokens for the response.

    Returns:
        str: Generated Q&A pairs (empty string if the response has no content).
    """
    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a legal expert in Hong Kong law. Generate concise Q&A pairs with references to case numbers and legal principles."},
            {"role": "user", "content": f"{instructions}\n\nText:\n{text}"},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.7,
    }

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion was removed and responses are objects
        # read through attributes. The module-level client still honours
        # the module-level ``openai.api_key`` setting.
        response = openai.chat.completions.create(**request)
        content = response.choices[0].message.content
    else:
        # Legacy openai<1.0 interface with dict-style responses.
        response = openai.ChatCompletion.create(**request)
        content = response['choices'][0]['message']['content']

    # Callers concatenate the result, so never hand back None.
    return content or ""

def split_text_into_chunks(text, max_tokens=2000):
    """Split text into chunks of at most ``max_tokens`` words each.

    Token counts are approximated by the number of whitespace-separated words.
    Paragraphs (lines separated by a newline) are kept together where
    possible. A single paragraph longer than the limit is cut into word
    slices, with its words re-joined by single spaces.

    Args:
        text (str): The text to split.
        max_tokens (int): Maximum number of words per chunk; values below 1
            are treated as 1.

    Returns:
        list[str]: The chunks in document order. Chunks never start or end
        with a newline, and empty or whitespace-only chunks are not emitted,
        so empty input yields an empty list.
    """
    limit = max(1, max_tokens)
    chunks = []
    pending = []       # paragraphs collected for the chunk being built
    pending_words = 0  # running word count of `pending` (avoids re-splitting)

    for paragraph in text.split("\n"):
        words = paragraph.split()

        if len(words) > limit:
            # The paragraph cannot fit in any chunk: close the current chunk
            # and emit the paragraph as fixed-size word slices.
            _append_chunk(chunks, pending)
            pending, pending_words = [], 0
            chunks.extend(
                " ".join(words[start:start + limit])
                for start in range(0, len(words), limit)
            )
            continue

        if pending_words + len(words) > limit:
            _append_chunk(chunks, pending)
            pending, pending_words = [], 0

        pending.append(paragraph)
        pending_words += len(words)

    _append_chunk(chunks, pending)
    return chunks


def _append_chunk(chunks, paragraphs):
    """Join buffered paragraphs into one chunk and append it if it has text."""
    chunk = "\n".join(paragraphs).strip("\n")
    if chunk.strip():
        chunks.append(chunk)

def process_files(files_list):
    """Generate Hong Kong law Q&A pairs for every .docx file in ``files_list``.

    For each file the text is extracted, any case numbers found are printed,
    the text is split into chunks, and each chunk is sent to
    ``generate_qa_pairs`` with the same fixed instructions.

    Args:
        files_list: Iterable of .docx file paths.

    Returns:
        str: All generated Q&A text in file order and then chunk order, each
        piece followed by two newlines. Empty string when no files are given.
    """
    # The prompt is the same for every file and chunk, so build it once.
    instructions = (
        "You are an expert in Hong Kong law. Generate concise Q&A pairs based on the legal content of the text provided. "
        "Keep answers short, focusing on the most relevant legal principles, case numbers, and Hong Kong-specific laws. "
        "For each question, provide a concise answer with references to relevant cases, principles, or laws. "
        "Use the case number if available. If no case number is mentioned, refer to the applicable legal principle. "
        "Please format the output as follows:\n\n"
        "QA Pair 1\nQuestion: [Your question here, referencing applicable laws or case details]\nAnswer: [Your short, concise answer here, including case details or relevant laws]\n\n"
        "QA Pair 2\nQuestion: [Your question here]\nAnswer: [Your short, concise answer here]\n\n"
    )

    collected = []
    for file_path in files_list:
        print(f"Processing file: {file_path}")

        document_text = extract_text_from_docx(file_path)

        # Case numbers are only reported on the console here.
        found_cases = extract_case_numbers(document_text)
        if not found_cases:
            print("No case numbers found in the document.")
        else:
            print(f"Extracted case numbers: {', '.join(found_cases)}")

        # One model request per chunk.
        chunks = split_text_into_chunks(document_text, max_tokens=2000)
        total = len(chunks)
        for position, chunk in enumerate(chunks, start=1):
            print(f"Generating Q&A pairs for chunk {position} of {total} in file: {file_path}...")
            collected.append(generate_qa_pairs(chunk, instructions) + "\n\n")

    return "".join(collected)

def main():
    """Upload files in Colab, generate Q&A pairs from them, and download the result.

    The combined output is written to ``/content/qa_pairs.txt`` as UTF-8
    before the download is offered.
    """
    # The keys of the upload result are used as the file paths to process.
    uploaded = files.upload()
    files_list = list(uploaded)

    print("Processing files...")
    all_qa_pairs = process_files(files_list)

    # Persist the combined output, then hand it to the user.
    output_file = "/content/qa_pairs.txt"
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(all_qa_pairs)
    print(f"Q&A pairs have been saved to {output_file}")

    files.download(output_file)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
