In [None]:
import pandas as pd
import spacy
import random
import string
from collections import Counter
import re
import requests
import json

In [None]:
!pip install --upgrade openai

Extract key terms

In [None]:
# Provide the API key without writing it into the notebook: reuse an existing
# OPENAI_API_KEY from the environment, otherwise prompt for it (input is not echoed).
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from openai import OpenAI

# Module-level client used by process_chunk_and_extract_terms below.
# It is constructed with no explicit arguments — presumably it reads
# OPENAI_API_KEY from the environment set in the previous cell (confirm
# against the openai package documentation).
client = OpenAI()

def read_input_file_by_chunks(file_path, chunk_size=20):
    """Lazily read a UTF-8 text file and yield it in groups of lines.

    Each line is stripped of surrounding whitespace. Every yielded string is
    'chunk_size' stripped lines joined by single spaces; if lines are left
    over at the end of the file, they are yielded as a final, shorter group.
    """
    pending = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            pending.append(raw_line.strip())
            if len(pending) != chunk_size:
                continue
            # Group is full: emit it and start collecting the next one.
            yield " ".join(pending)
            pending = []
        if pending:
            # Leftover lines that did not fill a whole group.
            yield " ".join(pending)

def process_chunk_and_extract_terms(chunk, subject_context):
    """Ask the chat model for the key terms and definitions found in 'chunk'.

    Args:
        chunk: Text to extract terms from; sent as the user message.
        subject_context: Name of the field (e.g. "chemistry"). It selects the
            few-shot examples, if any exist for that field, and is
            interpolated into the instructions.

    Returns:
        The content of the model's reply (requested as a JSON object).
        The same content is also printed.
    """
    # Few-shot examples to guide the model
    few_shot_examples = {
        "chemistry": {
            "MHz": "Megahertz, a unit of frequency equal to one million hertz.",
            "compounds": "a substance made up of two or more different chemical elements combined in a fixed ratio."
        },
        "physics": {
            "speed": "The rate at which someone or something is able to move or operate.",
            "Hz": "Hertz, the unit of frequency in the International System of Units, representing one cycle per second."
        },
        "computer science": {
            "memory-mapped I/O": "A method for performing I/O between the CPU and peripheral devices using memory addresses",
            "MD5": "Message Digest Algorithm 5, a widely used cryptographic hash function that produces a 128-bit hash value."
        },
        "history": {
            "Louis XIV": "King of France known for his long reign and his centralized control of the government and the economy",
            "Great Awakening": "a series of religious revivals in American colonies during the 18th and 19th centuries"
        },
        "economics": {
            "MP": "marginal product, the additional output produced by using one more unit of input",
            "marginal product of labor": "The change in output resulting from employing one more unit of labor."
        }
    }

    examples = few_shot_examples.get(subject_context, {})
    if examples:
        message_parts = ["As an expert glossary generator, here are some examples of key terms and their definitions:\n"]
        message_parts.extend(f'"{term}": "{definition}",\n' for term, definition in examples.items())
    else:
        # No examples exist for this subject, so do not announce any.
        message_parts = ["You are an expert glossary generator.\n"]

    # Every piece that mentions {subject_context} must be an f-string; otherwise
    # the placeholder is sent to the model as literal text.
    message_parts.append(
        f"Now, for {subject_context}, extract key terms from the provided text, ensuring each term has a unique and universally applicable definition within the field. "
        f"Terms should be extracted exactly as they appear, without alteration. Focus on significant concepts or units within the {subject_context} context, "
        "providing clear, informative, and comprehensive definitions. Avoid repetition of terms with varying definitions; each term should have one definitive explanation. "
        "Ignore overly simple or generic terms unless they are of specific relevance. Output should be in JSON format with terms as keys and their concise yet complete definitions as values."
    )
    system_message = "".join(message_parts)

    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        temperature=0.5,
        max_tokens=4096,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": chunk}
        ]
    )

    content = response.choices[0].message.content
    print(content)
    # Return the reply as well, so callers can collect or save it.
    return content


def extract_terms_and_definitions(input_file_path, subject_context):
    """Run term extraction over the input file, one chunk of lines at a time.

    Each chunk produced by read_input_file_by_chunks (with its default chunk
    size) is passed to process_chunk_and_extract_terms together with
    'subject_context'. Nothing is returned.
    """
    text_chunks = read_input_file_by_chunks(input_file_path)
    for text_chunk in text_chunks:
        process_chunk_and_extract_terms(text_chunk, subject_context)

# Example usage: run the extraction over every chunk of 'virology.txt'.
# NOTE: "virology" is not a key of few_shot_examples inside
# process_chunk_and_extract_terms, so its .get() lookup there returns an
# empty set of examples for this subject.
input_file_path = 'virology.txt'
subject_context = "virology"

extract_terms_and_definitions(input_file_path, subject_context)

Remove duplicate terms in JSON

In [None]:
def remove_duplicate_keys(json_file):
    """Rewrite a JSON file in place, keeping only the first value of each repeated key.

    json.load on its own silently keeps the LAST value of a repeated key, so
    duplicates have to be resolved while parsing. The hook below receives the
    raw (key, value) pairs of every JSON object (nested ones included) and
    keeps the first occurrence of each key.

    Args:
        json_file: Path of the JSON file to deduplicate; it is overwritten.
    """
    def _keep_first(pairs):
        unique = {}
        for key, value in pairs:
            # setdefault ignores later occurrences of a key that is already stored
            unique.setdefault(key, value)
        return unique

    # Read as UTF-8 explicitly: the file is written back with ensure_ascii=False
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file, object_pairs_hook=_keep_first)

    # Write the deduplicated data back to the same file
    with open(json_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

# Deduplicate the glossary file in place; point this at your own glossary JSON file.
remove_duplicate_keys('virology_glossary.json')

Replace glossary terms in the questions with dummy words

In [None]:
def load_glossary(file_path):
    """Load and return the glossary from a JSON file.

    The file is read as UTF-8 regardless of the platform's default encoding,
    so glossaries containing non-ASCII characters load the same everywhere.

    Args:
        file_path: Path of the glossary JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def random_word():
    """Return one placeholder word picked at random from a fixed pool."""
    candidates = (
        "Cat", "Fish", "Vex", "Point", "Bard", "Book", "Dummy", "Dog",
        "Balance", "Adam", "Winkle", "Winky", "Noise", "Zelly", "Luck",
        "Jump", "Love", "King", "Queen", "Jack", "Ball", "Duck",
    )
    chosen = random.choice(candidates)
    return chosen

def escape_regex_term(term):
    """Return 'term' with regex metacharacters escaped so it matches literally."""
    escaped_term = re.escape(term)
    return escaped_term

def replace_terms_in_question(question, glossary):
    """Replace glossary terms in a question with dummy words and prepend their definitions.

    Terms are tried longest first and matched case-insensitively as whole
    terms. Every occurrence of a matched term is replaced by one dummy word
    from random_word(), and each term gets a dummy word that no other term in
    the same question uses.

    Args:
        question: The question text. Non-string values (for example a missing
            CSV cell) are returned unchanged.
        glossary: Mapping of term -> definition.

    Returns:
        "Suppose '<dummy>' means '<definition>' ... <rewritten question>", or
        the question unchanged when no glossary term occurs in it.
    """
    if not isinstance(question, str):
        return question

    # Dummy word -> definition, in the order the terms were replaced
    replaced_terms = {}

    # Longest terms first, so a multi-word term wins over a shorter term inside it
    for term in sorted(glossary.keys(), key=len, reverse=True):
        if not term:
            # An empty key would match at every position
            continue

        # Lookarounds instead of \b: \b never matches next to a term that
        # starts or ends with punctuation (e.g. "C++").
        term_pattern = re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)', flags=re.IGNORECASE)
        if term_pattern.search(question) is None:
            continue

        # Pick a dummy word that is not already standing in for another term;
        # reusing one would overwrite its definition and make the text ambiguous.
        dummy_word = random_word()
        attempts = 0
        while dummy_word in replaced_terms and attempts < 100:
            dummy_word = random_word()
            attempts += 1
        if dummy_word in replaced_terms:
            # No unused word turned up: disambiguate with a numeric suffix.
            base_word, suffix = dummy_word, 2
            while f"{base_word}{suffix}" in replaced_terms:
                suffix += 1
            dummy_word = f"{base_word}{suffix}"

        # A callable replacement inserts the word verbatim (no backslash processing)
        question = term_pattern.sub(lambda _match, word=dummy_word: word, question)
        replaced_terms[dummy_word] = glossary[term]

    if not replaced_terms:
        # Nothing replaced: return the question as is, without a leading space
        return question

    # Construct the definition introduction sentence
    definition_intro = ' '.join([
        "Suppose '{}' refers to {}".format(dummy, definition.strip('"'))
        if definition.startswith('"')
        else "Suppose '{}' means '{}'".format(dummy, definition)
        for dummy, definition in replaced_terms.items()
    ])

    # Return the modified question with the definitions introduced at the beginning
    return definition_intro + ' ' + question

def process_csv_questions(file_path, output_file_path, glossary_file_path):
    """Rewrite the first column of a CSV with replace_terms_in_question and save a copy.

    The glossary is loaded from 'glossary_file_path', the CSV from
    'file_path'; the result is written to 'output_file_path' without the index.
    """
    term_glossary = load_glossary(glossary_file_path)
    df = pd.read_csv(file_path)

    # Questions live in the first column; every other column is left untouched.
    revised_questions = df.iloc[:, 0].apply(replace_terms_in_question, args=(term_glossary,))
    df.iloc[:, 0] = revised_questions

    df.to_csv(output_file_path, index=False)

# Example usage: rewrite the questions of 'college_biology_test.csv' using the
# biology glossary and save the result to a separate CSV file.
# NOTE: input_file_path is rebound here; the extraction cell earlier in the
# notebook assigned it 'virology.txt'.
input_file_path = 'college_biology_test.csv'
output_file_path = 'question_only_college_biology_test_easy.csv'
glossary_file_path = 'biology_glossary.json'
process_csv_questions(input_file_path, output_file_path, glossary_file_path)


Replace glossary terms in the answers with dummy words

In [None]:
import json
import random
import re
import pandas as pd

def load_glossary(file_path):
    """Load and return the glossary from a JSON file.

    The file is read as UTF-8 regardless of the platform's default encoding,
    so glossaries containing non-ASCII characters load the same everywhere.

    Args:
        file_path: Path of the glossary JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def random_word():
    """Pick a placeholder word at random from a fixed pool."""
    pool = (
        "Cat", "Fish", "Vex", "Point", "Bard", "Book", "Dummy", "Dog",
        "Balance", "Adam", "Winkle", "Winky", "Noise", "Zelly", "Luck",
        "Jump", "Love", "King", "Queen", "Jack", "Ball", "Duck",
    )
    picked = random.choice(pool)
    return picked

def escape_regex_term(term):
    """Escape the regex metacharacters in 'term' so it can be matched literally."""
    literal_pattern = re.escape(term)
    return literal_pattern

def replace_terms_in_text(text, glossary):
    """Replace glossary terms in an answer text with dummy words and prepend their definitions.

    Terms are tried longest first and matched case-insensitively as whole
    terms. Every occurrence of a matched term is replaced by one dummy word
    from random_word(), and each term gets a dummy word that no other term in
    the same text uses.

    Args:
        text: The answer cell; it is converted with str() first, so numeric
            cells are handled as text.
        glossary: Mapping of term -> definition.

    Returns:
        "Suppose '<dummy>' means '<definition>' ... <rewritten text>", or the
        text (as a string) unchanged when no glossary term occurs in it.
    """
    # Convert text to string so non-string cells can be searched
    text = str(text)

    # Dummy word -> definition, in the order the terms were replaced
    replaced_terms = {}

    # Longest terms first, so a multi-word term wins over a shorter term inside it
    for term in sorted(glossary.keys(), key=len, reverse=True):
        if not term:
            # An empty key would match at every position
            continue

        # Lookarounds instead of \b: \b never matches next to a term that
        # starts or ends with punctuation (e.g. "C++").
        term_pattern = re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)', flags=re.IGNORECASE)
        if term_pattern.search(text) is None:
            continue

        # Pick a dummy word that is not already standing in for another term;
        # reusing one would overwrite its definition and make the text ambiguous.
        dummy_word = random_word()
        attempts = 0
        while dummy_word in replaced_terms and attempts < 100:
            dummy_word = random_word()
            attempts += 1
        if dummy_word in replaced_terms:
            # No unused word turned up: disambiguate with a numeric suffix.
            base_word, suffix = dummy_word, 2
            while f"{base_word}{suffix}" in replaced_terms:
                suffix += 1
            dummy_word = f"{base_word}{suffix}"

        # A callable replacement inserts the word verbatim (no backslash processing)
        text = term_pattern.sub(lambda _match, word=dummy_word: word, text)
        replaced_terms[dummy_word] = glossary[term]

    if not replaced_terms:
        # Nothing replaced: return the text as is, without a leading space
        return text

    definition_intro = ' '.join([
        "Suppose '{}' refers to {}".format(dummy, definition.strip('"'))
        if definition.startswith('"')
        else "Suppose '{}' means '{}'".format(dummy, definition)
        for dummy, definition in replaced_terms.items()
    ])

    return definition_intro + ' ' + text

def process_csv_questions(file_path, output_file_path, glossary_file_path, answer_columns=range(1, 5)):
    """Rewrite the answer columns of a CSV with replace_terms_in_text and save a copy.

    Args:
        file_path: CSV file to read.
        output_file_path: Where the rewritten CSV is written (without the index).
        glossary_file_path: Glossary JSON file, loaded with load_glossary.
        answer_columns: Zero-based positions of the columns to rewrite.
            Defaults to positions 1-4, i.e. the second to fifth columns.

    Raises:
        IndexError: If a position in 'answer_columns' is outside the CSV's columns.
    """
    glossary = load_glossary(glossary_file_path)
    df = pd.read_csv(file_path)

    for column in answer_columns:
        column_label = df.columns[column]
        rewritten = df.iloc[:, column].apply(lambda text: replace_terms_in_text(text, glossary))
        # Replace the column rather than writing into it with .iloc: the
        # rewritten values are strings, and an in-place write into a numeric
        # answer column is an incompatible-dtype assignment.
        df[column_label] = rewritten

    df.to_csv(output_file_path, index=False)

# Example usage: rewrite the answer columns of 'college_medicine_test.csv'
# using the medicine glossary and save the result to a separate CSV file.
# NOTE: this rebinds input_file_path, output_file_path and glossary_file_path,
# which the question-replacement cell also assigns; the call below uses the
# process_csv_questions defined in this cell.
input_file_path = 'college_medicine_test.csv'
output_file_path = 'answer_only_college_medicine_test.csv'
glossary_file_path = 'medicine_glossary.json'
process_csv_questions(input_file_path, output_file_path, glossary_file_path)
