In [None]:
pip install openai==0.28.1

### Imports

In [None]:
import language_tool_python
import pandas as pd
import openai
import yaml
from bs4 import BeautifulSoup
import html
import json
import re

### Reading data

In [None]:
# Location of the parquet file holding the essays ("your_path.parquet" is a placeholder).
data_path = r"your_path.parquet"
# The frame is read later through its 'Answer_Selected', 'Candidate_ID' and 'Q_ID' columns.
data = pd.read_parquet(data_path)
# LanguageTool client for US English; in the cells visible here it is only closed, never queried.
tool = language_tool_python.LanguageTool('en-US')



### OpenAI credentials

In [None]:
def read_passwords_from_yaml(file_path_):
    """Load a YAML file and return its parsed content.

    Args:
        file_path_: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the content of the file.
    """
    with open(file_path_, 'r') as credentials_file:
        return yaml.safe_load(credentials_file)

# Path of the YAML credentials file (empty here; it has to be filled in before running).
file_path = ''
passwords = read_passwords_from_yaml(file_path)

# Copy every client setting from the loaded credentials onto the openai module.
for openai_attribute, credentials_key in (
    ('api_type', 'openai.api_type'),
    ('api_version', 'openai.api_version'),
    ('api_base', 'openai.api_base'),
    ('api_key', 'openai.api_key'),
):
    setattr(openai, openai_attribute, passwords[credentials_key])

### Removing html tags

In [None]:
def remove_html_tags(input_string):
    """Strip HTML markup from a string and normalise non-breaking spaces.

    Args:
        input_string: Text that may contain HTML tags and character entities.

    Returns:
        The tag-free text with character entities decoded and every
        non-breaking space (U+00A0) replaced by a regular space.
    """
    soup = BeautifulSoup(input_string, "html.parser")  # drops <p> and other tags
    text_without_tags = soup.get_text()
    # Decode entities that are still present in the extracted text.
    decoded_text = html.unescape(text_without_tags)
    # Replace non-breaking spaces last: html.unescape turns "&nbsp;" into U+00A0,
    # so replacing before decoding would leave those characters in the result.
    return decoded_text.replace('\xa0', ' ')

### GPT's Turn

In [None]:
# Read the report template once into `html_template`.
# NOTE(review): generate_html_from_response opens "/template.html" again on its own and
# nothing visible in this notebook reads `html_template` — confirm whether it is still needed.
with open("/template.html", "r") as file:
    html_template = file.read()


def get_gpt_correction(original_text):
    """Ask the chat model to proofread a text and report the mistakes as JSON.

    Args:
        original_text: The plain text to proofread; sent as the user message.

    Returns:
        tuple: ``(response_content, gpt_error_count)`` where ``response_content``
        is the raw message text returned by the model and ``gpt_error_count`` is
        the number of times the key name ``original_text`` occurs in that text.
    """
    # Adjacent string literals are joined without any separator, so every sentence
    # fragment carries its own trailing space. The template below is kept as
    # well-formed JSON (closed strings, commas between fields, no nested double
    # quotes) so the model has a valid shape to imitate.
    prompt = (
        "Please proofread thoroughly the given text, identifying and correcting any grammatical, punctuation or spelling "
        "errors, as well as improving sentence structure and clarity where necessary. Ensure that the document is polished and adheres "
        "to the highest standards of written English. DO NOT suggest word improvements. Try to classify the errors into the three main categories: Grammar, "
        "Spelling and Punctuation. "
        "Return ONLY a JSON object using double quotes based on the template provided. "
        "Below is the template with descriptions for each field:"
        "\n"
        "{\n"
        '   "Number of mistakes": "Total number of mistakes found in the text",\n'
        '   "mistakes": [\n'
        '       {\n'
        '           "original_text": "Text that will be replaced due to the mistake",\n'
        '           "new_text": "Corrected version of the mistake",\n'
        '           "explanation": "Reason why this is considered a mistake",\n'
        '           "mistake_type": "Type of the error (e.g. Grammar, Punctuation, Spelling)",\n'
        '           "nth occurrence": "n, where n represents which word is wrong out of all the common words. For example, in the sentence '
        "'Thank you for you time, Thank you.' the word 'you' appears three times, and the word which is wrong is the second 'you' "
        "that needs to be replaced with 'your'. So n=2\"\n"
        '       }\n'
        '   ]\n'
        "}\n"
    )

    response = openai.ChatCompletion.create(
        engine="gpt-4-32k",  # The deployment name you chose when you deployed the ChatGPT or GPT-4 model.
        # "text-davinci-003" "gpt-35-turbo" "gpt-4-32k"
        temperature=0,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": original_text},
        ],
    )

    response_content = response['choices'][0]['message']['content']
    # Count occurrences of the key 'original_text' in the response content
    gpt_error_count = response_content.count("original_text")

    return response_content, gpt_error_count

In [None]:
def split_text(text, max_length=80):
    """Print ``text`` word-wrapped so that lines do not exceed ``max_length``.

    Words are never split: a single word longer than ``max_length`` is printed
    on a line of its own.

    Args:
        text: The text to wrap; it is split on any whitespace.
        max_length: Maximum number of characters per printed line.

    Returns:
        None. The wrapped lines are written to standard output.
    """
    lines = []
    current_line = []
    current_length = 0  # length of ' '.join(current_line)
    for word in text.split():
        # One extra character for the separating space once the line has content.
        candidate_length = current_length + len(word) + (1 if current_line else 0)
        if candidate_length <= max_length:
            current_line.append(word)
            current_length = candidate_length
        else:
            # Flush only when something was collected; otherwise an over-long word
            # at the start of a line would emit a spurious empty line.
            if current_line:
                lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
    # Add the last line
    if current_line:
        lines.append(' '.join(current_line))
    for line in lines:
        print(line)

import json

### Generating HTML

In [None]:
def _parse_occurrence(raw_value):
    """Return the 1-based occurrence number found in ``raw_value`` (1 when there is none)."""
    found = re.search(r'\d+', str(raw_value))
    return max(int(found.group()), 1) if found else 1


def _find_occurrences(text, fragment):
    """Return the ``(start, end)`` spans where ``fragment`` occurs in ``text`` as a whole unit."""
    if not fragment:
        return []
    # Guard an edge with a "no word character here" lookaround only when the fragment
    # itself starts/ends with a word character: "You" must not match inside "Yours",
    # while fragments such as "time," or "end." can still be located.
    prefix = r'(?<!\w)' if re.match(r'\w', fragment[0]) else ''
    suffix = r'(?!\w)' if re.match(r'\w', fragment[-1]) else ''
    pattern = prefix + re.escape(fragment) + suffix
    return [found.span() for found in re.finditer(pattern, text)]


def generate_html_from_response(response, original_text, essay_number, template_path="/template.html"):
    """Render the HTML report for one essay from the model's JSON answer.

    Every mistake is listed in the analysis section, grouped by its type. A
    mistake is additionally highlighted in the corrected text when its
    ``original_text`` can be located in ``original_text``: the occurrence named
    by ``"nth occurrence"`` is used, falling back to the first occurrence that
    has not been highlighted yet. Mistakes that cannot be located are listed
    but not highlighted.

    All positions are computed against the unmodified ``original_text``, so one
    replacement never shifts or hides another. Essay text and model-provided
    values are HTML-escaped before insertion.

    Args:
        response: JSON string with a ``"mistakes"`` list and, optionally, the
            ``"candidate_id"`` and ``"Q_ID"`` fields.
        original_text: The plain essay text the mistakes refer to.
        essay_number: Unused; kept so existing callers keep working.
        template_path: Path of the HTML template containing the
            ``{{candidate_id}}``, ``{{Q_ID}}``, ``{{original_text}}``,
            ``{{corrected_text}}`` and ``{{mistake_analysis}}`` placeholders.

    Returns:
        str: The template with its placeholders filled in.

    Raises:
        json.JSONDecodeError: If ``response`` is not valid JSON.
        KeyError: If ``"mistakes"`` or a required field of a mistake is missing.
    """

    def escape_text(value):
        return html.escape(str(value), quote=False)

    response_data = json.loads(response)

    candidate_id = response_data.get('candidate_id')
    q_id = response_data.get('Q_ID')
    mistakes = response_data["mistakes"]

    mistake_groups = {}  # mistake type -> list of <li> items
    replacements = []    # (start, end, replacement markup), spans never overlap

    for mistake in mistakes:
        original_mistake = str(mistake["original_text"])
        mistake_type = str(mistake["mistake_type"])
        explanation = mistake["explanation"]
        new_text = mistake["new_text"]

        mistake_item = (
            f'<li class="analysis-item">{escape_text(original_mistake)} &rarr; '
            f'{escape_text(new_text)} - {escape_text(explanation)}</li>'
        )
        mistake_groups.setdefault(mistake_type, []).append(mistake_item)

        spans = _find_occurrences(original_text, original_mistake)
        free_spans = [
            (start, end) for start, end in spans
            if not any(start < taken_end and taken_start < end
                       for taken_start, taken_end, _ in replacements)
        ]
        wanted = _parse_occurrence(mistake.get("nth occurrence", 1))
        if wanted <= len(spans) and spans[wanted - 1] in free_spans:
            chosen = spans[wanted - 1]
        elif free_spans:
            # The reported occurrence does not exist or is already taken.
            chosen = free_spans[0]
        else:
            # Nothing to highlight; the mistake still appears in the analysis list.
            continue

        markup = f'<span class="{html.escape(mistake_type)}">{escape_text(new_text)}</span>'
        replacements.append((chosen[0], chosen[1], markup))

    # Stitch the corrected text together from left to right.
    pieces = []
    cursor = 0
    for start, end, markup in sorted(replacements):
        pieces.append(escape_text(original_text[cursor:start]))
        pieces.append(markup)
        cursor = end
    pieces.append(escape_text(original_text[cursor:]))
    corrected_version = ''.join(pieces)

    # Compile the mistake analysis html
    mistake_analysis_list = []
    for mistake_type, items in mistake_groups.items():
        # Add mistake type header (e.g., "Grammar:")
        mistake_analysis_list.append(
            f'<li class="analysis-item"><strong><span class="{html.escape(mistake_type)}">'
            f'{escape_text(mistake_type.capitalize())}</span>:</strong></li>')
        # Add the mistakes for this type
        mistake_analysis_list.extend(items)
    mistake_analysis_html = "\n".join(mistake_analysis_list)

    with open(template_path, "r") as file:
        template = file.read()

    fields = {
        "candidate_id": "" if candidate_id is None else escape_text(candidate_id),
        "Q_ID": "" if q_id is None else escape_text(q_id),
        "original_text": escape_text(original_text),
        "corrected_text": corrected_version,
        "mistake_analysis": mistake_analysis_html,
    }
    # Fill all placeholders in a single pass so that inserted text which happens to
    # contain "{{...}}" is never substituted again; unknown placeholders are kept.
    html_output = re.sub(
        r'\{\{(\w+)\}\}',
        lambda placeholder: fields.get(placeholder.group(1), placeholder.group(0)),
        template,
    )

    return html_output

### Adding Candidate ID to JSON

In [None]:
def add_candidate_question_id(gpt_response, essay_number):
    """Prepend the candidate and question ids of an essay to the model's JSON answer.

    The ids are read from the ``Candidate_ID`` and ``Q_ID`` columns of the
    module-level ``data`` frame at positional row ``essay_number``.

    Args:
        gpt_response: JSON text returned by the model; must decode to an object.
        essay_number: Positional row index of the essay in ``data``.

    Returns:
        str: The JSON object re-serialised with ``candidate_id`` and ``Q_ID``
        placed first, or ``""`` (after printing a warning) when ``gpt_response``
        is not a JSON object.
    """
    try:
        parsed_response = json.loads(gpt_response)
    except json.JSONDecodeError as e:
        print(f"Warning: Unable to parse response for essay #{essay_number} as JSON. Skipping...")
        print(f"Error while parsing JSON: {e}")
        return ""
    if not isinstance(parsed_response, dict):
        # Valid JSON that is not an object (a list, a bare string, ...) cannot be merged.
        print(f"Warning: Unable to parse response for essay #{essay_number} as JSON. Skipping...")
        print(f"Error while parsing JSON: expected an object, got {type(parsed_response).__name__}")
        return ""

    def as_builtin(value):
        # .iloc[...] hands back NumPy scalars (e.g. numpy.int64) that json.dumps rejects;
        # .item() converts them to the matching built-in Python type.
        return value.item() if hasattr(value, 'item') else value

    additional_info = {
        'candidate_id': as_builtin(data['Candidate_ID'].iloc[essay_number]),
        'Q_ID': as_builtin(data['Q_ID'].iloc[essay_number]),  # Assuming 'Q_ID' is a column in your DataFrame
    }
    return json.dumps({**additional_info, **parsed_response})


In [None]:
# Run the pipeline (clean -> proofread -> tag with ids -> render) for the selected essays
# and collect the rendered reports in a single output file.
all_html_content = ""
# all_response_data = []

for essay_number, doc in enumerate(data['Answer_Selected'].iloc[0:1]):
    clean_text = remove_html_tags(doc)
    gpt_response, gpt_error_count = get_gpt_correction(clean_text)
    gpt_response = add_candidate_question_id(gpt_response, essay_number)
    if not gpt_response:
        # add_candidate_question_id returns "" after warning that the answer could not
        # be parsed; rendering it would raise a JSONDecodeError, so skip this essay.
        continue
    print(gpt_response)

    html_content = generate_html_from_response(gpt_response, clean_text, essay_number)
    all_html_content += html_content

    # all_response_data.append(response_data)

with open("/outputgpt.html", "w") as file:
    file.write(all_html_content)

# with open("/output.json", 'w', encoding="utf-8") as outfile:
#     json.dump(all_response_data, outfile, indent=4)

tool.close()

### Querying JSON

In [None]:
# from collections import Counter

# mistake_type_counter = Counter()

# for entry in all_response_data:
#     for mistake in entry.get('mistakes', []):
#         mistake_type = mistake.get('mistake_type')
#         if mistake_type:
#             mistake_type_counter[mistake_type] += 1

# mistake_type_counter