In [None]:
# Simple Jaccard Similarity in Arabic (Google Colab)
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of two texts over their word sets.

    Each text is tokenized with ``str.split()`` (whitespace-separated tokens,
    so punctuation attached to a word stays part of that token) and reduced to
    a set of unique tokens.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in ``[0, 1]``; ``0.0`` when neither
        text contains any token.
    """
    tokens_a, tokens_b = set(text1.split()), set(text2.split())

    combined = tokens_a | tokens_b
    if len(combined) == 0:
        # Both texts are empty/whitespace-only: avoid dividing by zero.
        return 0.0

    shared = tokens_a & tokens_b
    return len(shared) / len(combined)

In [None]:
def _to_percent(value):

    if isinstance(value, str):
        s = value.strip()
        if s.endswith('%'):
            s = s[:-1].strip()
        val = float(s)
    elif isinstance(value, (int, float)):
        val = float(value)
    else:
        raise TypeError(f"Unsupported type for Jaccard/target: {type(value)}")

    return val * 100.0 if val <= 1.0 else val


def select_best_output(Best_Output: dict, target):
    """
    Pick the candidate whose Jaccard value lies nearest to ``target``.

    ``Best_Output`` holds numbered pairs: 'Output-1'/'Jaccard-1',
    'Output-2'/'Jaccard-2', and so on. Numbering is scanned upward from 1 and
    scanning stops at the first index where *both* keys are absent. An index
    with only one of the two keys, or with a Jaccard value that cannot be
    normalized, is skipped.

    Jaccard values and ``target`` are normalized through ``_to_percent`` before
    they are compared. When exactly one valid pair exists it is returned
    without looking at ``target``. On ties the lowest-numbered pair wins.

    Args:
        Best_Output: Mapping containing the numbered output/Jaccard pairs.
        target: Desired Jaccard value (ratio, percentage, or string form).

    Returns:
        (output, jaccard_original) where ``jaccard_original`` is the value
        exactly as stored in ``Best_Output`` (not the normalized one).

    Raises:
        ValueError: If no valid (Output-n, Jaccard-n) pair is found.
    """
    candidates = []  # (output, stored jaccard, jaccard as percentage)
    index = 0
    while True:
        index += 1
        out_key, jac_key = f"Output-{index}", f"Jaccard-{index}"
        has_out = out_key in Best_Output
        has_jac = jac_key in Best_Output

        if not (has_out or has_jac):
            # Nothing stored under this number: end of the numbered run.
            break
        if not (has_out and has_jac):
            # Half a pair is unusable; keep looking at later numbers.
            continue

        candidate_text = Best_Output[out_key]
        stored_jaccard = Best_Output[jac_key]
        try:
            as_percent = _to_percent(stored_jaccard)
        except Exception:
            # Malformed Jaccard entry: drop this pair and move on.
            continue
        candidates.append((candidate_text, stored_jaccard, as_percent))

    if not candidates:
        raise ValueError("No valid (Output-n, Jaccard-n) pairs found in Best_Output.")

    if len(candidates) == 1:
        only_text, only_jaccard, _ = candidates[0]
        return only_text, only_jaccard

    goal = _to_percent(target)

    def distance_to_goal(candidate):
        return abs(candidate[2] - goal)

    winner = min(candidates, key=distance_to_goal)
    return winner[0], winner[1]


In [None]:
import json
import pandas as pd
import re
import openai
import time

from openai import OpenAI


# Build the chat client: the OpenAI SDK pointed at DeepInfra's OpenAI-style
# endpoint, authenticated with your DeepInfra token (replace "API-KEY").
# NOTE: this assignment rebinds the name `openai` (imported above as a module)
# to the client instance; later code uses `openai` as the client.
openai = OpenAI(base_url="https://api.deepinfra.com/v1/openai", api_key="API-KEY")

# ################ Prompting Function #####################

def ragPrompt(messages):
    """Send a chat conversation to the model and return the reply text.

    Args:
        messages: Chat messages (dicts with "role" and "content") passed
            unchanged as the ``messages`` argument of the completion request.

    Returns:
        The ``content`` of the first choice in the response.
    """
    completion = openai.chat.completions.create(model="Model Name", messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content


def craftPrompt(polish_percent=75):
    """Build the few-shot instruction prompt for polishing an Arabic text.

    The prompt states the editing rules, shows one example article with four
    polished variants (≈10%, 25%, 50% and 75% of words changed), and ends with
    the instruction line after which the caller appends the text to polish.

    Args:
        polish_percent: Percentage of words the model is asked to change,
            inserted into the final instruction line. Defaults to 75, which
            reproduces the previously hard-coded prompt exactly.

    Returns:
        The prompt string (the article to polish is not included).
    """
    # The leading whitespace inside the literal is part of the prompt text
    # sent to the model, so it is kept exactly as it was.
    prompt = f'''
          You are an editor that polishes Arabic text.
          Your task is to edit a given Arabic article by changing a specific percentage of words.
          You must follow these rules:
          - Keep the meaning and message of the text unchanged.
          - Do not shorten or expand the length significantly.
          - Replace/rephrase only the target percentage of words.
          - Output only the polished Arabic text, nothing else.

          ### Example Article:
          الرياض مدينة كبيرة جميلة حديثة مزدهرة مليئة بالحدائق والأسواق
          ---

          ### Polished Example 1 (≈ 10% words changed):
          الرياض مدينة كبيرة رائعة حديثة مزدهرة مليئة بالحدائق والأسواق.
          ---

          ### Polished Example 2 (≈ 25% words changed):
          الرياض عاصمة كبيرة رائعة حديثة مزدهرة مليئة بالحدائق والمتاجر.
          ---

          ### Polished Example 3 (≈ 50% words changed):
          الرياض عاصمة ضخمة رائعة متطورة نشيطة مليئة بالمنتزهات والمتاجر.
          ---

          ### Polished Example 4 (≈ 75% words changed):
          الرياض حاضرة ضخمة مذهلة متطورة نشيطة مزدهرة مليئة بالمنتزهات والمراكز


          ### Now polish this new text by {polish_percent}% only:
          '''

    return prompt


In [None]:

# Driver cell: for every article in the dataset, ask the model for a polished
# version, measure its Jaccard similarity to the original, and (up to three
# rounds in total) send feedback asking the model to move toward the target.
# The attempt closest to the target is recorded; all records are written to
# a JSON file, with periodic checkpoints along the way.

# Load your JSON file
with open('/content/Ar-APT.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

json_file = "/content/Polished_Text.json"

# Settings are loop-invariant, so they are defined once rather than per item.
polish_word_limit = 0.50  # stored as-is under "Polish_Percentage" in each record
Target_Jaccard = 0.50     # similarity ratio the polished text should land near
lower_bound = Target_Jaccard - 0.08
upper_bound = Target_Jaccard + 0.08
# NOTE(review): craftPrompt() asks the model to polish "by 75%" while the
# target used here is 0.50 — confirm which of the two is intended.

CHECKPOINT_EVERY = 50  # write partial results every this many items

# Console labels printed with the similarity of round 1, 2 and 3; the number of
# labels also fixes the maximum number of rounds per article.
SIMILARITY_LABELS = ("J_similarity ", "2nd J_similarity ", "3rd J_similarity 3 ")
MAX_ROUNDS = len(SIMILARITY_LABELS)


def _save_results(path, rows):
    """Write *rows* to *path* as indented UTF-8 JSON, replacing the file."""
    with open(path, "w", encoding="utf-8") as out_file:
        json.dump(rows, out_file, ensure_ascii=False, indent=2)
    print(f"✅ Results saved to {path}")


def _feedback_message(similarity, target):
    """Return the follow-up instruction telling the model which way to adjust.

    Both numbers are reported as percentages so the model sees one unit.
    Comparing against the target (not the upper bound) keeps the direction
    right even when the similarity sits exactly on a bound.
    """
    if similarity > target:
        direction = "reduce it and add more polish words"
    else:
        direction = "increase it and reduce number of polish words"
    return (
        f" The Jaccard similarity is {similarity * 100:.1f}%."
        f" You need to {direction} to make it close to {target * 100}%"
    )


# Store results for DataFrame
results = []

# Number of leading items to skip (set > 0 to continue an interrupted run).
# NOTE: `results` always starts empty and the output file is opened in "w"
# mode, so records saved by an earlier run are not carried over.
LastEnd = 0

# Iterate over each item
for i, item in enumerate(data):
    if i < LastEnd:
        continue

    if i > 0 and i % CHECKPOINT_EVERY == 0:
        _save_results(json_file, results)

    print(i)

    # Fresh conversation per article: system role, then instructions + article.
    messages = [
        {"role": "system", "content": "You are a helpful assistant who always responds with helpful information. You are asked to provide a polished version of the following text. Only generate the arabic polished text."},
        {"role": "user", "content": craftPrompt() + "\n" + item['original']},
    ]
    Best_Output = {}

    for round_no in range(1, MAX_ROUNDS + 1):
        # Best effort: a failed request is recorded as an "Error: ..." candidate
        # instead of aborting the whole run.
        try:
            output = ragPrompt(messages)
        except Exception as e:
            output = f"Error: {str(e)}"
        else:
            if round_no > 1:
                print("new output " , output)

        J_similarity = jaccard_similarity(item['original'], output)
        print(SIMILARITY_LABELS[round_no - 1], J_similarity)

        Best_Output[f'Output-{round_no}'] = output
        Best_Output[f'Jaccard-{round_no}'] = J_similarity

        within_bounds = lower_bound < J_similarity < upper_bound
        if within_bounds or round_no == MAX_ROUNDS:
            print(f"Done in round {round_no}")
            break

        # Out of range with rounds left: show the model its own answer and
        # tell it which direction to move.
        messages.append({"role": "assistant", "content": output})
        messages.append({"role": "user", "content": _feedback_message(J_similarity, Target_Jaccard)})

    # Keep the attempt whose similarity is closest to the target.
    output, J_similarity = select_best_output(Best_Output, Target_Jaccard * 100)

    print("=========================")
    results.append({
        "original": item['original'],
        "Polish_text": output,
        "Polish_Percentage": polish_word_limit,
        "domain": item['domain'],
        "J_similarity": J_similarity,
    })


# Save back to JSON file
_save_results(json_file, results)
