# InkPulse

## Import Dependency

In [56]:
import json
import os
import pandas as pd
from collections import Counter
from datetime import datetime
from openai import OpenAI
import openai
import random
random.seed(23)
import csv
import numpy as np

## Set Parameters(dataset name, open ai key and path)

In [57]:
dataset_name = "legislation_formal_study_update"

In [None]:
os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
openai.api_key = os.getenv("OPENAI_API_KEY")

## Quick Link:

Use the link below to see the implementation of the scoring functions. You can add or modify the current scoring methods or features.

[LLM Judge Score](#calculate-text-quality)

[Semantic Score](#calculate-semantic-score)

[Feature Implementations](#calculate-features)

In [59]:
# # In Colab
# import zipfile
# import shutil
# script_dir = os.getcwd()
# static_dir = script_dir

# # Reading dataset
# zip_path = f"{dataset_name}.zip"
# import_data_dir = "/content/import_dataset"
# csv_path = f"{dataset_name}.csv"
# final_extract_path = os.path.join(import_data_dir, dataset_name)

# if os.path.exists(final_extract_path):
#     shutil.rmtree(final_extract_path)
# os.makedirs(import_data_dir, exist_ok=True)

# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     members = zip_ref.namelist()
#     top_level_dirs = set(m.split('/')[0] for m in members if '/' in m)
#     if len(top_level_dirs) == 1:
#         top_dir = list(top_level_dirs)[0]
#         for member in members:
#             relative_path = member[len(top_dir)+1:] if member.startswith(top_dir + '/') else member
#             if relative_path:
#                 target_path = os.path.join(final_extract_path, relative_path)
#                 os.makedirs(os.path.dirname(target_path), exist_ok=True)
#                 with zip_ref.open(member) as source, open(target_path, 'wb') as target:
#                     target.write(source.read())
#     else:
#         zip_ref.extractall(final_extract_path)
# if os.path.exists(csv_path):
#     target_csv_path = os.path.join(import_data_dir, f"{dataset_name}.csv")
#     shutil.move(csv_path, target_csv_path)

# Local
script_dir = os.getcwd()
static_dir = os.path.dirname(script_dir)
csv_path = os.path.join(static_dir, "import_dataset", f"{dataset_name}.csv")

In [60]:
def path_exists(path):
    if not os.path.exists(path):
      os.makedirs(path)
      print(f"Create folder in {path}")
    else:
      print(f"Folder already exist in {path}.")

In [61]:
import_data_dir = os.path.join(static_dir, "import_dataset")
json_path = os.path.join(static_dir, "import_dataset", f"{dataset_name}")
session_id_collection = []
for filename in os.listdir(json_path):
    filename = filename.removesuffix(".jsonl")
    session_id_collection.append(filename)

new_path = os.path.join(static_dir, "dataset", f"{dataset_name}")
path_exists(new_path)

new_json_path = os.path.join(static_dir, f"dataset/{dataset_name}/json")
path_exists(new_json_path)

new_segment_path = os.path.join(static_dir, f"dataset/{dataset_name}/segment")
path_exists(new_segment_path)

new_segment_results_path = os.path.join(static_dir, "dataset", f"{dataset_name}", "segment_results")
path_exists(new_segment_results_path)

Folder already exist in d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update.
Folder already exist in d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json.
Folder already exist in d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/segment.
Folder already exist in d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results.


## Data Preprocessing

### Sentence Segmentation

In [62]:
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return data

def write_json(data, file_path, session):
    actual_session = session+'.json'
    new_file_path = os.path.join(file_path, actual_session)
    with open(new_file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data written to {new_file_path}")

In [63]:
def collect_data(snapshots):
    segments = []
    current_text = ""
    current_source = None
    current_start_time = None
    current_end_time = None
    last_event_time = None

    for snap in snapshots:
        text = snap['text']
        source = snap['eventSource']
        event_time = snap['event_time']
        event_name = snap['eventName']

        if current_source is None:
            current_source = source
            current_start_time = event_time
        if source != current_source or event_name == "suggestion-open":
            if current_text:
                segments.append({
                    "text": current_text,
                    "source": current_source,
                    "start_time": current_start_time,
                    "end_time": current_end_time,
                    "last_event_time": last_event_time
                })
            current_text = text
            if event_name != "suggestion-open":
                current_source = source
            current_start_time = event_time
        else:
            current_text = text

        current_end_time = event_time
        last_event_time = event_time

    if current_text:
        segments.append({
            "text": current_text,
            "source": current_source,
            "start_time": current_start_time,
            "end_time": current_end_time,
            "last_event_time": last_event_time
        })

    return segments

### Calculate progress

In [64]:
def convert_and_clean(data, delta):
    total_length = len(data[-1]['text'])
    current_progress = 0
    for entry in data:
        entry['start_progress'] = current_progress
        entry['end_progress'] = len(entry['text']) / total_length
        current_progress = entry['end_progress']
    base_time_str = data[0]['start_time']
    base_time = datetime.strptime(base_time_str, "%Y-%m-%d %H:%M:%S")
    for event in data:
      if 'start_time' in event:
          start_time = datetime.strptime(event['start_time'], "%Y-%m-%d %H:%M:%S")
          event['start_time'] = (start_time - base_time).total_seconds()
      if 'end_time' in event:
          end_time = datetime.strptime(event['end_time'], "%Y-%m-%d %H:%M:%S")
          event['end_time'] = (end_time - base_time).total_seconds()
      if 'last_event_time' in event:
          last_event_time = datetime.strptime(event['last_event_time'], "%Y-%m-%d %H:%M:%S")
          event['last_event_time'] = (last_event_time - base_time).total_seconds()
    filtered_sentences = []
    for i, entry in enumerate(data):
      text = entry.get("text", "").strip()
      if text == "":
          continue
      if filtered_sentences:
          prev_text = filtered_sentences[-1]["text"]
          delta_chars = sum(1 for a, b in zip(prev_text, text) if a != b) + abs(len(prev_text) - len(text))
          if delta_chars < delta:
              continue
      filtered_sentences.append(entry)
      data = filtered_sentences
    return data

In [65]:
def convert_and_calculate(data):
    info = data["actions"]
    total_length = len(data['text'][0])
    for i in info:
        i['progress'] = round(len(i['current_text']) / total_length, 2)
        # i.pop('current_text', None)
        
    return data

In [66]:
def merge_time(actions):
    result = []
    prev = None
    for action in actions:
        if action['name'] == 'text-insert' and action.get('text'):
            if prev and prev['name'] == 'text-insert' \
               and prev['eventSource'] == action['eventSource'] \
               and prev['event_time'] == action['event_time']:
                prev['text'] += action['text']
            else:
                result.append(action)
                prev = action
        else:
            result.append(action)
            prev = action

    actions.clear()
    actions.extend(result)
    
    return actions

### Reconstruct text

In [67]:
info_data = []
sentence_data = []
TEXT = []
def get_data(dataset_name, session_id_collection, static_dir, is_json):
    if is_json:
        json_path = os.path.join(static_dir, f"dataset/{dataset_name}/json")
    else:
        json_path = os.path.join(static_dir, f"dataset/{dataset_name}/segment")
    for session in session_id_collection:
        extracted_data = {'init_text': [], 'init_time': [], 'json': [], 'text': [], 'actions': [], 'end_time': [], 'snapshots': []}
        file_path = os.path.join(static_dir, "import_dataset", f"{dataset_name}")
        actual_session = session + '.jsonl'
        new_file_path = os.path.join(file_path, actual_session)
        with open(new_file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                cleaned_line = line.replace('\0', '')
                if cleaned_line.strip():
                    json_data = json.loads(cleaned_line)
                    if line_number == 1:
                        init_text = json_data.get('currentDoc', '')
                        init_timestamp = json_data.get('eventTimestamp')
                        init_time = datetime.fromtimestamp(init_timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S")
                    event_num = json_data.get('eventNum')
                    event_name = json_data.get('eventName')
                    event_source = json_data.get('eventSource')
                    event_timestamp = json_data.get('eventTimestamp')
                    event_time = datetime.fromtimestamp(event_timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S")
                    last_event_time = event_time
                    text_delta = json_data.get('textDelta', {})
                    current_suggestions = json_data.get('currentSuggestions', {})
                    entry = {'eventNum': event_num, 'eventName': event_name, 'eventSource': event_source, 'event_time': event_time, 'textDelta': text_delta, 'currentSuggestions': current_suggestions}
                    extracted_data['json'].append(entry)
        extracted_data['init_time'].append(init_time)
        extracted_data['init_text'].append(init_text)
        text = ''.join(extracted_data['init_text'])
        previous_event_name = None
        if text != "" and text != "\n":
            extracted_data['snapshots'].append({
                'text': text,
                'eventName': '',
                'eventSource': 'api',
                'event_time': init_time,
                'eventNum': 0
            })
        for entry in extracted_data['json']:
            text_delta = entry.get('textDelta', {})
            if not isinstance(text_delta, dict):
                if isinstance(text_delta, str) and text_delta.strip():
                    try:
                        text_delta = json.loads(text_delta)
                    except json.JSONDecodeError:
                        text_delta = {}
                else:
                    text_delta = {}
            ops = text_delta.get('ops', [])
            event_name = entry.get('eventName')
            event_source = entry.get('eventSource', 'unknown')
            event_time = entry.get('event_time')
            event_num = entry.get('eventNum')
            pos = entry.get('currentCursor', 0)
            for op in ops:
                if 'retain' in op:
                    pos += op['retain']
                elif 'insert' in op:
                    inserts = op['insert']
                    if not isinstance(inserts, str):
                        # print(f"skip image insert: {inserts}")
                        continue
                    source = event_source
                    if previous_event_name == "suggestion-close" and len(inserts) > 5:
                        source = "api"
                    text = text[:pos] + inserts + text[pos:]
                    extracted_data['actions'].append({
                        'id': event_num,
                        'name': 'text-insert',
                        'text': inserts,
                        'eventSource': source,
                        'event_time': event_time,
                        'count': len(inserts),
                        'pos': pos,
                        'current_text': text,
                    })
                    pos += len(inserts)
                elif 'delete' in op:
                    delete_count = op['delete']
                    deleted_text = text[pos:pos + delete_count]
                    text = text[:pos] + text[pos + delete_count:]
                    extracted_data['actions'].append({
                        'id': event_num,
                        'name': 'text-delete',
                        'text': deleted_text,
                        'eventSource': event_source,
                        'event_time': event_time,
                        'count': delete_count,
                        'pos': pos,
                        'current_text': text,
                    })
            if event_name == 'suggestion-open':
                extracted_data['actions'].append({
                    'id': event_num,
                    'name': event_name,
                    'eventSource': event_source,
                    'event_time': event_time,
                    'current_text': text,
                })
                extracted_data['snapshots'].append({
                    'text': text,
                    'eventName': event_name,
                    'eventSource': event_source,
                    'event_time': event_time,
                    'eventNum': event_num
                })
            if ops:
                extracted_data['snapshots'].append({
                    'text': text,
                    'eventName': event_name,
                    'eventSource': source,
                    'event_time': event_time,
                    'eventNum': event_num
                })
            previous_event_name = event_name
        if entry['eventNum'] == None:
            for entry in extracted_data['actions']:
                if 'id' in entry:
                    del entry['id']
        # print(text)
        TEXT.append({'session_id': session, 'text': text})
        data = collect_data(extracted_data['snapshots'])
        data = convert_and_clean(data, delta = 5)
        extracted_data['text'].append(text)
        extracted_data['end_time'] = last_event_time
        extracted_data = convert_and_calculate(extracted_data)
        extracted_data.pop('json', None)
        extracted_data.pop('snapshots', None)
        extracted_data['init_text'] = ''.join(extracted_data['init_text'])
        extracted_data['init_time'] = ''.join(extracted_data['init_time'])
        extracted_data['text'] = ''.join(extracted_data['text'])
        if is_json:
            extracted_data['text'] = len(extracted_data['text'])
            extracted_data['actions'] = merge_time(extracted_data['actions'])
            for action in extracted_data['actions']:
                action.pop('id', None)
                action.pop('count', None)
                action.pop('current_text', None)
            write_json(extracted_data, json_path, session)
        else:
            write_json(data, json_path, session)
        info_data.append(extracted_data)
        sentence_data.append(data)

## Data Preprocessing - Main

In [68]:
get_data(dataset_name, session_id_collection, static_dir, is_json=True)
# get_data(dataset_name, session_id_collection, static_dir, is_json=False)

Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\0c44adf9178a443a9fd7a5a2edaeb7c4.json
Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\12bc2ababfd341f4b09c4716b3e17541.json
Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\180c987fde934a02a50f81f9e981a087.json
Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\2692060fbf604d9193c8a1a4dbc2b8c6.json
Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\2925c4bef43b49acabcade452c1b3c48.json
Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\295c3df7ff5443aa9afebc8168183a64.json
Data written to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset/legislation_formal_study_update/json\2c4b5bda6f6f43ab9ffc990e7cae9693.json

### Check Function

In [69]:
if info_data[0]["text"] == sentence_data[0][-1]["text"]:
    print("True")
else:
    print("False")

False


## Calculate Semantic Score

In [70]:
def read_sentences(file_path):
  with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
    sentences = [
      {
          "text": entry["text"],
          "source": entry.get("source", "unknown"),
          "start_progress": entry["start_progress"],
          "end_progress": entry["end_progress"],
          "start_time": entry["start_time"],
          "end_time": entry["end_time"],
          "last_event_time": entry["last_event_time"]
      }
      for entry in data if "text" in entry
    ]
  return sentences

In [71]:
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def get_openai_embedding(text, model="text-embedding-3-small"):
  response = client.embeddings.create(input=[text], model=model)
  return np.array(response.data[0].embedding)

In [72]:
def compute_vector_norm(residual_vector):
  return float(np.linalg.norm(residual_vector))

In [73]:
def analyze_residuals(sentences, check):
  results = []
  delta = 5
  first_is_empty = not check["init_text"] or check["init_text"].strip() == ""

  for i, sentence in enumerate(sentences):
      text = sentence.get("text", "").strip()

      if first_is_empty:
          if i == 0:
              sentence["residual_vector"] = 0.0
              continue
          elif i == 1:
              sentence["embedding"] = get_openai_embedding(text)
              sentence["residual_vector"] = 1.0
              continue

      if i == 0:
          sentence["embedding"] = get_openai_embedding(text)
          sentence["residual_vector"] = 0.0
      else:
          prev_text = sentences[i - 1]["text"]
          delta_chars = sum(1 for a, b in zip(prev_text, text) if a != b) + abs(len(prev_text) - len(text))
          sentence["embedding"] = get_openai_embedding(text)
          if delta_chars <= delta:
              sentence["residual_vector"] = 0.0
          else:
              prev_embedding = sentences[i - 1]["embedding"]
              residual_vector = sentence["embedding"] - prev_embedding
              sentence["residual_vector"] = compute_vector_norm(residual_vector)
  norms = [s["residual_vector"] for s in sentences]
  min_norm = min(norms)
  max_norm = max(norms)
  norm_range = max_norm - min_norm if max_norm != min_norm else 1.0
  for sentence in sentences:
      raw_norm = sentence["residual_vector"]
      normalized = (raw_norm - min_norm) / norm_range
      sentence["residual_vector_norm"] = normalized
  for sentence in sentences:
      result_entry = {
          "sentence": sentence["text"],
          "source": sentence["source"],
          "start_progress": sentence["start_progress"],
          "end_progress": sentence["end_progress"],
          "start_time": sentence["start_time"],
          "end_time": sentence["end_time"],
          "last_event_time": sentence["last_event_time"],
          "residual_vector": sentence["residual_vector"],
          "residual_vector_norm": sentence["residual_vector_norm"],
      }
      results.append(result_entry)
  return results

In [74]:
def convert_types(obj):
  if isinstance(obj, (np.float32, np.float64)):
      return float(obj)
  elif isinstance(obj, (np.int32, np.int64)):
      return int(obj)
  elif isinstance(obj, np.ndarray):
      return obj.tolist()
  return obj

In [75]:
def save_results_to_json(results, session_id, output_dir):
  output_file = os.path.join(output_dir, f"{session_id}.json")
  results_converted = [
      {k: convert_types(v) for k, v in entry.items()}
      for entry in results
  ]
  with open(output_file, "w", encoding="utf-8") as f:
      json.dump(results_converted, f, indent=4, ensure_ascii=False)
  print(f"Segment results saved to {output_file}")

## Calculate Semantic Score - Main

In [76]:
for file_name in os.listdir(new_segment_path):
  if file_name.endswith(".json"):
      file_path = os.path.join(new_segment_path, file_name)
      session_id = os.path.splitext(file_name)[0]
      sentences = read_sentences(file_path)
      check_file = os.path.join(new_json_path, session_id + ".json")
      check = load_json(check_file)
      results = analyze_residuals(sentences, check)
      save_results_to_json(results, session_id, new_segment_results_path)

Segment results saved to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results\0c44adf9178a443a9fd7a5a2edaeb7c4.json
Segment results saved to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results\12bc2ababfd341f4b09c4716b3e17541.json
Segment results saved to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results\180c987fde934a02a50f81f9e981a087.json
Segment results saved to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results\2692060fbf604d9193c8a1a4dbc2b8c6.json
Segment results saved to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results\2925c4bef43b49acabcade452c1b3c48.json
Segment results saved to d:\Study\Lab\Vitualization\Ink-Pulse\static\dataset\legislation_formal_study_update\segment_results\295c3df7ff5443aa9afebc8168183a64.json
Segment results saved 

## Calculate Text Quality

In [77]:
def chatgpt_prompter(input_prompt):
    completion = client.chat.completions.create(
        # model="gpt-3.5-turbo",
        model = "gpt-4o",
        messages=[
            {"role": "user", "content": input_prompt}
        ],
        temperature = 0,
    )
    return completion.choices[0].message.content

In [78]:
def process_evaluate(answer):
    answer = answer.strip().removeprefix("```json").removesuffix("```").strip()
    if answer.startswith("[") and answer.endswith("]"):
        answer = "{" + answer[1:-1] + "}"
    try:
        data = json.loads(answer)
        idea_score = data["idea_score"]
        coherence_score = data["coherence_score"]
        score = int (idea_score) + int(coherence_score)
    except Exception as e:
        print("Fail:", answer)
        print(e)
        score = 0
        with open("failed.txt", "a") as f:
            f.write(answer + "\n")

    return score

In [79]:
def longest_common(s1, s2):
    min_len = min(len(s1), len(s2))
    i = 0
    while i < min_len and s1[i] == s2[i]:
        i += 1
    return s1[:i]

In [80]:
def read_sentences(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        session_id = os.path.splitext(os.path.basename(file_path))[0]
        if "init_text" in data and data["init_text"] and data["init_text"].strip():
            intro = data["init_text"].strip()
            article = ""
            for item in TEXT:
                if str(item["session_id"]) == str(session_id):
                    article = item["text"]
                    break
            prefix = longest_common(intro, article)
            if prefix:
                article = article[len(prefix):].lstrip()
        else:
            intro = ""
            article = ""
            for item in TEXT:
                if str(item["session_id"]) == str(session_id):
                    article = item["text"]
                    break

    return intro, article

In [81]:
def evaluate_prompt(session_id, topic, intro, article):
    result = []
    EVALUATION_PROMPT_TEMPLATE = f"""
        Topic: {topic}
        Introduction: {intro}
        Article: {article}
        —–
        You are evaluating an article co-written by a human and an AI. You must objectively score the topic-article pair based on the two criteria below:
        New Idea (0-5):
        Evaluate how much *new, original thinking* the article contributes *beyond the Introduction*.
        - 0: No ideas; incoherent or irrelevant.
        - 1: Purely obvious or generic.
        - 2: Fragmented or shallow ideas.
        - 3: Standard ideas with some development.
        - 4: Clear new insights or novel angles.
        - 5: Multiple strong original ideas; deep or creative expansion beyond the intro.

        Coherence (0-5):
        Evaluate how well the article maintains *logical structure*, *natural transitions*, and *stylistic consistency* throughout.
        - 0: Disjointed or jarring; abrupt shifts in tone or topic.
        - 1: Minimal cohesion; sections feel stitched together.
        - 2: Mostly smooth but has several awkward transitions.
        - 3: Generally coherent; occasional unevenness.
        - 4: Well-structured with clear, natural progression.
        - 5: Seamless flow and unity; stylistically and structurally refined.

        Note: The Introduction is provided for context only and should NOT be considered part of the article content for scoring purposes.
        Your response should be in JSON format as follows:
        ["session_id": {session_id}, "idea_score": "idea_score","coherence_score": "coherence_score", "reason": "Explain briefly why you gave these scores, citing specific examples or patterns from the article."]
        —–
        score:
    """
    answer = chatgpt_prompter(EVALUATION_PROMPT_TEMPLATE)
    # print("system_prompt: ", EVALUATION_PROMPT_TEMPLATE)
    # print(answer)
    score = process_evaluate(answer)
    # print(score)
    result.append({
        'Prompt': EVALUATION_PROMPT_TEMPLATE,
        'Evaluation': answer,
        'Score': score
    })

    return result

In [82]:
topic_dir = os.path.join(import_data_dir, f"{dataset_name}.csv")

judge_score = []
topic_df = pd.read_csv(topic_dir)
for file_name in os.listdir(new_json_path):
    if file_name.endswith(".json"):
        file_path = os.path.join(new_json_path, file_name)
        session_id = os.path.splitext(file_name)[0]
        session = topic_df[topic_df["session_id"] == session_id]
        if not session.empty:
            topic = session["prompt_code"].values[0]
            intro, article = read_sentences(file_path)
            result = evaluate_prompt(session_id, topic, intro, article)
            for item in result:
              item["score"] = result[0]["Score"]
            judge_score.append({
                "session_id": session_id,
                "judge_score": result[0]
            })

## Clean Data

Convert the sentence into length of the sentence

In [83]:
def clean(data):
    for entry in data:
        length = len(entry["sentence"])
        entry.pop("residual_vector", None)
        entry["sentence"] = length / 3000
    return data

for file_name in os.listdir(new_segment_results_path):
    if file_name.endswith(".json"):
        file_path = os.path.join(new_segment_results_path, file_name)
        cleaned_data = clean(load_json(file_path))
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=4)

Clean the evaluation score, drop explanation

In [84]:
def clean_judge_score(data):
    judge = data.get("judge_score", {})
    score = judge.get("score", judge.get("Score", None)) if isinstance(judge, dict) else judge
    data["judge_score"] = score

    return data
judge_score = [clean_judge_score(score) for score in judge_score]

## Calculate Features

In [85]:
prompt_code = []
def find_prompt_code(csv_path):
    with open(csv_path, mode='r', encoding='utf-8-sig') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            session_id = row.get('session_id')
            topic = row.get('prompt_code')
            if session_id and topic:
                prompt_code.append({
                    "session_id": session_id,
                    "prompt_code": topic
                })
    
    return prompt_code

find_prompt_code(csv_path)

[{'session_id': '0c44adf9178a443a9fd7a5a2edaeb7c4', 'prompt_code': 'writing'},
 {'session_id': '2c4b5bda6f6f43ab9ffc990e7cae9693', 'prompt_code': 'writing'},
 {'session_id': '6df918d72046461a98d275bf3fac31d0', 'prompt_code': 'writing'},
 {'session_id': '8c2eb554dcc64935b8bbb450f4acc622', 'prompt_code': 'writing'},
 {'session_id': '12bc2ababfd341f4b09c4716b3e17541', 'prompt_code': 'writing'},
 {'session_id': '69fec119c2304acf9b079da625d9bfec', 'prompt_code': 'writing'},
 {'session_id': '82a43f771d0545b1b5bcc2f0e6d7b46c', 'prompt_code': 'writing'},
 {'session_id': '180c987fde934a02a50f81f9e981a087', 'prompt_code': 'writing'},
 {'session_id': '295c3df7ff5443aa9afebc8168183a64', 'prompt_code': 'writing'},
 {'session_id': '688f06a6d9b447c08e2cde69034971d9', 'prompt_code': 'writing'},
 {'session_id': '2925c4bef43b49acabcade452c1b3c48', 'prompt_code': 'writing'},
 {'session_id': '2692060fbf604d9193c8a1a4dbc2b8c6', 'prompt_code': 'writing'},
 {'session_id': 'ffa00e543db5434a9c09f1ea48bf7c7b', 

In [86]:
length = []
def calculate_length(path):
  for file_name in os.listdir(path):
      session_id = os.path.splitext(file_name)[0]
      if file_name.endswith(".json"):
          file_path = os.path.join(path, file_name)
          data = load_json(file_path)
          sentence = data[-1]["sentence"] * 3000
          length.append({
              "session_id": session_id,
              "length": sentence
          })
  return length

calculate_length(new_segment_results_path)

[{'session_id': '0c44adf9178a443a9fd7a5a2edaeb7c4', 'length': 2893.0},
 {'session_id': '12bc2ababfd341f4b09c4716b3e17541', 'length': 2750.0},
 {'session_id': '180c987fde934a02a50f81f9e981a087', 'length': 3108.0},
 {'session_id': '2692060fbf604d9193c8a1a4dbc2b8c6', 'length': 2981.0},
 {'session_id': '2925c4bef43b49acabcade452c1b3c48',
  'length': 3098.9999999999995},
 {'session_id': '295c3df7ff5443aa9afebc8168183a64', 'length': 2826.0},
 {'session_id': '2c4b5bda6f6f43ab9ffc990e7cae9693', 'length': 3122.0},
 {'session_id': '688f06a6d9b447c08e2cde69034971d9',
  'length': 3010.0000000000005},
 {'session_id': '69fec119c2304acf9b079da625d9bfec', 'length': 2612.0},
 {'session_id': '6df918d72046461a98d275bf3fac31d0', 'length': 3520.0},
 {'session_id': '82a43f771d0545b1b5bcc2f0e6d7b46c', 'length': 2602.0},
 {'session_id': '8c2eb554dcc64935b8bbb450f4acc622', 'length': 2651.0},
 {'session_id': 'ffa00e543db5434a9c09f1ea48bf7c7b', 'length': 2731.0}]

In [89]:
AI_ratio = []
def calculate_AI_ratio(path):
  for file_name in os.listdir(path):
      ai_num = 0
      human_num = 0
      session_id = os.path.splitext(file_name)[0]
      if file_name.endswith(".json"):
          file_path = os.path.join(path, file_name)
          data = load_json(file_path)
          for d in data["actions"]:
              if d["name"] == "text-insert":
                  if d["eventSource"] == "api":
                      ai_num += len(d["text"])
                  else:
                      human_num += len(d["text"])
          all = ai_num + human_num
          AI_ratio.append({
              "session_id": session_id,
              "AI_ratio": ai_num / all
          })
  return AI_ratio

calculate_AI_ratio(new_json_path)

[{'session_id': '0c44adf9178a443a9fd7a5a2edaeb7c4',
  'AI_ratio': 0.16197623514696685},
 {'session_id': '12bc2ababfd341f4b09c4716b3e17541',
  'AI_ratio': 0.03415607080528547},
 {'session_id': '180c987fde934a02a50f81f9e981a087',
  'AI_ratio': 0.031906407870247275},
 {'session_id': '2692060fbf604d9193c8a1a4dbc2b8c6',
  'AI_ratio': 0.2697684578418523},
 {'session_id': '2925c4bef43b49acabcade452c1b3c48',
  'AI_ratio': 0.5093733261917515},
 {'session_id': '295c3df7ff5443aa9afebc8168183a64',
  'AI_ratio': 0.3422087745839637},
 {'session_id': '2c4b5bda6f6f43ab9ffc990e7cae9693',
  'AI_ratio': 0.2592028433612592},
 {'session_id': '688f06a6d9b447c08e2cde69034971d9',
  'AI_ratio': 0.4188303173354314},
 {'session_id': '69fec119c2304acf9b079da625d9bfec',
  'AI_ratio': 0.04336999377464204},
 {'session_id': '6df918d72046461a98d275bf3fac31d0',
  'AI_ratio': 0.39946342841005367},
 {'session_id': '82a43f771d0545b1b5bcc2f0e6d7b46c',
  'AI_ratio': 0.06017191977077364},
 {'session_id': '8c2eb554dcc64935b8b

In [90]:
sum_semantic_score = []
def calculate_sum_semantic_score(path):
  for file_name in os.listdir(path):
      session_id = os.path.splitext(file_name)[0]
      if file_name.endswith(".json"):
          file_path = os.path.join(path, file_name)
          data = load_json(file_path)
          score = 0
          for d in data:
            score += d["residual_vector_norm"]
          sum_semantic_score.append({
              "session_id": session_id,
              "sum_semantic_score": score
          })
  return sum_semantic_score

calculate_sum_semantic_score(new_segment_results_path)

[{'session_id': '0c44adf9178a443a9fd7a5a2edaeb7c4',
  'sum_semantic_score': 4.139451463697411},
 {'session_id': '12bc2ababfd341f4b09c4716b3e17541',
  'sum_semantic_score': 3.5075625079944888},
 {'session_id': '180c987fde934a02a50f81f9e981a087',
  'sum_semantic_score': 3.913163072523042},
 {'session_id': '2692060fbf604d9193c8a1a4dbc2b8c6',
  'sum_semantic_score': 8.866636043859767},
 {'session_id': '2925c4bef43b49acabcade452c1b3c48',
  'sum_semantic_score': 5.099654504022341},
 {'session_id': '295c3df7ff5443aa9afebc8168183a64',
  'sum_semantic_score': 6.207847484538334},
 {'session_id': '2c4b5bda6f6f43ab9ffc990e7cae9693',
  'sum_semantic_score': 5.221915516353953},
 {'session_id': '688f06a6d9b447c08e2cde69034971d9',
  'sum_semantic_score': 6.351442753271263},
 {'session_id': '69fec119c2304acf9b079da625d9bfec',
  'sum_semantic_score': 4.914721369998366},
 {'session_id': '6df918d72046461a98d275bf3fac31d0',
  'sum_semantic_score': 13.58427963999893},
 {'session_id': '82a43f771d0545b1b5bcc2

## Make Feature File

In [91]:
feature_names = ['prompt_code', 'judge_score', 'length', 'AI_ratio', 'sum_semantic_score']

def merge_2_csv_json(feature_names):
  data_lists = {
     'prompt_code': prompt_code,
      'judge_score': judge_score,
      'length': length,
      'AI_ratio': AI_ratio,
      'sum_semantic_score': sum_semantic_score,
  }

  dfs = [pd.DataFrame(data_lists[feature]) for feature in feature_names]
  df = dfs[0]
  for other_df in dfs[1:]:
      df = df.merge(other_df, on='session_id', how='outer')
  output_path_csv = os.path.join(static_dir, f"dataset/{dataset_name}", "session.csv")
  df.to_csv(output_path_csv, index=False)
  output_path_json = os.path.join(static_dir, f"dataset/{dataset_name}", "session.json")
  data = df.to_dict(orient='records')
  with open(output_path_json, "w", encoding="utf-8") as f:
      json.dump(data, f, ensure_ascii=False, indent=4)
merge_2_csv_json(feature_names)

In [92]:
dataset_name_path = os.path.join(static_dir, "dataset_name.json")
data_name = load_json(dataset_name_path)
if dataset_name not in data_name:
    data_name.append(dataset_name)
    with open(dataset_name_path, "w", encoding="utf-8") as f:
        json.dump(data_name, f, ensure_ascii=False, indent=4)
else:
    print(f"{dataset_name} already exist.")
