In [None]:
import os
import json
from dotenv import load_dotenv
from langchain_groq import ChatGroq

# Load settings from a .env file via python-dotenv. Presumably these are API
# credentials for the Groq client imported above; none of the visible cells
# read them — TODO confirm they are still needed.
load_dotenv()

In [None]:
# Read the conversations file and parse it into `data`. Later cells treat
# `data` as a list of conversation dicts with "title" and "mapping" keys.
with open("../data/json/conversations.json", "r", encoding="utf-8") as f:
    raw_json = f.read()
    data = json.loads(raw_json)

In [None]:
# Accumulator for the collection loop in the next cell: one
# {"title": ..., "messages": [...]} dict per conversation that has text.
all_conversations = []

In [None]:
# Collect the non-empty text parts of the first five conversations into
# `all_conversations` as {"title": str, "messages": [str, ...]} dicts.

# Author roles to keep. None keeps messages from every role; set it to
# {"user"} to keep only what the user wrote.
KEEP_ROLES = None

# Rebuild the list from scratch so re-running this cell never duplicates entries.
all_conversations = []

for convo in data[:5]:
    # `or` also covers an explicit null title / mapping, not just a missing key.
    convo_title = convo.get("title") or "Untitled Conversation"
    mapping = convo.get("mapping") or {}

    messages = []

    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            # Some nodes carry no message at all; skip them.
            continue

        if KEEP_ROLES is not None:
            role = (msg.get("author") or {}).get("role")
            if role not in KEEP_ROLES:
                continue

        # Tolerate a missing or null "content" / "parts" instead of crashing.
        parts = (msg.get("content") or {}).get("parts") or []
        # Non-string parts and whitespace-only strings are dropped.
        messages.extend(
            p.strip() for p in parts if isinstance(p, str) and p.strip()
        )

    # Save only conversations that produced at least one message.
    if messages:
        all_conversations.append({
            "title": convo_title,
            "messages": messages
        })

In [None]:
# Preview up to seven collected conversations. `all_conversations` is built
# from data[:5], so the window has to start at the beginning of the list;
# a window such as [50:57] is always empty and prints nothing.
for c in all_conversations[:7]:
    print("Conversation Title:", c["title"])
    print("\n".join(c["messages"]))

In [None]:
# Quick look at the first conversation: its title and the keys of its record.
for first_convo in data[:1]:
    convo_title = first_convo['title']
    print(f"Conversation Title: {convo_title}")
    top_level_keys = first_convo.keys()
    print(top_level_keys)

In [None]:
# Dump the third node (index 2 in iteration order) of the first
# conversation's mapping: first its keys, then each field with its value.
for first_convo in data[:1]:
    convo_title = first_convo['title']
    print(f"Conversation Title: {convo_title}")
    nodes = first_convo['mapping']
    third_key = list(nodes)[2]
    node = nodes[third_key]
    print(node.keys())
    for field, value in node.items():
        print(field, ":", value)

In [None]:
# Show the author role and first content part of the sixth node (index 5 in
# iteration order) of the first conversation's mapping.
for convo in data[:1]:
    convo_title = convo['title']
    print(f"Conversation Title: {convo_title}")
    content = convo['mapping']
    node_keys = list(content)
    if len(node_keys) <= 5:
        # Short conversations have no node at this position.
        print(f"Only {len(node_keys)} nodes in this conversation; nothing at index 5")
        continue
    key = node_keys[5]
    msg = content[key].get('message')
    if not msg:
        # Nodes can exist without a message (the other loops in this notebook
        # skip them too), so report it instead of raising.
        print(f"Node {key} has no message")
        continue
    role = (msg.get('author') or {}).get('role')
    parts = (msg.get('content') or {}).get('parts') or []
    # Not every message has a non-empty "parts" list.
    first_part = parts[0] if parts else "<no parts>"
    print(role, ":", first_part)
    

In [None]:
# Transcript fragments: the export loop further down appends to this list and
# the final cell joins the fragments with "" when writing the file.
output_lines = []

In [None]:
import re

def extract_text(parts):
    """Join the textual items of a message's ``parts`` and strip Markdown markup.

    Args:
        parts: Iterable of message parts. Plain strings are used as-is, dicts
            contribute their ``"text"`` value when that value is a string, and
            everything else is skipped. ``None`` is treated as no parts.

    Returns:
        str: The concatenated text with ``*`` and backtick markers removed,
        underscore emphasis markers removed (underscores inside a word, as in
        ``snake_case``, are kept), leading ``#`` heading and ``>`` blockquote
        prefixes removed, trailing spaces/tabs dropped from each line, and
        blank lines collapsed. Returns ``""`` when there is no text.
    """
    texts = []
    for p in parts or []:
        if isinstance(p, str):
            texts.append(p)
        elif isinstance(p, dict) and isinstance(p.get("text"), str):
            # Dicts without a string "text" value carry no usable text.
            texts.append(p["text"])
    text = "".join(texts).strip()

    # Bold/italic asterisks and inline/fenced code backticks.
    text = re.sub(r'\*+|`+', '', text)
    # Underscores are emphasis markers only at a word edge; an underscore with
    # word characters on both sides belongs to an identifier and is kept.
    text = re.sub(r'(?<!\w)_+|_+(?!\w)', '', text)
    # Heading and blockquote prefixes at the start of a line.
    text = re.sub(r'^\s*#{1,6}\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*>\s*', '', text, flags=re.MULTILINE)
    # Strip trailing spaces/tabs BEFORE collapsing newlines, so that lines
    # containing only whitespace are seen as blank and collapsed as well.
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = re.sub(r'\n{2,}', '\n', text)

    return text

In [None]:
# Build the transcript fragments for every conversation: one header per
# conversation followed by its user/assistant messages.

# Start from an empty buffer so re-running this cell does not duplicate output.
output_lines = []

for convo in data:
    # `or` also covers an explicit null title / mapping, not just a missing key.
    convo_title = convo.get("title") or "Untitled"
    mapping = convo.get("mapping") or {}

    # The fragments are written with "".join(...), so every header after the
    # first must bring its own line breaks; otherwise it is glued onto the
    # last message of the previous conversation.
    header_prefix = "\n\n" if output_lines else ""
    output_lines.append(f"{header_prefix}### Conversation: {convo_title}")
    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            # Some nodes carry no message at all; skip them.
            continue
        # Tolerate missing or null "author" / "content" / "parts".
        role = (msg.get("author") or {}).get("role")
        parts = (msg.get("content") or {}).get("parts") or []
        text = extract_text(parts)

        # Only user and assistant turns with non-empty text are exported.
        if role in ("user", "assistant") and text:
            output_lines.append(f"\n{role.upper()}: {text}")

In [None]:
# Write the transcript fragments to disk as one string, then confirm.
with open("../data/json/clean_chat_history.txt", "w", encoding="utf-8") as out:
    transcript = "".join(output_lines)
    out.write(transcript)

print("Saved to clean_chat_history.txt")