In [None]:
import pandas as pd
import re

In [None]:
# Load the dialogue CSV into a DataFrame (absolute /content path).
df = pd.read_csv('/content/daily_dialogue.csv')

In [None]:
# Keep only the dialogue text. Assigning the result (instead of inplace=True)
# together with errors='ignore' makes this cell safe to re-run: a second run
# no longer raises KeyError because the columns are already gone.
df = df.drop(columns=['act', 'emotion'], errors='ignore')

In [None]:
# Display the frame; the rendering is truncated to the first and last rows.
df

Unnamed: 0,dialog
0,"['Say , Jim , how about going for a few beers ..."
1,"['Can you do push-ups ? '\n "" Of course I can ..."
2,"['Can you study with the radio on ? '\n ' No ,..."
3,['Are you all right ? '\n ' I will be all righ...
4,"['Hey John , nice skates . Are they new ? '\n ..."
...,...
13113,"['Hello , who is speaking ? '\n ' Hello , this..."
13114,['Ahh ... What a fine day ! I do feel like an ...
13115,"[""I'm so sorry about your brother , Mr . Wang ..."
13116,"['Hi , Jeny.Are still working ? '\n "" Hi , Nac..."


In [None]:
# Store an explicit string version of the dialogue column in a new column.
df['dialog_str'] = df['dialog'].astype(str)

In [None]:
# Preview the dialogues without their enclosing list brackets.
# The previous pattern r'\[\]' only matched the literal two-character text
# "[]", so nothing was removed. Anchoring on the outer brackets removes just
# those and leaves any bracket inside an utterance alone. The result is only
# displayed, not stored, so df['dialog_str'] itself is unchanged.
df['dialog_str'].str.replace(r'^\s*\[|\]\s*$', '', regex=True)

Unnamed: 0,dialog_str
0,"['Say , Jim , how about going for a few beers ..."
1,"['Can you do push-ups ? '\n "" Of course I can ..."
2,"['Can you study with the radio on ? '\n ' No ,..."
3,['Are you all right ? '\n ' I will be all righ...
4,"['Hey John , nice skates . Are they new ? '\n ..."
...,...
13113,"['Hello , who is speaking ? '\n ' Hello , this..."
13114,['Ahh ... What a fine day ! I do feel like an ...
13115,"[""I'm so sorry about your brother , Mr . Wang ..."
13116,"['Hi , Jeny.Are still working ? '\n "" Hi , Nac..."


In [None]:
# Strip only the enclosing list brackets. The fixed slice x[1:-2] also cut off
# the character in front of the closing ']' (the last utterance's closing
# quote, or real text when a row is not shaped as expected), while leaving the
# matching opening quote in place.
df['new_dialog_str'] = df['dialog_str'].str.replace(r'^\s*\[|\]\s*$', '', regex=True)

In [None]:
# A bare expression is only echoed when it is the last line of a cell, so the
# type check has to be printed explicitly to show up next to df.info().
print(type(df['new_dialog_str'].iloc[1]))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13118 entries, 0 to 13117
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   dialog          13118 non-null  object
 1   dialog_str      13118 non-null  object
 2   new_dialog_str  13118 non-null  object
dtypes: object(3)
memory usage: 307.6+ KB


In [None]:
# Inspect the full text of the first processed dialogue string.
df['new_dialog_str'][0]

'\'Say , Jim , how about going for a few beers after dinner ? \'\n \' You know that is tempting but is really not good for our fitness . \'\n \' What do you mean ? It will help us to relax . \'\n " Do you really think so ? I don\'t . It will just make us fat and act silly . Remember last time ? "\n " I guess you are right.But what shall we do ? I don\'t feel like sitting at home . "\n \' I suggest a walk over to the gym where we can play singsong and meet some of our friends . \'\n " That\'s a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . "\n \' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . \'\n " Good.Let \' s go now . " \' All right . '

In [None]:
# Compare the dialog, dialog_str and new_dialog_str columns side by side.
df.head()

Unnamed: 0,dialog,dialog_str,new_dialog_str
0,"['Say , Jim , how about going for a few beers ...","['Say , Jim , how about going for a few beers ...","'Say , Jim , how about going for a few beers a..."
1,"['Can you do push-ups ? '\n "" Of course I can ...","['Can you do push-ups ? '\n "" Of course I can ...","'Can you do push-ups ? '\n "" Of course I can ...."
2,"['Can you study with the radio on ? '\n ' No ,...","['Can you study with the radio on ? '\n ' No ,...","'Can you study with the radio on ? '\n ' No , ..."
3,['Are you all right ? '\n ' I will be all righ...,['Are you all right ? '\n ' I will be all righ...,'Are you all right ? '\n ' I will be all right...
4,"['Hey John , nice skates . Are they new ? '\n ...","['Hey John , nice skates . Are they new ? '\n ...","'Hey John , nice skates . Are they new ? '\n '..."


In [None]:
# List the columns currently present in the frame.
df.columns

Index(['dialog', 'dialog_str', 'new_dialog_str'], dtype='object')

In [None]:
# NOTE(review): the output saved under this cell still lists the act/emotion
# columns, while the df.columns cell above reports dialog, dialog_str and
# new_dialog_str. It looks like a stale result from before the drop; re-run
# to refresh.
df.head()

Unnamed: 0,dialog,act,emotion
0,"['Say , Jim , how about going for a few beers ...",[3 4 2 2 2 3 4 1 3 4],[0 0 0 0 0 0 4 4 4 4]
1,"['Can you do push-ups ? '\n "" Of course I can ...",[2 1 2 2 1 1],[0 0 6 0 0 0]
2,"['Can you study with the radio on ? '\n ' No ,...",[2 1 2 1 1],[0 0 0 0 0]
3,['Are you all right ? '\n ' I will be all righ...,[2 1 1 1],[0 0 0 0]
4,"['Hey John , nice skates . Are they new ? '\n ...",[2 1 2 1 1 2 1 3 4],[0 0 0 0 0 6 0 6 0]


In [None]:
import re, json

# ===================================
# CLEANING FUNCTIONS
# ===================================

# Backslash escape sequences that clean_line converts to the character they
# stand for (\UXXXXXXXX, \uXXXX, \xXX, octal up to \377, and the simple ones).
_ESCAPE_SEQUENCE = re.compile(
    r"\\(?:U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|[0-3]?[0-7]{1,2}|[\\'\"abfnrtv])"
)


def _decode_escape(match):
    """Decode one matched escape sequence; return it untouched if that fails."""
    try:
        return match.group(0).encode("ascii").decode("unicode_escape")
    except UnicodeDecodeError:
        # e.g. a \U value outside the Unicode range: keep the raw text.
        return match.group(0)


def clean_line(line, strip_outer_quotes=True):
    """Normalise a single utterance line.

    Steps: trim surrounding whitespace, turn literal ``\\r\\n`` / ``\\n``
    sequences into real newlines, optionally strip leading/trailing quotes
    and whitespace, then decode remaining backslash escapes (``\\'`` -> ``'``).

    Only the escape sequences themselves are decoded. The previous approach
    pushed the whole line through a UTF-8 -> ``unicode_escape`` round trip,
    which re-read every non-ASCII character byte by byte as Latin-1 and so
    produced mojibake, and a bare ``except`` hid any failure.

    Args:
        line: Raw text of one utterance.
        strip_outer_quotes: When true, remove quotes/whitespace at both ends.

    Returns:
        The cleaned line.
    """
    line = line.strip()
    line = line.replace('\\r\\n', '\n').replace('\\n', '\n')
    if strip_outer_quotes:
        line = re.sub(r"^[\s'\"]+|[\s'\"]+$", "", line)
    return _ESCAPE_SEQUENCE.sub(_decode_escape, line)


def row_to_messages(s, strip_outer_quotes=True):
    """Split one dialogue string into alternating user/assistant messages.

    Every non-blank line of ``s`` becomes one message after being passed
    through ``clean_line``. Lines at even positions are tagged "user", lines
    at odd positions "assistant". ``None`` yields an empty list.
    """
    if s is None:
        return []

    text = s.replace('\r\n', '\n')
    # Literal backslash-n sequences are only expanded when the text contains
    # no real newline at all.
    if "\n" not in text and "\\n" in text:
        text = text.replace("\\n", "\n")

    roles = ("user", "assistant")
    messages = []
    position = 0
    for raw in text.split('\n'):
        if raw.strip() == "":
            continue
        messages.append({
            "role": roles[position % 2],
            "content": clean_line(raw, strip_outer_quotes),
        })
        position += 1
    return messages


def df_to_jsonl(df, col, out_path='dialogs.jsonl', strip_outer_quotes=True):
    """Write one ``{"messages": [...]}`` JSON object per row of ``df[col]``.

    Each cell is converted to ``str``, split into messages with
    ``row_to_messages`` and written as a single line of UTF-8 JSON
    (non-ASCII characters are kept as-is). Returns ``out_path``.
    """
    with open(out_path, 'w', encoding='utf-8') as sink:
        for cell in df[col].astype(str):
            record = {"messages": row_to_messages(cell, strip_outer_quotes)}
            sink.write(json.dumps(record, ensure_ascii=False))
            sink.write('\n')
    return out_path


# ===================================
# RUN ON YOUR EXISTING DATAFRAME
# ===================================

# Export the new_dialog_str column of the notebook's df as chat-format JSONL.
output_path = "/content/processed_dialogues.jsonl"
df_to_jsonl(df, col="new_dialog_str", out_path=output_path)

print("JSONL saved to:", output_path)

# Sanity check: echo the first three records of the file just written.
with open(output_path, "r", encoding="utf-8") as preview:
    preview_lines = [preview.readline() for _ in range(3)]
for record_line in preview_lines:
    print(record_line)


JSONL saved to: /content/processed_dialogues.jsonl
{"messages": [{"role": "user", "content": "Say , Jim , how about going for a few beers after dinner ?"}, {"role": "assistant", "content": "You know that is tempting but is really not good for our fitness ."}, {"role": "user", "content": "What do you mean ? It will help us to relax ."}, {"role": "assistant", "content": "Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?"}, {"role": "user", "content": "I guess you are right.But what shall we do ? I don't feel like sitting at home ."}, {"role": "assistant", "content": "I suggest a walk over to the gym where we can play singsong and meet some of our friends ."}, {"role": "user", "content": "That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them ."}, {"role": "assistant", "content": "Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent ex

## After creating processed_dialogues.jsonl, clean its contents

In [None]:
!pip install ftfy
import json, re
from ftfy import fix_text

input_path = "/content/processed_dialogues.jsonl"
output_path = "/content/processed_dialogues_cleaned.jsonl"

def clean_minimal(text):
    """Lightly normalise one message string; non-strings are returned as-is.

    Order of operations: ftfy ``fix_text``, removal of stray quote
    artifacts, trimming of leading/trailing quotes and whitespace,
    whitespace collapsing, then punctuation spacing.
    """
    if not isinstance(text, str):
        return text

    # Fix mojibake before any of the regex rules run.
    cleaned = fix_text(text)

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"[\"']\s*[\"']", " "),            # quote pairs such as `" '`, `' '`, `''`, `""`
        (r"[\"']{2,}", " "),                # any remaining run of two or more quotes
        (r"^[\"'\s]+|[\"'\s]+$", ""),       # leading / trailing quotes and whitespace
        (r"\s{2,}", " "),                   # collapse runs of whitespace
        (r"\s+([.,!?])", r"\1"),            # "word ." -> "word."
        (r"([.,!?])([A-Za-z])", r"\1 \2"),  # "one.Two" -> "one. Two"
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# -------------------------
# PROCESS THE JSONL FILE
# -------------------------

# Rewrite the JSONL file record by record, cleaning every message's content.
with open(input_path, encoding="utf-8") as src, \
     open(output_path, "w", encoding="utf-8") as dst:
    for raw_record in src:
        record = json.loads(raw_record)
        for message in record["messages"]:
            message["content"] = clean_minimal(message["content"])
        dst.write(json.dumps(record, ensure_ascii=False))
        dst.write("\n")

print("Saved cleaned file to:", output_path)


In [None]:
import json
import re

input_path = "/content/processed_dialogues_cleaned.jsonl"
output_path = "/content/processed_dialogues_apostrophes_fixed.jsonl"

# Regexes that re-attach a spaced-out apostrophe to the word it belongs to:
#   I ' m -> I'm,  don ' t -> don't,  Let ' s -> Let's,  o ' clock -> o'clock
# Only common contraction/possessive endings are joined. Joining every
# word ' word pair also glued single-quoted text to its neighbours
# (He said 'hello' to me -> He said'hello'to me).
APOSTROPHE_FIX = re.compile(r"(\w)\s*'\s*((?:s|t|m|d|re|ve|ll)\b)", re.IGNORECASE)
_OCLOCK_FIX = re.compile(r"\b(o)\s*'\s*(clock)\b", re.IGNORECASE)


def fix_apostrophes(text):
    """Collapse whitespace around apostrophes inside contractions.

    Args:
        text: Message content; anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The text with contractions such as ``I ' m`` rewritten to ``I'm``.
        Apostrophes used as quotation marks are left untouched.
    """
    if not isinstance(text, str):
        return text

    # Collapse patterns like: I ' m -> I'm
    text = APOSTROPHE_FIX.sub(r"\1'\2", text)
    # "o'clock" does not end in one of the short suffixes, handle it apart.
    text = _OCLOCK_FIX.sub(r"\1'\2", text)

    return text


# -------------------------
# PROCESS
# -------------------------

# Stream the cleaned JSONL through fix_apostrophes, one record per line.
with open(input_path, encoding="utf-8") as source, \
     open(output_path, "w", encoding="utf-8") as target:
    for serialized in source:
        record = json.loads(serialized)
        for message in record["messages"]:
            message["content"] = fix_apostrophes(message["content"])
        target.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Saved:", output_path)

Saved: /content/processed_dialogues_apostrophes_fixed.jsonl


## Now let's work with the human chat file

In [None]:
conversations = []
current_convo = []

# A "Human 1" message opens a new conversation when it starts with the
# greeting "hi" as a whole word ("Hi", "hi!", "hii", "hiya"). A plain
# startswith('hi') test also fired on words such as "his" or "history" and
# split a conversation in the middle.
greeting = re.compile(r"hi+(?:ya)?\b", re.IGNORECASE)

with open("/content/human_chat.txt", "r", encoding="utf-8") as fin:
    for raw_line in fin:
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('Human 1:'):
            # Split once on ':'; index 1 is the message text after the label.
            content = line.split(':', 1)[1].strip()

            if greeting.match(content):
                if current_convo:
                    conversations.append(current_convo)
                current_convo = []

            current_convo.append(('user', content))

        elif line.startswith('Human 2:'):
            content = line.split(':', 1)[1].strip()
            current_convo.append(('assistant', content))

# Flush the conversation that was still open at end of file.
if current_convo:
    conversations.append(current_convo)

print('Total convos: ', len(conversations))

Total convos:  94


In [None]:
# Serialise every parsed conversation as one {"messages": [...]} JSON line.
with open('/content/processed_human_chat.jsonl', 'w', encoding='utf-8') as sink:
    for turns in conversations:
        record = {
            "messages": [
                {"role": speaker, "content": utterance}
                for speaker, utterance in turns
            ]
        }
        json.dump(record, sink, ensure_ascii=False)
        sink.write("\n")