In [None]:
!pip install -U transformers datasets accelerate gradio

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoModelForCausalLM, DataCollatorForLanguageModeling, pipeline
from datasets import load_dataset, Dataset
import torch
import json
import math

In [None]:
# Mount Google Drive at /content/drive so the chat database stored there
# can be opened from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import sqlite3
import pandas as pd

db_path = "/content/drive/My Drive/chat_backup.db"

# One chat identifier per conversation to export.
first_chat = 'FIRST NUMBER'
second_chat = 'SECOND NUMBER'
third_chat = 'THIRD NUMBER'
fourth_chat = 'FOURTH NUMBER'
fifth_chat = 'FIFTH NUMBER'

# Single query shared by every chat. The chat identifier is bound as a
# parameter (`?`) instead of being formatted into the SQL text.
# `m.date / 1000000000` plus the 2001-01-01 offset yields Unix seconds for the
# human-readable timestamp (presumably m.date is nanoseconds since 2001-01-01).
# Rows are ordered by the raw `m.date` (then rowid) rather than the
# second-resolution timestamp, so messages sent within the same second keep a
# stable, chronological order.
CHAT_QUERY = """
SELECT
  datetime(m.date / 1000000000 + strftime('%s', '2001-01-01'), 'unixepoch') AS timestamp,
  m.is_from_me,
  COALESCE(h.id, 'You') AS sender,
  m.text
FROM
  message m
LEFT JOIN
  handle h ON m.handle_id = h.rowid
JOIN
  chat_message_join cmj ON m.rowid = cmj.message_id
JOIN
  chat c ON cmj.chat_id = c.rowid
WHERE
  c.chat_identifier = ?
  AND m.text IS NOT NULL
ORDER BY
  m.date ASC,
  m.rowid ASC;
"""


def load_chat_messages(connection, chat_identifier):
    """Return all non-null text messages of one chat as a DataFrame.

    Args:
        connection: An open sqlite3 connection to the chat database.
        chat_identifier: Value matched against ``chat.chat_identifier``.

    Returns:
        A DataFrame with the columns ``timestamp``, ``is_from_me``,
        ``sender`` and ``text``, oldest message first.
    """
    return pd.read_sql_query(CHAT_QUERY, connection, params=(chat_identifier,))


conn = sqlite3.connect(db_path)
try:
    df1, df2, df3, df4, df5 = (
        load_chat_messages(conn, chat)
        for chat in (first_chat, second_chat, third_chat, fourth_chat, fifth_chat)
    )
finally:
    # Everything needed is now in memory; release the database handle.
    conn.close()

In [None]:
def parser(df):
    """Turn a chronological message frame into prompt/completion pairs.

    The frame is read row by row via ``df.to_dict("records")``; each row must
    provide ``is_from_me`` (0 for the other person, 1 for me) and ``text``.
    A pair is made of a run of consecutive incoming messages followed by the
    run of my replies that comes right after it. Incoming runs that never get
    a reply produce nothing.

    Returns:
        A list of dicts with the keys ``"prompt"`` and ``"completion"``.
    """
    rows = df.to_dict("records")
    total = len(rows)
    pairs = []

    def take_run(start, flag):
        """Collect the texts of consecutive rows whose is_from_me equals flag."""
        texts = []
        cursor = start
        while cursor < total and rows[cursor]["is_from_me"] == flag:
            texts.append(rows[cursor]["text"])
            cursor += 1
        return texts, cursor

    pos = 0
    # The very last row can never open a pair: nothing could follow it as a reply.
    while pos < total - 1:
        if rows[pos]["is_from_me"] != 0:
            # Only an incoming message may start a pair.
            pos += 1
            continue

        # Everything the other person sent before I answered ...
        friend_texts, pos = take_run(pos, 0)
        # ... followed by everything I sent back.
        my_texts, pos = take_run(pos, 1)

        if my_texts:
            prompt_text = "Friend: " + "\n".join(friend_texts) + "\nYou:"
            reply_text = " " + "\n".join(my_texts)
            pairs.append({
                "prompt": prompt_text.strip(),
                "completion": reply_text.strip()
            })

    return pairs

In [None]:
# Extract the prompt/completion pairs of every chat, in chat order ...
convo1, convo2, convo3, convo4, convo5 = (
    parser(frame) for frame in (df1, df2, df3, df4, df5)
)
# ... and pool them into one flat list used to build the dataset.
all_convos = [
    pair
    for convo in (convo1, convo2, convo3, convo4, convo5)
    for pair in convo
]

In [None]:
# Stop here with a clear message when nothing was extracted (for example when
# the chat identifiers are still placeholders) instead of failing later in the
# split/tokenize/train steps with a less obvious error.
if not all_convos:
    raise ValueError(
        "No prompt/completion pairs were extracted; "
        "check the chat identifiers and the database path."
    )
dataset = Dataset.from_list(all_convos)

In [None]:
# Now split into train/test
# A fixed seed keeps the shuffle, and therefore the held-out 10%, identical
# across kernel restarts so results are comparable between runs.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
#tokenizing data
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
  """Tokenize one prompt/completion pair as a single training text.

  Args:
    example: A dataset row with the string fields "prompt" and "completion".

  Returns:
    The tokenizer output for the joined text, truncated to 1024 tokens.
  """
  # parser() strips the completion, so the separating space has to be put back
  # here; otherwise the reply is glued to the prompt as "You:reply".
  text = example["prompt"] + " " + example["completion"]
  # No padding here: with one example per call it would be a no-op, and the
  # data collator below pads each batch when it is assembled.
  return tokenizer(text, truncation=True, max_length=1024)

tokenized = split_dataset.map(tokenize, batched=False)

# mlm = False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

In [None]:
def compute_metrics(eval_pred):
    """Compute next-token perplexity from evaluation logits and labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Assumes logits are shaped
            (batch, sequence, vocab) and labels (batch, sequence), with -100
            marking positions to ignore -- TODO confirm against the Trainer
            output in use.

    Returns:
        ``{"perplexity": float}``, the exponential of the mean cross-entropy
        over all non-ignored target tokens.
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]

    # float32 keeps the loss computable on CPU even if the logits arrive as half precision.
    logits = torch.as_tensor(logits, dtype=torch.float32)
    labels = torch.as_tensor(labels, dtype=torch.long)

    # Causal LM: the logits at position t score the token at position t + 1,
    # so drop the last logit and the first label before comparing them.
    shifted_logits = logits[..., :-1, :].reshape(-1, logits.shape[-1])
    shifted_labels = labels[..., 1:].reshape(-1)

    # Flatten and ignore -100s
    loss = torch.nn.functional.cross_entropy(
        shifted_logits,
        shifted_labels,
        ignore_index=-100,
        reduction='mean'
    )
    return {"perplexity": math.exp(loss.item())}

In [None]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir = "./gpt2-finetuned",
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 1,
    num_train_epochs = 3,
    logging_steps = 100,
    save_total_limit = 2,
    # Mixed precision requires a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are validated.
    fp16 = torch.cuda.is_available(),
    push_to_hub = False
)

# NOTE(review): no evaluation strategy is set above and trainer.evaluate() is
# not called in this cell, so compute_metrics presumably never runs during
# train() -- confirm whether a separate evaluation step is intended.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized["train"],
    eval_dataset = tokenized["test"],
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

trainer.train()

In [None]:
# Write the fine-tuned weights and the tokenizer files side by side into the
# same directory, model first.
for artifact in (model, tokenizer):
    artifact.save_pretrained("Mymic")

In [None]:
generator = pipeline("text-generation", model = model, tokenizer = tokenizer)
# Training prompts are built by parser() as "Friend: <text>\nYou:", so the
# inference prompt uses exactly that layout (label and text on one line).
prompt = "Friend: you wanna hang?\nYou:"
# max_length counts the prompt tokens as well as the generated ones.
output = generator(prompt, max_length = 50, do_sample = True, temperature = 0.6, top_p = 0.9)

print(output[0]["generated_text"])

In [None]:
import re

def clean_two_line_response(generated_text):
    """Extract at most two lines of the reply from raw generated text.

    The reply is whatever follows the first "You:" marker. If the model kept
    going and produced another speaker turn (a later line starting with
    "Friend:" or "You:"), the text is cut before that turn so it cannot leak
    into the reply. When no "You:" marker is present, the whole text is used
    unchanged as the reply source.

    Args:
        generated_text: Full text returned by the generator, prompt included.

    Returns:
        Up to two non-empty, stripped lines joined by a newline; an empty
        string if there is nothing to return.
    """
    # Extract everything after "You:"
    match = re.search(r"You:\s*(.*)", generated_text, re.DOTALL)
    if match:
        response = match.group(1)
        # Drop any follow-up turn. The pattern needs a preceding newline, so
        # it can never match at the very start of the reply itself.
        response = re.split(r"\n[ \t]*(?:Friend|You):", response, maxsplit=1)[0]
    else:
        response = generated_text

    # Split into non-empty lines
    lines = [line.strip() for line in response.strip().splitlines() if line.strip()]

    # Return the first two lines (joined by newline)
    return "\n".join(lines[:2])

In [None]:
# Same layout as the training prompts built by parser(): "Friend: <text>\nYou:".
prompt = "Friend: wanna hang?\nYou:"

# The pipeline returns a list of candidates; take the text of the first one.
raw_output = generator(prompt, max_length=60, do_sample=True, temperature=0.8)[0]["generated_text"]
cleaned = clean_two_line_response(raw_output)

print("Mymic:", cleaned)

In [None]:
import shutil
# Zip the directory written by save_pretrained("Mymic"). Use the same relative
# path as the save step rather than assuming the working directory is /content.
shutil.make_archive("Mymic", 'zip', "Mymic")

In [None]:
# Hand the zipped model to the browser as a download from the Colab runtime.
from google.colab import files
files.download("Mymic.zip")