In [None]:
# Mobile export format (12-hour timestamps with 오전/오후)
import re
import pandas as pd
from google.colab import files
import io

# Upload the exported chat file through the Colab upload widget.
# Only the first uploaded file is read; it is decoded as UTF-8 text.
uploaded = files.upload()
file_content = list(uploaded.values())[0]
decoded = io.StringIO(file_content.decode('utf-8'))

messages = []
last_msg = None  # most recently kept message; continuation lines are appended to it

# Line patterns for the 12-hour (오전/오후) export.
# The sender group is non-greedy so the FIRST " : " separates sender from text.
# A greedy group split on the last " : ", which moved part of any message
# containing " : " into the sender name.
msg_line_pattern = re.compile(
    r"^\d{4}\. \d{1,2}\. \d{1,2}\. (?:오전|오후) \d{1,2}:\d{2}, (?P<user>.*?) : (?P<text>.*)$"
)
system_msg_pattern = re.compile(r"^\d{4}\. \d{1,2}\. \d{1,2}\. (오전|오후) \d{1,2}:\d{2}: .*$")
date_line_pattern = re.compile(r"^\d{4}년 \d{1,2}월 \d{1,2}일 \S요일$")

# Messages whose whole text is one of these placeholders are dropped.
media_placeholders = ("이모티콘", "사진", "동영상")

# Read the export line by line
for line in decoded:
    line = line.strip()

    # Ignore blank lines, system notices and date separator lines
    if not line or system_msg_pattern.match(line) or date_line_pattern.match(line):
        continue

    msg_match = msg_line_pattern.match(line)
    if msg_match:
        user = msg_match.group('user').strip()
        text = msg_match.group('text').strip()

        if text in media_placeholders:
            # Forget the previous message so that any following non-message
            # line is not glued onto an unrelated earlier message.
            last_msg = None
            continue

        last_msg = {'User': user, 'Message': text}
        messages.append(last_msg)
    elif last_msg:
        # A line without a timestamp continues the previous multi-line message
        last_msg['Message'] += "\n" + line

# Convert to a DataFrame; explicit columns keep the schema stable even when
# no message was parsed.
df = pd.DataFrame(messages, columns=['User', 'Message'])
df.head()


In [None]:
# Mobile export format (24-hour timestamps)
import re
import pandas as pd
from google.colab import files
import io

# Open the Colab upload widget; the result is handed to katalk_msg_parse_simple below.
uploaded = files.upload()

def katalk_msg_parse_simple(uploaded_file):
    """Parse a KakaoTalk mobile export with 24-hour timestamps.

    Args:
        uploaded_file: Mapping whose values are the raw bytes of uploaded
            files. Only the first value is read and it is decoded as UTF-8.

    Returns:
        pandas.DataFrame with columns ``User`` and ``Message``, one row per
        chat message. Lines that match no message pattern are appended to the
        previous message, separated by a newline. The two columns are present
        even when no message was found.

    Raises:
        IndexError: If ``uploaded_file`` is empty.
    """
    messages = []
    last_msg = None  # most recent message; continuation lines are appended to it

    # The sender group is non-greedy so the FIRST " : " separates sender from
    # text. A greedy group split on the last " : ", which moved part of any
    # message containing " : " into the sender name.
    msg_line_pattern = re.compile(r"^(\d{4}\. \d{1,2}\. \d{1,2}\. \d{1,2}:\d{2}), (.*?) : (.*)$")
    system_msg_pattern = re.compile(r"^\d{4}\. \d{1,2}\. \d{1,2}\. \d{1,2}:\d{2}: .*$")
    date_line_pattern = re.compile(r"^\d{4}년 \d{1,2}월 \d{1,2}일 \S요일$")

    file_content = list(uploaded_file.values())[0]
    decoded = io.StringIO(file_content.decode('utf-8'))

    for line in decoded:
        line = line.strip()

        # Skip blank lines, system notices and date separator lines
        if not line or system_msg_pattern.match(line) or date_line_pattern.match(line):
            continue

        msg_match = msg_line_pattern.match(line)
        if msg_match:
            _, user, text = msg_match.groups()
            last_msg = {'User': user.strip(), 'Message': text.strip()}
            messages.append(last_msg)
        elif last_msg:
            # No timestamp: this line continues the previous multi-line message
            last_msg['Message'] += "\n" + line

    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(messages, columns=['User', 'Message'])

# Parse the uploaded export and preview the first rows
df = katalk_msg_parse_simple(uploaded)
df.head()


In [None]:
# PC (desktop) export format
import re
import pandas as pd
from google.colab import files

def parse_kakao_pc_chat(uploaded=None):
    """Parse a KakaoTalk PC export whose lines look like ``[user] [time] text``.

    Args:
        uploaded: Optional mapping of file name to raw bytes. When omitted,
            the Colab upload widget is opened (the original behaviour). Only
            the first file in the mapping is read and it is decoded as UTF-8.

    Returns:
        pandas.DataFrame with columns ``User`` and ``Message``, one row per
        chat message. Lines that do not start a new message are appended to
        the current message, separated by a newline. The two columns are
        present even when no message was found.

    Raises:
        IndexError: If the mapping of uploaded files is empty.
    """
    if uploaded is None:
        uploaded = files.upload()  # interactive file upload
    file_name = list(uploaded.keys())[0]
    content = uploaded[file_name].decode('utf-8')
    lines = content.strip().split('\n')

    data = []
    last_user = None
    last_msg = ""

    msg_pattern = re.compile(r"^\[(.+?)\] \[(.+?)\] (.+)$")
    skip_patterns = [
        re.compile(r"^-{5,}"),              # date separator line
        re.compile(r"^\*\(안내\)"),         # system notice
    ]
    # Phrases found in the export's header lines
    header_phrases = ("님과 카카오톡 대화", "저장한 날짜")

    for line in lines:
        line = line.strip()
        if not line or any(p.match(line) for p in skip_patterns):
            continue

        match = msg_pattern.match(line)
        if match:
            # A new message starts: flush the one collected so far.
            # Message lines are recognised BEFORE the header filter so a real
            # message that merely mentions a header phrase is not dropped.
            if last_user is not None:
                data.append({'User': last_user, 'Message': last_msg.strip()})
            last_user = match.group(1)
            last_msg = match.group(3)
            continue

        # Non-message lines carrying a header phrase are export metadata
        if any(phrase in line for phrase in header_phrases):
            continue

        if last_user:
            # Continuation of a multi-line message
            last_msg += "\n" + line

    if last_user:
        data.append({'User': last_user, 'Message': last_msg.strip()})

    return pd.DataFrame(data, columns=['User', 'Message'])

# Run the PC-export parser (called without arguments it opens the upload widget) and preview the first rows
df = parse_kakao_pc_chat()
df.head()


In [None]:
# Display the parsed messages; `df` holds the result of whichever parser cell above was run last
pd.DataFrame(df)

In [None]:
# Merge consecutive messages from the same sender into a single row.
# Rows are collected in a list and the DataFrame is built once at the end;
# calling pd.concat inside the loop re-copies the frame on every sender change.
merged_rows = []
current_user = None
current_message = ""

for row in df.to_dict('records'):
    if current_user is None:
        # First row: start the first run
        current_user = row['User']
        current_message = row['Message']
    elif row['User'] == current_user:
        # Same sender as the previous row: join with a space (not a newline)
        current_message += " " + row['Message']
    else:
        # Sender changed: store the finished run and start a new one
        merged_rows.append({'User': current_user, 'Message': current_message.strip()})
        current_user = row['User']
        current_message = row['Message']

# Store the last run as well
if current_user:
    merged_rows.append({'User': current_user, 'Message': current_message.strip()})

merged_df = pd.DataFrame(merged_rows, columns=['User', 'Message'])


In [None]:
# Arrange the conversation into ordered prompt/response pairs
# Display name compared against the 'User' column to decide which messages become prompts
my_name = "이도현"

def clean_text(text):
    """Return the stripped text, or None when the message should be discarded.

    A message is discarded when it contains any of the media / deleted-message
    markers as a substring, or when nothing is left after stripping whitespace.
    """
    blocked_markers = ("이모티콘", "사진", "동영상", "삭제된 메시지입니다.")
    for marker in blocked_markers:
        if marker in text:
            return None

    stripped = text.strip()
    return stripped or None

# Walk over adjacent rows of merged_df; whenever a row written by `my_name`
# is directly followed by a row from someone else, keep the two as a
# prompt/response pair, provided both texts survive clean_text.
records = merged_df.to_dict('records')
pairs = []
i = 0
last_start = len(records) - 1
while i < last_start:
    first, second = records[i], records[i + 1]
    user1, msg1 = first['User'], first['Message']
    user2, msg2 = second['User'], second['Message']

    is_exchange = (user1 == my_name) and (user2 != my_name)
    if not is_exchange:
        i += 1
        continue

    cleaned_prompt = clean_text(msg1)
    cleaned_response = clean_text(msg2)
    if cleaned_prompt and cleaned_response:
        pairs.append(dict(
            user=user1,
            partner=user2,
            prompt=cleaned_prompt,
            response=cleaned_response,
        ))
    # Both rows are consumed, whether or not the pair was kept
    i += 2

dataa = pd.DataFrame(pairs)


In [None]:
# Display the prompt/response pairs
pd.DataFrame(dataa)

In [None]:
from datasets import Dataset
from transformers import PreTrainedTokenizerFast

checkpoint = "skt/kogpt2-base-v2"

# Load the tokenizer for `checkpoint` with explicit special tokens.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    checkpoint,
    bos_token='<s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>'
)

# The dedicated '<pad>' token stays as the padding token; it is NOT aliased to EOS.
# DataCollatorForLanguageModeling(mlm=False) replaces every label equal to
# pad_token_id with -100, so with pad == eos the '</s>' that ends each training
# text was excluded from the loss and the model was never trained to stop.
# NOTE(review): assumes '<pad>' already exists in the checkpoint vocabulary — confirm.
assert tokenizer.pad_token_id is not None
assert tokenizer.pad_token_id != tokenizer.eos_token_id


In [None]:
# Render every pair with the "### Me: / ### You:" template that the generation
# cells of this notebook prompt with. Training text previously had no such
# markers, so the prompts used at inference never appeared during fine-tuning.
# '</s>' marks the end of the response.
dataset = Dataset.from_pandas(dataa).map(
    lambda x: {"text": f"### Me: {x['prompt']}\n### You: {x['response']}</s>"}
)


In [None]:
def tokenize_and_set_labels(example):
    """Tokenize the ``text`` field and add ``labels`` equal to ``input_ids``.

    Uses the notebook-level ``tokenizer``. Inputs are truncated and padded to
    exactly 1024 tokens. ``labels`` is a shallow copy of ``input_ids``.
    """
    encode_options = dict(
        max_length=1024,
        truncation=True,
        padding="max_length",
    )
    encoded = tokenizer(example["text"], **encode_options)
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize in batches, then hold out 10% for evaluation.
# A fixed seed keeps the train/test split identical across runs; without it
# the evaluation set changed on every execution.
dataset = dataset.map(tokenize_and_set_labels, batched=True).train_test_split(test_size=0.1, seed=42)


In [None]:
import torch
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained weights for `checkpoint` and move them to the chosen device
model = GPT2LMHeadModel.from_pretrained(checkpoint)
model = model.to(device)

# Keep the model config's padding id in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id


In [None]:
# Mixed precision is enabled only when CUDA is available
use_fp16 = torch.cuda.is_available()

# Evaluate and checkpoint once per epoch; keep only the latest checkpoint
training_args = TrainingArguments(
    output_dir="kogpt2-finetuned-chat",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=20,
    fp16=use_fp16,
    report_to="none",
)

# mlm=False selects the causal-LM (non-masked) collation mode
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=causal_lm_collator,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
prompt = "### Me: 오늘 머함? \n### You:"

# Encode the prompt and move it to the device the model lives on
encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
input_ids = encoded_prompt.to(model.device)

# Conservative sampling settings for a short reply
sampling_options = dict(
    max_new_tokens=30,          # roughly one or two sentences
    do_sample=True,
    top_k=30,                   # low: favour common continuations
    top_p=0.6,                  # cap on cumulative probability
    temperature=0.5,            # less scattered output
    repetition_penalty=1.2,     # penalise repetition
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
output = model.generate(input_ids, **sampling_options)

print(tokenizer.decode(output[0], skip_special_tokens=True))


In [None]:
from transformers import pipeline

# Build the pipeline on the device the model already lives on.
# A hard-coded device=0 fails on a CPU-only runtime, although the model-loading
# cell of this notebook explicitly supports falling back to the CPU.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=model.device)

output = generator(
    "### Me: 여기다가 입력하면 됩니다 \n### You:",
    max_length=100,
    do_sample=True,
    temperature=0.8,     # more variety, but not too high
    top_k=50,            # sample only among the 50 most likely tokens
    top_p=0.9,           # sample within 90% cumulative probability
    repetition_penalty=1.2,  # key setting against repetition
    pad_token_id=tokenizer.eos_token_id
)

print(output[0]['generated_text'])
