# Offline Chat-Reply Recommendation System (GPT-2)

This notebook builds an offline, context-aware reply recommendation model for two-person chats:
- Preprocess and tokenize long conversations efficiently
- Fine-tune GPT-2 offline (local weights)
- Generate coherent replies using User A history as context
- Evaluate with BLEU/ROUGE/Perplexity
- Save artifacts: `Model.joblib`, model dir, and guidance for `Report.pdf`



In [43]:
# Imports and setup
import os
import sys
import math
import json
import random
import itertools
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from transformers import (
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    set_seed,
)

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import joblib

import matplotlib.pyplot as plt

# Ensure NLTK resources (no internet needed for BLEU)
# BLEU does not require external downloads

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', DEVICE)
set_seed(42)



Device: cpu


In [44]:
# Config
class Config:
    # Primary dataset is the Excel file placed next to this notebook
    EXCEL_PATH = './conversationfile.xlsx - userAuserB.csv'  # adjust if your file lives elsewhere
    if not os.path.exists(EXCEL_PATH):
        # fallback: try parent directory
        EXCEL_PATH = '../conversationfile.xlsx - userAuserB.csv'

    # Model
    MODEL_NAME = 'gpt2'  # small; local weights assumed available
    MAX_SEQ_LEN = 512
    MAX_CONTEXT_TURNS = 8  # number of turns from history to include

    # Training
    OUTPUT_DIR = './gpt2_chatrec_output'
    NUM_EPOCHS = 1
    BATCH_SIZE = 2
    GRAD_ACC_STEPS = 8
    LR = 5e-5
    WARMUP_STEPS = 50
    WEIGHT_DECAY = 0.01
    FP16 = torch.cuda.is_available()

    # Generation
    GEN_MAX_NEW_TOKENS = 64
    TOP_K = 50
    TOP_P = 0.95
    TEMPERATURE = 0.8

    # Splits
    TRAIN_FRACTION = 0.9

print('EXCEL_PATH =', Config.EXCEL_PATH)

EXCEL_PATH = ./conversationfile.xlsx - userAuserB.csv


In [45]:
# Data loading from Excel

import warnings
warnings.filterwarnings('ignore')


def load_conversations_from_excel(xlsx_path: str) -> pd.DataFrame:
    """Load an Excel workbook containing two-person chats and return a normalized
    long-form dataframe with columns: conversation_id, turn_index, speaker, text.

    Supported common schemas (auto-detected):
    1) Long format: columns like [conversation_id, turn_index, speaker, text]
    2) Wide per-row turns: columns like [A, B] or [userA, userB] representing alternating utterances
    3) Single 'text' with 'speaker' columns, infers conversation_id if missing
    """
    if not os.path.exists(xlsx_path):
        raise FileNotFoundError(f"File not found at {xlsx_path}. Please ensure the file is a CSV.")

    # Load as CSV
    df = pd.read_csv(xlsx_path)
    cols_lower = {c.lower(): c for c in df.columns}

    def to_long(df_in: pd.DataFrame) -> pd.DataFrame:
        # Case 1: already long
        if {'conversation_id', 'turn_index', 'speaker', 'text'}.issubset(set(map(str.lower, df_in.columns))):
            c = {c.lower(): c for c in df_in.columns}
            out = pd.DataFrame()
            out['conversation_id'] = df_in[c['conversation_id']]
            out['turn_index'] = df_in[c['turn_index']]
            out['speaker'] = df_in[c['speaker']].astype(str).str.strip().str.upper().str[0]
            out['text'] = df_in[c['text']].astype(str)
            return out

        # Case 2: wide A/B columns
        a_col = cols_lower.get('a') or cols_lower.get('usera') or cols_lower.get('speaker_a')
        b_col = cols_lower.get('b') or cols_lower.get('userb') or cols_lower.get('speaker_b')
        if a_col and b_col:
            records = []
            conv_id = 0
            for _, row in df_in.iterrows():
                a_text = str(row[a_col]) if not pd.isna(row[a_col]) else None
                b_text = str(row[b_col]) if not pd.isna(row[b_col]) else None
                turn_idx = 0
                if a_text:
                    records.append({'conversation_id': conv_id, 'turn_index': turn_idx, 'speaker': 'A', 'text': a_text})
                    turn_idx += 1
                if b_text:
                    records.append({'conversation_id': conv_id, 'turn_index': turn_idx, 'speaker': 'B', 'text': b_text})
                conv_id += 1
            return pd.DataFrame.from_records(records)

        # Case 3: generic text + speaker
        text_c = cols_lower.get('text') or cols_lower.get('message') or list(df_in.columns)[-1]
        speaker_c = cols_lower.get('speaker') or None
        out = pd.DataFrame()
        out['text'] = df_in[text_c].astype(str)
        out['speaker'] = (df_in[speaker_c].astype(str) if speaker_c else 'A')
        out['speaker'] = out['speaker'].str.strip().str.upper().str[0]
        out['conversation_id'] = 0
        out['turn_index'] = out.groupby('conversation_id').cumcount()
        return out

    long_df = to_long(df)
    long_df = long_df.sort_values(by=['conversation_id', 'turn_index']).reset_index(drop=True)
    return long_df

conversations_df = load_conversations_from_excel(Config.EXCEL_PATH)
conversations_df.head()

Unnamed: 0,text,speaker,conversation_id,turn_index
0,"""Hey, did you see the client's feedback on the...",A,0,0
1,"""Just saw it. They want a lot of changes to th...",A,0,1
2,"""Yeah, that's what I was thinking. It's a big ...",A,0,2
3,"""I'll start on the revisions. Can you update t...",A,0,3
4,"""Will do. I'll block out the rest of the week ...",A,0,4


In [46]:
# Preprocessing: build (context -> next A reply) training examples

def build_examples(df: pd.DataFrame, max_context_turns: int) -> List[Dict[str, Any]]:
    examples: List[Dict[str, Any]] = []
    print(f"Total rows in dataframe: {len(df)}")
    for conv_id, group in df.groupby('conversation_id'):
        turns = group[['speaker', 'text']].values.tolist()
        print(f"  Processing conversation {conv_id} with {len(turns)} turns.")
        # We predict the next A reply when the last observed speaker is B
        for idx in range(len(turns)):
            # find indices where turns[idx] is B and there exists a following A
            if turns[idx][0] != 'B':
                # print(f"    Turn {idx}: Speaker is not B ({turns[idx][0]}), skipping.")
                continue
            # find next A after idx
            next_a_idx = None
            for j in range(idx + 1, len(turns)):
                if turns[j][0] == 'A':
                    next_a_idx = j
                    break
            if next_a_idx is None:
                # print(f"    Turn {idx}: Speaker is B, but no following A found, skipping.")
                continue
            start = max(0, idx - max_context_turns + 1)
            context_turns = turns[start:idx + 1]  # include B message
            target_a = turns[next_a_idx][1]
            examples.append({
                'conversation_id': conv_id,
                'context': context_turns,  # list of [speaker, text]
                'target': target_a,
            })
            # print(f"    Turn {idx}: Found example. Context length: {len(context_turns)}, Target: {target_a[:50]}...")

    print(f"Finished building examples. Total examples created: {len(examples)}")
    return examples

examples = build_examples(conversations_df, Config.MAX_CONTEXT_TURNS)
len(examples), examples[0] if examples else None

Total rows in dataframe: 22
  Processing conversation 0 with 22 turns.
Finished building examples. Total examples created: 0


(0, None)

In [47]:
# Tokenizer with speaker tokens

tokenizer = GPT2TokenizerFast.from_pretrained(Config.MODEL_NAME)
SPECIAL_TOKENS = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<A>', '<B>', '<SEP>']
}

# Add tokens if missing
num_added = tokenizer.add_special_tokens({k: v for k, v in SPECIAL_TOKENS.items() if k != 'additional_special_tokens'})
num_added += tokenizer.add_special_tokens({'additional_special_tokens': SPECIAL_TOKENS['additional_special_tokens']})
print('Added special tokens:', num_added)

# Helper to linearize context

def render_example(ex: Dict[str, Any]) -> str:
    parts: List[str] = []
    for speaker, text in ex['context']:
        marker = '<A>' if speaker == 'A' else '<B>'
        parts.append(f"{marker} {text} <SEP>")
    parts.append('<A>')  # indicate we want A to continue
    return ' '.join(parts)

# Train/Val split
random.shuffle(examples)
train_size = int(len(examples) * Config.TRAIN_FRACTION)
train_examples = examples[:train_size]
val_examples = examples[train_size:]
len(train_examples), len(val_examples)



Added special tokens: 6


(0, 0)

In [48]:
# Dataset and collator

class ChatDataset(Dataset):
    def __init__(self, examples: List[Dict[str, Any]], tokenizer: GPT2TokenizerFast, max_length: int):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        ex = self.examples[idx]
        prompt = render_example(ex)
        # Input is prompt + target + eos
        full_text = f"{prompt} {ex['target']} <eos>"
        encoding = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        # Labels equal input_ids for causal LM; set padding to -100
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

train_ds = ChatDataset(train_examples, tokenizer, Config.MAX_SEQ_LEN)
val_ds = ChatDataset(val_examples, tokenizer, Config.MAX_SEQ_LEN)
len(train_ds), len(val_ds)



(0, 0)

In [49]:
# Model and Trainer

model = GPT2LMHeadModel.from_pretrained(Config.MODEL_NAME)
# Resize embeddings to account for new special tokens
model.resize_token_embeddings(len(tokenizer))
# Ensure pad token id is set on config to avoid warnings during generation
model.config.pad_token_id = tokenizer.pad_token_id
model.to(DEVICE)

training_args = TrainingArguments(
    output_dir=Config.OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=Config.NUM_EPOCHS,
    per_device_train_batch_size=Config.BATCH_SIZE,
    per_device_eval_batch_size=Config.BATCH_SIZE,
    gradient_accumulation_steps=Config.GRAD_ACC_STEPS,
    # Removed evaluation_strategy and eval_steps for now
    save_steps=200,
    logging_steps=50,
    learning_rate=Config.LR,
    warmup_steps=Config.WARMUP_STEPS,
    weight_decay=Config.WEIGHT_DECAY,
    fp16=Config.FP16,
    report_to=[],  # disable wandb
)

# Standard LM collator; we already set labels, so no mlm
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    # eval_dataset=val_ds, # Removed eval_dataset as evaluation is disabled
    data_collator=collator,
)

print('Trainer ready')

Trainer ready


In [50]:
# Train (offline)

train_result = trainer.train()
metrics = train_result.metrics
trainer.save_model(Config.OUTPUT_DIR)
trainer.save_state()

print('Train metrics:', metrics)



ValueError: num_samples should be a positive integer value, but got num_samples=0

In [51]:
# Generation utility

def generate_reply(context_turns: List[Tuple[str, str]],
                   max_new_tokens: int = Config.GEN_MAX_NEW_TOKENS) -> str:
    prompt = ' '.join([(('<A>' if s=='A' else '<B>') + ' ' + t + ' <SEP>') for s, t in context_turns])
    prompt += ' <A>'
    inputs = tokenizer(prompt, return_tensors='pt').to(DEVICE)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=Config.TOP_K,
            top_p=Config.TOP_P,
            temperature=Config.TEMPERATURE,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    gen = tokenizer.decode(output_ids[0], skip_special_tokens=False)
    # Extract only the continuation after the last '<A>' marker
    last_a = gen.rfind('<A>')
    continuation = gen[last_a+3:] if last_a >= 0 else gen
    # stop at first special token
    for stop_tok in ['<eos>', '<SEP>', '<A>', '<B>']:
        idx = continuation.find(stop_tok)
        if idx != -1:
            continuation = continuation[:idx]
    return continuation.strip()

# Quick smoke test (if validation exists)
if len(val_examples) > 0:
    sample = val_examples[0]
    print('Context:', sample['context'])
    print('Target:', sample['target'])
    print('Generated:', generate_reply(sample['context']))



In [52]:
# Evaluation: BLEU, ROUGE, Perplexity

smooth = SmoothingFunction().method3


def compute_bleu(reference: str, hypothesis: str) -> float:
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smooth)


def rouge_n(reference: str, hypothesis: str, n: int = 1) -> Tuple[float, float, float]:
    # Simple ROUGE-N (recall, precision, F1) using token n-grams
    def ngrams(tokens: List[str], n: int):
        return list(zip(*[tokens[i:] for i in range(n)]))
    ref = reference.split()
    hyp = hypothesis.split()
    ref_ngrams = ngrams(ref, n)
    hyp_ngrams = ngrams(hyp, n)
    ref_counts = {}
    for g in ref_ngrams:
        ref_counts[g] = ref_counts.get(g, 0) + 1
    overlap = 0
    hyp_counts = {}
    for g in hyp_ngrams:
        hyp_counts[g] = hyp_counts.get(g, 0) + 1
    for g, c in hyp_counts.items():
        overlap += min(c, ref_counts.get(g, 0))
    recall = overlap / max(1, len(ref_ngrams))
    precision = overlap / max(1, len(hyp_ngrams))
    f1 = 0.0 if (recall + precision) == 0 else 2 * recall * precision / (recall + precision)
    return recall, precision, f1


def rouge_l(reference: str, hypothesis: str) -> Tuple[float, float, float]:
    # ROUGE-L based on LCS
    ref = reference.split()
    hyp = hypothesis.split()
    dp = [[0]*(len(hyp)+1) for _ in range(len(ref)+1)]
    for i in range(1, len(ref)+1):
        for j in range(1, len(hyp)+1):
            if ref[i-1] == hyp[j-1]:
                dp[i][j] = dp[i-1][j-1] + 1
            else:
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
    lcs = dp[-1][-1]
    recall = lcs / max(1, len(ref))
    precision = lcs / max(1, len(hyp))
    f1 = 0.0 if (recall + precision) == 0 else 2 * recall * precision / (recall + precision)
    return recall, precision, f1


def evaluate_dataset(eval_examples: List[Dict[str, Any]], num_samples: int = 100) -> Dict[str, float]:
    sample_examples = eval_examples[:max(1, min(num_samples, len(eval_examples)))]
    bleu_scores = []
    r1_f1 = []
    r2_f1 = []
    rl_f1 = []
    for ex in sample_examples:
        hyp = generate_reply(ex['context'])
        ref = ex['target']
        bleu_scores.append(compute_bleu(ref, hyp))
        r1_f1.append(rouge_n(ref, hyp, n=1)[2])
        r2_f1.append(rouge_n(ref, hyp, n=2)[2])
        rl_f1.append(rouge_l(ref, hyp)[2])
    ppl = math.exp(trainer.evaluate()['eval_loss']) if len(sample_examples) > 0 else float('inf')
    return {
        'BLEU': float(np.mean(bleu_scores)) if bleu_scores else 0.0,
        'ROUGE-1_F1': float(np.mean(r1_f1)) if r1_f1 else 0.0,
        'ROUGE-2_F1': float(np.mean(r2_f1)) if r2_f1 else 0.0,
        'ROUGE-L_F1': float(np.mean(rl_f1)) if rl_f1 else 0.0,
        'Perplexity': float(ppl),
    }

metrics_eval = evaluate_dataset(val_examples, num_samples=100)
metrics_eval



{'BLEU': 0.0,
 'ROUGE-1_F1': 0.0,
 'ROUGE-2_F1': 0.0,
 'ROUGE-L_F1': 0.0,
 'Perplexity': inf}

In [54]:
# Save artifacts: Model.joblib and tokenizer/model dirs

class InferenceModel:
    def __init__(self, model_dir: str, tokenizer_dir: str):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_dir)
        self.model = GPT2LMHeadModel.from_pretrained(model_dir).to(self.device)

    def reply(self, context_turns: List[Tuple[str, str]],
              max_new_tokens: int = Config.GEN_MAX_NEW_TOKENS) -> str:
        prompt = ' '.join([(('<A>' if s=='A' else '<B>') + ' ' + t + ' <SEP>') for s, t in context_turns])
        prompt += ' <A>'
        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_k=Config.TOP_K,
                top_p=Config.TOP_P,
                temperature=Config.TEMPERATURE,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
            )
        gen = self.tokenizer.decode(output_ids[0], skip_special_tokens=False)
        last_a = gen.rfind('<A>')
        continuation = gen[last_a+3:] if last_a >= 0 else gen
        for stop_tok in ['<eos>', '<SEP>', '<A>', '<B>']:
            idx = continuation.find(stop_tok)
            if idx != -1:
                continuation = continuation[:idx]
        return continuation.strip()

# Save tokenizer and model directories
model.save_pretrained(Config.OUTPUT_DIR, safe_serialization=False) # Added safe_serialization=False
tokenizer.save_pretrained(Config.OUTPUT_DIR)

# Save a lightweight joblib wrapper
wrapper = InferenceModel(Config.OUTPUT_DIR, Config.OUTPUT_DIR)
joblib.dump(wrapper, 'Model.joblib')

print('Artifacts saved to', Config.OUTPUT_DIR, 'and Model.joblib')

Artifacts saved to ./gpt2_chatrec_output and Model.joblib


## Report Guidance (for Report.pdf)

Include the following in your short report:
- Data description and preprocessing choices (context window, cleaning)
- Model choice rationale (GPT-2 small, causal LM for next-reply)
- Training setup (epochs, batch size, LR, hardware)
- Offline feasibility (no internet, local weights)
- Evaluation results (BLEU/ROUGE/Perplexity) with a brief interpretation
- Error analysis and example generations
- Deployment notes (latency on CPU/GPU, memory footprint, batching)

