In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Load datasets
userA = pd.read_csv("/Desktop/Dataset/userA_chats.csv")
userB = pd.read_csv("/Desktop/Dataset/userB_chats.csv")

# Merge into a conversation dataset
# Assuming both files have 'message' column and that row i of userA is the
# reply to row i of userB -- TODO confirm against how the CSVs were exported.
# The two Series are aligned on their index, so files of different length
# (or blank cells) yield NaN entries; those incomplete pairs are dropped and
# the remaining values are forced to str because the tokenizer only accepts text.
conversation = (
    pd.DataFrame({
        'input': userB['message'],    # User B messages as input
        'target': userA['message']    # User A replies as target
    })
    .dropna(subset=['input', 'target'])
    .astype(str)
    .reset_index(drop=True)
)

# Split train/test (10% held out, fixed seed so the split is reproducible)
train_df, test_df = train_test_split(conversation, test_size=0.1, random_state=42)

# Load tokenizer (GPT-2 example)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenize
def tokenize_data(df):
    """Tokenize the conversation pairs of a DataFrame into padded tensors.

    Args:
        df: DataFrame with string columns 'input' and 'target'.

    Returns:
        The tokenizer output for the (input, target) text pairs, padded to the
        longest sequence in ``df``, truncated to the tokenizer's maximum
        length, and returned as PyTorch tensors.
    """
    # GPT-2's tokenizer ships without a padding token, so padding=True would
    # raise. Fall back to the EOS token, which is the usual choice for GPT-2.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer(
        df['input'].tolist(), df['target'].tolist(),
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

# Encode both splits with the same tokenizer settings (train first, then test)
train_encodings, test_encodings = (
    tokenize_data(split) for split in (train_df, test_df)
)

In [None]:
import torch
from transformers import GPT2LMHeadModel

# Instantiate GPT-2 small with a language-modelling head from the "gpt2" checkpoint
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Prefer CUDA when PyTorch can see a GPU, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
from torch.utils.data import Dataset, DataLoader

class ChatDataset(Dataset):
    """Map-style dataset over tokenized conversations for causal-LM training.

    Args:
        encodings: Mapping with tensor entries 'input_ids' and
            'attention_mask' of identical shape, one row per example.

    Each item is a dict with 'input_ids', 'attention_mask' and 'labels'.
    """

    def __init__(self, encodings):
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        # Labels are the input ids themselves (the model shifts them
        # internally), but padded positions are set to -100 so the loss
        # ignores them instead of training the model to emit padding.
        # masked_fill returns a new tensor, so input_ids is left untouched.
        self.labels = self.input_ids.masked_fill(self.attention_mask == 0, -100)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = ChatDataset(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Optimizer
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop (simplified)
model.train()
for epoch in range(2):  # Keep epochs low for demo
    # Accumulate the loss over the whole epoch: the last batch alone is a
    # noisy (batch_size=2) and misleading summary of how training is going.
    epoch_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    # An empty loader reports NaN instead of raising NameError on `loss`.
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch+1} Loss: {mean_loss}")

In [None]:
def generate_reply(userB_msg, max_length=50):
    """Generate User A's reply to a single User B message.

    Args:
        userB_msg: The incoming message text, used as the generation prompt.
        max_length: Maximum number of tokens to generate for the reply. The
            prompt's own tokens do not count against this budget, so a long
            prompt can no longer leave zero room for the answer.

    Returns:
        The decoded reply text only; the prompt is not echoed back.
    """
    model.eval()
    inputs = tokenizer(userB_msg, return_tensors="pt").to(device)
    prompt_len = inputs['input_ids'].shape[1]
    reply_ids = model.generate(
        **inputs,
        max_new_tokens=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    # For a decoder-only model, generate() returns prompt + continuation;
    # slice off the prompt so only the newly generated reply is decoded.
    reply = tokenizer.decode(reply_ids[0][prompt_len:], skip_special_tokens=True)
    return reply

# Example: ask the fine-tuned model to answer one sample message
userB_input = "Hey, are you coming to the party tonight?"
example_reply = generate_reply(userB_input)
print("User A Reply:", example_reply)