In [2]:
# High-level route: let `pipeline` assemble the tokenizer and model in one call.
# NOTE(review): `pipe` is not referenced by any later cell shown in this notebook.
from transformers import pipeline

pipe = pipeline(task="conversational", model="microsoft/DialoGPT-large")

In [1]:
# Low-level route: instantiate the tokenizer and the causal-LM weights ourselves,
# both from the same checkpoint name.
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "microsoft/DialoGPT-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [35]:
import pandas as pd

# Read the modmail/response pairs and relabel the columns in a single chained
# expression; the preview is the cell's displayed value.
df = pd.read_csv('../data/modmail.csv').set_axis(['modmail', 'response'], axis=1)
df.head()

Unnamed: 0,modmail,response
0,I unprivated everything except my last known l...,There’s a save button at the bottom of the rea...
1,"Hello i just got back to rotmg , i used a new ...",Hellos! Feel free to drop me <@103590142984141...
2,😦 can i send foot pic for early unsuspend,if you send a feet pic i’ll perma suspend u
3,how long,"Hello! If you’re referring to your suspension,..."
4,hello¿?,"Hi there, thanks for reaching out¡! How can we..."


In [36]:
import torch

# Adam over every parameter of the model, with a fixed step size.
learning_rate = 5e-5
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [64]:
# Reuse the EOS token as the padding token and pad on the left-hand side.
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token


def tokenize_data(df):
    """Tokenize the 'modmail' and 'response' columns of ``df``.

    Each column is turned into a list and passed to the module-level
    ``tokenizer``, padded to and truncated at 512 tokens.

    Returns:
        A ``(modmail_encodings, response_encodings)`` tuple.
    """
    def encode(column):
        # Same tokenizer settings for both columns.
        return tokenizer(df[column].tolist(), padding='max_length', truncation=True, max_length=512)

    return encode('modmail'), encode('response')


modmail_encodings, response_encodings = tokenize_data(df)

In [65]:
from torch.utils.data import DataLoader, TensorDataset

# A causal LM shifts `labels` internally, so labels must line up
# position-for-position with `input_ids`. Feeding the modmail tokens as inputs
# and the response tokens as labels (two unrelated sequences) therefore does
# not teach the model to answer. Instead, each training example is the single
# sequence  modmail <eos> response <eos>,  and the loss is restricted to the
# response part by setting every other label to IGNORE_INDEX.
IGNORE_INDEX = -100  # label value that the Hugging Face LM loss skips


def _build_causal_lm_tensors(prompt_ids, prompt_mask, reply_ids, reply_mask,
                             eos_id, pad_id, max_positions):
    """Join each prompt/reply pair into one right-padded causal-LM example.

    Args:
        prompt_ids, prompt_mask: padded token ids / attention mask of the prompts.
        reply_ids, reply_mask: padded token ids / attention mask of the replies.
        eos_id: token id appended after the prompt and after the reply.
        pad_id: token id used to right-pad the joined sequences.
        max_positions: upper bound on the joined sequence length.

    Returns:
        ``(input_ids, attention_mask, labels)`` as LongTensors of equal shape.
        ``labels`` equals ``input_ids`` on reply tokens and ``IGNORE_INDEX``
        on prompt and padding positions.
    """
    examples = []
    for p_ids, p_mask, r_ids, r_mask in zip(prompt_ids, prompt_mask, reply_ids, reply_mask):
        # Padding is identified through the attention mask, not the token id,
        # because the pad token was set to the EOS token.
        prompt = p_ids[p_mask.bool()].tolist() + [eos_id]
        reply = r_ids[r_mask.bool()].tolist() + [eos_id]

        # Stay within the model's position limit: drop the oldest prompt
        # tokens first, and only then cut the reply.
        overflow = len(prompt) + len(reply) - max_positions
        if overflow > 0:
            prompt = prompt[overflow:]
            reply = reply[:max_positions - len(prompt)]
        examples.append((prompt, reply))

    width = max((len(prompt) + len(reply) for prompt, reply in examples), default=0)

    input_rows, mask_rows, label_rows = [], [], []
    for prompt, reply in examples:
        n_real = len(prompt) + len(reply)
        n_pad = width - n_real
        input_rows.append(prompt + reply + [pad_id] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        label_rows.append([IGNORE_INDEX] * len(prompt) + reply + [IGNORE_INDEX] * n_pad)

    return tuple(
        torch.tensor(rows, dtype=torch.long)
        for rows in (input_rows, mask_rows, label_rows)
    )


# Convert tokenized data to PyTorch tensors
modmail_input_ids = torch.tensor(modmail_encodings['input_ids'])
modmail_attention_mask = torch.tensor(modmail_encodings['attention_mask'])
response_input_ids = torch.tensor(response_encodings['input_ids'])
response_attention_mask = torch.tensor(response_encodings['attention_mask'])

train_input_ids, train_attention_mask, train_labels = _build_causal_lm_tensors(
    modmail_input_ids,
    modmail_attention_mask,
    response_input_ids,
    response_attention_mask,
    eos_id=tokenizer.eos_token_id,
    pad_id=tokenizer.pad_token_id,
    max_positions=model.config.max_position_embeddings,
)

# Create TensorDataset; each item is (input_ids, attention_mask, labels),
# the order in which the training loop unpacks a batch.
dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# `from_pretrained` hands the model back in evaluation mode (dropout disabled),
# so it has to be switched to training mode explicitly before fine-tuning.
model.train()

for epoch in range(3):  # Number of epochs
    for batch in dataloader:
        # Each batch is (input_ids, attention_mask, labels). The model shifts
        # `labels` internally, so they are expected to be aligned with
        # `input_ids`, with -100 at positions that should not contribute to
        # the loss -- verify that the dataset cell builds them that way.
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

# Back to evaluation mode so later generation cells run without dropout.
# The trailing semicolon keeps the notebook from printing the model repr.
model.eval();

In [29]:
query = 'hello, i got disconnected from vc and was suspended. what do i do?'


def generate(query):
    """Generate the model's reply to a single user message.

    Args:
        query: the user message as plain text.

    Returns:
        The decoded reply text, without the prompt and without special tokens.
    """
    # Encode the query followed by EOS, and place the tensors on whatever
    # device the model currently lives on. The training cell may have moved
    # the model to CUDA, and CPU inputs would then raise a device mismatch.
    encoded = tokenizer(query + tokenizer.eos_token, return_tensors='pt').to(model.device)
    new_user_input_ids = encoded['input_ids']

    # Generate a response. The attention mask is passed explicitly so it does
    # not have to be inferred from pad_token_id, which equals the EOS id here.
    chat_output = model.generate(
        new_user_input_ids,
        attention_mask=encoded['attention_mask'],
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the tokens produced after the prompt and decode them.
    prompt_length = new_user_input_ids.shape[-1]
    response = tokenizer.decode(chat_output[0, prompt_length:], skip_special_tokens=True)

    return response

In [30]:
# Run the example query through the model; as the cell's last expression,
# the returned reply string is what the notebook displays.
generate(query)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"You can't play."