In [30]:
''' NO DETAILS'''

import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm  # for displaying progress bar

# Load the FAQ data from the JSON file
with open('BU_MET_FAQs_small.json', 'r') as file:
    faq_data = json.load(file)

# Extracting only 'module', 'question', and 'answer' fields from faq_data
questions = [{'question': data['question'], 'module': data['module'], 'answer': data['answer']} for data in faq_data]

class FAQDataset(Dataset):
    def __init__(self, tokenizer, questions):
        self.tokenizer = tokenizer
        self.examples = []

        for item in questions:
            prompt = f"{item['question']} [Module: {item['module']}]"
            self.examples.append(tokenizer(prompt, item['answer'], truncation=True, padding='max_length', max_length=512))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the data for the GPT model
dataset = FAQDataset(tokenizer, questions)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.train()

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

# Save the model
model.save_pretrained('./trained_gpt_faq_model')

# Function to generate answers
# def ask_question(question, module):
#     model.eval()
#     prompt = f"{question} [Module: {module}]"
#     inputs = tokenizer.encode(prompt, return_tensors='pt')
#     outputs = model.generate(inputs, max_length=200, num_beams=5, temperature=0.7, top_k=50)
#     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return answer

def ask_question(question, module):
    model.eval()
    prompt = f"{question} [Module: {module}]"
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


  0%|          | 0/21 [00:02<?, ?it/s][A


KeyboardInterrupt: 

In [31]:
# Example usage
print(ask_question("What is the difference between the Master's Degree programs?", "Programs"))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the difference between the Master's Degree programs? [Module: Programs]

The Master's Degree program is a program that is designed to prepare students for the careers of the next generation of leaders in the field of leadership. The Master's Degree program is designed to prepare students for the careers of the next generation of leaders in the field of leadership.

The Master's Degree program is a program that is designed to prepare students for the careers of the next generation of leaders in the field of leadership.

The Master's Degree program is a program that is designed to prepare students for the careers of the next generation of leaders in the field of leadership.

The Master's Degree program is a program that is designed to prepare students for the careers of the next generation of leaders in the field of leadership.

The Master's Degree program is a program that is designed to prepare students for the careers of the next generation of leaders in the field of leadershi

In [25]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm  # for displaying progress bar

# Load the FAQ data from the JSON file
with open('BU_MET_FAQs.json', 'r') as file:
    faq_data = json.load(file)

class FAQDataset(Dataset):
    def __init__(self, tokenizer, questions):
        self.tokenizer = tokenizer
        self.examples = []

        for item in questions:
            answer_text = item['answer']
            for url in item['urls']:
                answer_text += f" {url['text']} ({url['url']}) - {url['content']}"
            
            prompt = f"{item['question']} [Module: {item['module']}]"
            self.examples.append(tokenizer(prompt, answer_text, truncation=True, padding='max_length', max_length=512))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the data for the GPT model
questions = [{'question': data['question'], 'module': data['module'], 'answer': data['answer'], 'urls': data['urls']} for data in faq_data]
dataset = FAQDataset(tokenizer, questions)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.train()

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

# Save the model
model.save_pretrained('./trained_gpt_faq_model')

# Function to generate answers
def ask_question(question, module):
    model.eval()
    prompt = f"{question} [Module: {module}]"
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [26]:
# Example usage
print(ask_question("How can I drop classes from my schedule?", "Registration"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How can I drop classes from my schedule? [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [Module: Registration] [


In [15]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_scheduler
from torch.utils.data import DataLoader
import torch
from tqdm.auto import tqdm

# 假设tokenizer和模型已经准备好
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model.train()

# 加载数据集
questions = [{'question': data['question'], 'module': data['module'], 'answer': data['answer'], 'urls': data['urls']} for data in faq_data]
dataset = FAQDataset(tokenizer, questions)  # 确保已正确实现FAQDataset类
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# 优化器与学习率调度器
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=5 * len(data_loader))

progress_bar = tqdm(range(5 * len(data_loader)))

# 训练循环
for epoch in range(5):
    for batch in data_loader:
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

model.eval()  # 切换到评估模式

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

  0%|          | 0/105 [00:00<?, ?it/s]


KeyboardInterrupt

