In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the '/kaggle/input' tree and print the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Combine the directory currently being visited with the file name.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-data-katz/Train_QA_Pairs.csv
/kaggle/input/new-data-katz/Train_Sentence_Pairs.csv
/kaggle/input/chatbot-test/Test_QA_Pairs.csv
/kaggle/input/katzbot/gpt/gpt2_model.ipynb
/kaggle/input/katzbot/gpt/data/data/val_data.json
/kaggle/input/katzbot/gpt/data/data/train_data.json
/kaggle/input/katzbot/gpt/data/data/sc_val_data.json
/kaggle/input/katzbot/gpt/data/data/sc_train_data.json
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/config.json
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/merges.txt
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/training_args.bin
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/tokenizer.json
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/vocab.json
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/tokenizer_config.json
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/scheduler.pt
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/model.safetensors
/kaggle/input/katzbot/gpt/sc_pairs/checkpoint/special_tokens_map.json
/kaggle/input/katzbot/gpt/sc_pairs/checkpoin

In [1]:
!pip install transformers



In [2]:
import os
import torch
import pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from transformers import GPT2Config, GPT2Tokenizer, GPT2Model, AutoModelForCausalLM
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from tqdm import tqdm
import logging

# Ignore Python warnings whose text begins with the pad/eos token notice below.
# `message` is interpreted as a regular expression matched against the start of
# the warning message.
warnings.filterwarnings("ignore", message="Setting `pad_token_id` to `eos_token_id`*")

In [4]:
class CustomSeq2SeqModel(nn.Module):
    """Embedding -> LSTM -> linear model that emits vocabulary logits per position.

    Args:
        vocab_size: Number of embedding rows and size of the output projection.
        embed_dim: Width of the token embeddings (LSTM input size).
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between stacked LSTM layers. It is only passed
            to the LSTM when ``num_layers > 1``; with a single layer there is
            no "between layers" position, so it is forced to 0.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # nn.LSTM warns about (and ignores) non-zero dropout on a single layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def _decode(self, input_ids, state=None):
        """Run embedding, LSTM and projection; return ``(logits, lstm_state)``."""
        embedded = self.embedding(input_ids)
        outputs, state = self.lstm(embedded, state)
        return self.fc(outputs), state

    def forward(self, input_ids, attention_mask=None):
        """Return logits for every position of ``input_ids``.

        Args:
            input_ids: Integer token ids; the LSTM is built with
                ``batch_first=True``, so the first axis is the batch.
            attention_mask: Accepted for call compatibility; not used.

        Returns:
            Tensor of logits with the vocabulary on the last axis.
        """
        logits, _ = self._decode(input_ids)
        return logits

    def generate(self, input_ids, attention_mask=None, max_length=50):
        """Greedily append ``max_length`` new tokens to ``input_ids``.

        Note that ``max_length`` counts *generated* tokens, not the total
        sequence length. The module is switched to eval mode and left there.

        Args:
            input_ids: Prompt token ids, batch on the first axis.
            attention_mask: Accepted for call compatibility; not used.
            max_length: Number of tokens to append; values <= 0 append nothing.

        Returns:
            ``input_ids`` with the generated token ids concatenated along dim 1.
        """
        self.eval()
        with torch.no_grad():
            if max_length <= 0:
                return input_ids

            pieces = [input_ids]
            # Encode the prompt once, then feed only the newest token together
            # with the carried LSTM state instead of re-running the whole
            # sequence on every step.
            logits, state = self._decode(input_ids)
            for step in range(max_length):
                # argmax over logits equals argmax over softmax(logits).
                next_token_ids = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
                pieces.append(next_token_ids)
                if step + 1 < max_length:
                    logits, state = self._decode(next_token_ids, state)
            return torch.cat(pieces, dim=1)

In [5]:
class CustomDataset(Dataset):
    """CSV-backed dataset that joins two text columns and tokenizes them on access.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", max_length=512,
            padding="max_length", truncation=True)`` whose result is indexed
            with ``'input_ids'`` and ``'attention_mask'``.
        filepath: Path of the CSV file, read with ``pd.read_csv``.
        pair_type: ``'sc'`` selects the ``'Sentence 1'`` / ``'Sentence 2'``
            columns; any other value selects ``'question'`` / ``'answer'``.
    """

    def __init__(self, tokenizer, filepath, pair_type):
        self.tokenizer = tokenizer
        self.pair_type = pair_type
        self.data = pd.read_csv(filepath)

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx``.

        The two selected columns are joined with a single space before
        tokenization; the first axis of each returned tensor is squeezed away.
        """
        first_col, second_col = (
            ('Sentence 1', 'Sentence 2') if self.pair_type == 'sc' else ('question', 'answer')
        )
        row = self.data.iloc[idx]
        text = row[first_col] + " " + row[second_col]
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=512,
            padding="max_length",
            truncation=True,
        )
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

In [6]:
def train_model(model, dataset, device, num_epochs=50):
    """Train ``model`` as a next-token predictor on ``dataset``.

    Batches of 4 shuffled examples are optimized with Adam (lr=5e-5) and a
    cross-entropy loss in which the logits at position ``t`` are scored
    against the token at position ``t + 1``. Positions whose attention mask
    is 0 are excluded from the loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``. It may
            return either an object exposing ``.logits`` or the logits tensor
            itself.
        dataset: Dataset yielding ``(input_ids, attention_mask)`` pairs.
        device: Device the batches are moved to; the model is expected to
            already live there.
        num_epochs: Number of passes over ``dataset``.

    Returns:
        The same ``model`` instance after training.
    """
    model.train()
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    # CrossEntropyLoss skips targets equal to its default ignore_index (-100).
    loss_fn = CrossEntropyLoss()
    for epoch in range(num_epochs):
        for input_ids, attention_mask in tqdm(dataloader, desc=f"Training Epoch {epoch + 1}"):
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Support both output objects with `.logits` and plain tensors.
            logits = getattr(outputs, "logits", outputs)

            # Padding must not contribute to the loss.
            labels = input_ids.masked_fill(attention_mask == 0, -100)
            # Shift so that position t predicts token t + 1; without the shift
            # a causal model is merely trained to echo its own input.
            shift_logits = logits[:, :-1, :]
            shift_labels = labels[:, 1:]
            # reshape (not view): the slices above are not contiguous.
            loss = loss_fn(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        print(f"Epoch {epoch + 1} complete.")
    return model

In [7]:
def generate_responses(model, tokenizer, test_queries, device):
    """Generate one response string per query.

    Each query is tokenized (padded/truncated to 512 tokens), passed to
    ``model.generate(..., max_length=550)``, and the first returned sequence is
    decoded with special tokens skipped. The prompt is removed from the result:
    by token count when the returned ids begin with the prompt ids, otherwise
    by stripping the query text if the decoded string starts with it.

    Args:
        model: Object providing ``eval()`` and
            ``generate(input_ids, attention_mask=..., max_length=...)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        test_queries: Iterable of query strings.
        device: Device the tokenized inputs are moved to.

    Returns:
        List of response strings, in the order of ``test_queries``.
    """
    model.eval()
    responses = []
    for query in test_queries:
        inputs = tokenizer(query, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        with torch.no_grad():
            output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=550)

        sequence = output_ids[0]
        prompt_len = input_ids.shape[1]
        echoes_prompt = (
            sequence.shape[0] >= prompt_len
            and torch.equal(sequence[:prompt_len], input_ids[0])
        )
        if echoes_prompt:
            # Drop the prompt by token count: decoding does not always
            # reproduce the query text character for character, so a string
            # prefix check can silently leave the question in the response.
            clean_response = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True).strip()
        else:
            # The output does not start with the prompt ids; fall back to
            # removing the query text when the decoded string begins with it.
            response = tokenizer.decode(sequence, skip_special_tokens=True)
            clean_response = response[len(query):].strip() if response.startswith(query) else response
        responses.append(clean_response)
    return responses

In [None]:
def main():
    """Train a GPT-2-configured causal LM in two stages and write test responses.

    Steps: build the GPT-2 tokenizer (EOS reused as pad token, left padding),
    create a model from the GPT-2 config, train 2 epochs on the sentence-pair
    CSV, save and reload the weights, train 2 more epochs on the QA CSV,
    generate a response for every test question and write questions, reference
    answers and generated responses to ``/kaggle/working/test_results4.csv``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    config = GPT2Config.from_pretrained("gpt2")
    # NOTE(review): this cell previously instantiated `CustomGPT2Model`, a name
    # that is not defined anywhere in this notebook, so a fresh kernel failed
    # with NameError. A causal LM built from the same config provides the
    # `.logits` output and `.generate()` method the other cells rely on.
    # Confirm this matches the intended architecture.
    model_instance = AutoModelForCausalLM.from_config(config)
    model_instance.to(device)

    # Train with Sentence Pairs
    sc_train_dataset = CustomDataset(tokenizer, '/kaggle/input/new-data-katz/Train_Sentence_Pairs.csv', 'sc')
    model_instance = train_model(model_instance, sc_train_dataset, device, num_epochs=2)

    # Save model weights
    torch.save(model_instance.state_dict(), '/kaggle/working/model_weights.pth')

    # Reload model weights; map_location keeps the tensors on the active device
    # even if the checkpoint was written from a different one.
    model_instance.load_state_dict(torch.load('/kaggle/working/model_weights.pth', map_location=device))
    model_instance.to(device)

    # Continue training with QA Pairs
    qa_train_dataset = CustomDataset(tokenizer, '/kaggle/input/new-data-katz/Train_QA_Pairs.csv', 'qa')
    model_instance = train_model(model_instance, qa_train_dataset, device, num_epochs=2)

    # Generate responses for the held-out test questions with the trained model
    test_data = pd.read_csv("/kaggle/input/chatbot-test/Test_QA_Pairs.csv")
    test_queries = test_data['Question'].tolist()

    responses = generate_responses(model_instance, tokenizer, test_queries, device)

    test_answers = test_data['Answer'].tolist()
    results_df = pd.DataFrame({
        'Question': test_queries,
        'Reference Answer': test_answers,
        'Generated Response': responses
    })
    results_df.to_csv('/kaggle/working/test_results4.csv', index=False)
    print("Test responses saved to test_results4.csv.")

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Training Epoch 1:  21%|██        | 327/1570 [1:33:01<5:51:19, 16.96s/it]