In [6]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=ce02437a09f84d5e59d3555ad2a6104c4dd0db6020ff03c1d5a34bb9b155dd1d
  Stored in directory: /root/.cache/pip/wheels/b5/7b/6d/b76b29ce11ff8e2521c8c7dd0e5bfee4fb1789d76193124343
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


# Load Dataset

In [2]:
from datasets import load_dataset
ds = load_dataset("coeuslearning/customerqueries")

customerqueries_L.csv:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

# Put Dataset In Simple Structure

In [3]:
dataset=[]
dataset_length=ds["train"].num_rows
for index in range(dataset_length):
    dataset.append({"query":ds["train"][index]["Query"],"answer":ds["train"][index]["Answer"]})

In [4]:
dataset[:3]

[{'query': 'How to connect the Printer to a computer?',
  'answer': 'To connect the Printer to a computer, use the provided USB cable and follow the instructions in the user manual.'},
 {'query': 'What are the dimensions of the Scanner?',
  'answer': 'The dimensions of the Scanner are 10 x 8 x 5 inches.'},
 {'query': 'Can the Laptop be used with both Windows and Mac operating systems?',
  'answer': 'Yes, the Laptop is compatible with both Windows and Mac operating systems.'}]

# Preprocessing Dataset

In [7]:
import re
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string
nltk.download("stopwords")
nltk.download('punkt')
nltk.download("wordnet")

! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


def remove_html_tags(text):
    html_pattern = r'<.*?>'
    without_html = re.sub(pattern=html_pattern, repl=' ', string=text)
    return without_html

def convert_to_lower(text):
    return text.lower()

def remove_urls(text):
    url_pattern = r'https?://\S+|www\.\S+'
    without_urls = re.sub(pattern=url_pattern, repl=' ', string=text)
    return without_urls

def spell_checker(text):
    spellChecker = Speller(lang="en")
    correct_words = []
    for word in nltk.word_tokenize(text):
        correct_word = spellChecker(word)
        correct_words.append(correct_word)
    correct_spell_text = " ".join(correct_words)
    return correct_spell_text


def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))


def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)


def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/w

In [8]:
def clean(text):
    cleaned_text=convert_to_lower(text)
    cleaned_text=remove_html_tags(cleaned_text)
    cleaned_text=remove_urls(cleaned_text)
    cleaned_text=remove_punctuation(cleaned_text)
    cleaned_text=remove_stopwords(cleaned_text)
    cleaned_text=lemmatizing(cleaned_text)
    cleaned_text= spell_checker(cleaned_text)
    return  cleaned_text
    
    

# Apply preprocessing on dataset

In [9]:
cleaned_dataset=[]
for data in dataset:
    cleaned_dataset.append({"query":clean(data["query"]),"answer":clean(data["answer"])})
    

In [10]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import DataLoader, Dataset

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query = self.data[idx]['query']
        answer = self.data[idx]['answer']
        encoding = self.tokenizer(
            query,
            answer,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        
        input_ids = encoding['input_ids'].flatten()
        labels = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()  

        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask 
        }



tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
qa_dataset = QADataset(cleaned_dataset, tokenizer)
train_loader = DataLoader(qa_dataset, batch_size=2, shuffle=True)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [11]:
tokenizer.pad_token=tokenizer.eos_token

In [12]:
import torch
from transformers import GPT2LMHeadModel, AdamW
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.train()
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)  
        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

model.save_pretrained("./fine_tuned_gpt2_qa")
tokenizer.save_pretrained("./fine_tuned_gpt2_qa")


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch 1/2, Loss: 0.00038444908568635583
Epoch 2/2, Loss: 0.0003422353183850646


('./fine_tuned_gpt2_qa/tokenizer_config.json',
 './fine_tuned_gpt2_qa/special_tokens_map.json',
 './fine_tuned_gpt2_qa/vocab.json',
 './fine_tuned_gpt2_qa/merges.txt',
 './fine_tuned_gpt2_qa/added_tokens.json')

In [13]:
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
fine_tuned_model = GPT2LMHeadModel.from_pretrained('/kaggle/working/fine_tuned_gpt2_qa')
tokenizer = GPT2Tokenizer.from_pretrained('/kaggle/working/fine_tuned_gpt2_qa')
fine_tuned_model.eval()
def generate_answer(query):
    query = clean(query)
    inputs = tokenizer.encode(query, return_tensors='pt')
    attention_mask = (inputs != tokenizer.pad_token_id).long() 
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_length=100, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


question = "I try all solutions but printer does not work"
answer = generate_answer(question)
modified_answer = re.sub(question, "", answer)
answer=modified_answer.strip()
print(f"Question: {question}")
print(f"Answer: {answer}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Question: I try all solutions but printer does not work
Answer: try solution printer workbenchmarks usb 30yes monitor support resolution printer 110 dpi


In [14]:
import zipfile
import os
folder_to_compress = '/kaggle/working/fine_tuned_gpt2_qa'  
zip_file_name = 'ChatbotCheckpoint.zip'  
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zip_file:
    for root, dirs, files in os.walk(folder_to_compress):
        for file in files:
            file_path = os.path.join(root, file)
            zip_file.write(file_path, os.path.relpath(file_path, folder_to_compress))

print(f'{zip_file_name} created successfully!')


ChatbotCheckpoint.zip created successfully!
