In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
import pandas as pd
import uuid
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, DataCollatorWithPadding, TrainingArguments, Trainer,pipeline
from datasets import load_dataset, concatenate_datasets

In [None]:
# Column layout for the hand-written question/answer examples.
column_names_qtn = ["id", "title", "context", "question", "answers"]

# Start from an empty frame; the rows are appended in a later cell.
qtndata = []
qtndf = pd.DataFrame(data=qtndata, columns=column_names_qtn)

In [None]:
# Hand-written coffee QA examples that are appended to `qtndf`.
# The contexts are defined first so that every answer offset can be derived
# from the text itself instead of being counted by hand.
context1="coffee is a beverage prepared from roasted coffee beans (ingredient). darkly colored, bitter, (appearance) and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content. it has the highest sales in the world market for hot drinks. the seeds of the coffea plant fruits are separated to produce unroasted green coffee beans. the beans are roasted and then ground into fine particles typically steeped in hot water before being filtered out, producing a cup of coffee. it is usually served hot, although chilled or iced coffee is common. coffee can be prepared and presented in a variety of ways for e.g., espresso, french press, caffe latte, or already-brewed canned coffee (flavors). sugar, sugar substitutes, milk, and cream are often added to mask the bitter taste or enhance the flavor ."
context2="The two most commonly grown coffee bean types are C. arabica and C . robusta. Coffee plants are cultivated in over 70 countries, primarily in the equatorial regions of the Americas, Southeast Asia, the Indian subcontinent, and Africa. As of 2018, Brazil was the leading grower of coffee beans, producing 35% of the world's total. Green, unroasted coffee is traded as an agricultural commodity. Despite sales of coffee reaching billions of dollars worldwide, farmers producing coffee beans disproportionately live in poverty. Critics of the coffee industry have also pointed to its negative impact on the environment and the clearing of land for coffee-growing and water use."


def _build_answers(context_text, question_to_answer):
    """Return ``{question: {"text": [...], "answer_start": [...]}}`` for one context.

    ``answer_start`` is the 0-based character offset of the first occurrence
    of the answer text in ``context_text``.

    Raises:
        ValueError: if an answer is not a literal substring of the context
            (raised by ``str.index``), which catches typos in the examples.
    """
    return {
        question: {"text": [answer_text], "answer_start": [context_text.index(answer_text)]}
        for question, answer_text in question_to_answer.items()
    }


lst = _build_answers(context1, {
    "what is coffee made of?": "roasted coffee beans",
    "what is coffee ingredient?": "roasted coffee beans",
    "what are the coffee flavors available?": "espresso, french press, caffe latte, or already-brewed canned coffee",
})
lst2 = _build_answers(context2, {
    "what are the coffee types?": "C. arabica and C . robusta. ",
    "what are the coffee varieties?": "C. arabica and C . robusta. ",
    "which countries cultivate coffee?": "70 countries",
})

# Rows are appended one at a time with `.loc`, exactly as before, so the
# resulting index of `qtndf` is unchanged for the cells that follow.
for title, context_text, examples in (('introduction', context1, lst), ('countries', context2, lst2)):
    for key, value in examples.items():
        # Skip questions that are already stored so re-running this cell
        # does not append duplicate rows.
        if (qtndf["question"] == key).any():
            continue
        print(value)
        qtndf.loc[len(qtndf.index)] = [str(uuid.uuid4().hex), title, context_text, key, value]

In [None]:
# Convert the hand-written examples into a `datasets.Dataset`.
temp = Dataset.from_pandas(qtndf)


In [None]:
# The pandas index may have been carried over as an "__index_level_0__" column.
# Drop it only when it exists: an unconditional removal raises when the column
# is absent, for example when this cell is run a second time.
if "__index_level_0__" in temp.column_names:
    temp = temp.remove_columns(["__index_level_0__"])

In [None]:
# Load SQuAD; later cells read its 'train' and 'validation' splits.
dataset = load_dataset("squad")

In [None]:
# Peek at the `answers` field of the first SQuAD training example.
dataset['train'][0]['answers']

In [None]:
# Disabled alternative: merge the custom rows with `concatenate_datasets`.
# dataset2 = concatenate_datasets([dataset['train'], temp])
# Display the loaded dataset object.
dataset

In [None]:

# The tokenizer and the question-answering model come from the same checkpoint.
QA_CHECKPOINT = 'distilbert-base-uncased-distilled-squad'

bert_tokenizer = DistilBertTokenizer.from_pretrained(QA_CHECKPOINT)
qa_bert = DistilBertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

In [None]:
# Display the loaded question-answering model.
qa_bert

In [None]:
# Parallel lists for the training examples (filled by later cells).
contexts, questions, answers = [], [], []

In [None]:
# Parallel lists for the validation examples (filled by a later cell).
vcontexts, vquestions, vanswers = [], [], []

In [None]:
# Copy every SQuAD training row into the parallel training lists.
train_targets = ((contexts, 'context'), (questions, 'question'), (answers, 'answers'))
for row in dataset['train']:
    for target, field in train_targets:
        target.append(row[field])

In [None]:
# Number of training contexts collected so far.
len(contexts)

In [None]:
# Append the hand-written examples to the same training lists.
custom_targets = ((contexts, 'context'), (questions, 'question'), (answers, 'answers'))
for row in temp:
    for target, field in custom_targets:
        target.append(row[field])

In [None]:
# Number of training contexts after the rows from `temp` were appended.
len(contexts)

In [None]:
# Copy every SQuAD validation row into the parallel validation lists.
val_targets = ((vcontexts, 'context'), (vquestions, 'question'), (vanswers, 'answers'))
for row in dataset['validation']:
    for target, field in val_targets:
        target.append(row[field])

In [None]:
# Number of validation contexts collected.
len(vcontexts)

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Each pair is encoded with the context as the first sequence and the question
# as the second. The encodings are later queried with `char_to_token`, using
# character offsets that refer to the context.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(contexts, questions, **encode_options)
val_encodings = tokenizer(vcontexts, vquestions, **encode_options)


In [None]:
def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings`` as QA training labels.

    Only the first answer of each example is used. Its character span is
    converted to token indices with ``encodings.char_to_token``. The end label
    is the token holding the answer's last non-whitespace character; it is a
    token index, like the start label, and not a character offset.

    Args:
        encodings: tokenizer output that provides
            ``char_to_token(batch_index, char_index)`` and ``update(dict)``.
        answers: one dict per example with parallel lists ``"text"`` and
            ``"answer_start"`` (character offset into the context).

    Returns:
        None. ``encodings`` is updated in place with ``'start_positions'`` and
        ``'end_positions'``. A position that cannot be mapped to a token
        (truncated, empty or missing answer) is set to
        ``tokenizer.model_max_length``, read from the module-level ``tokenizer``.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_token = None
        end_token = None
        if len(answer['answer_start']) > 0 and len(answer['text']) > 0:
            raw_text = answer['text'][0]
            core_text = raw_text.strip()
            if core_text:
                # Skip surrounding whitespace: a whitespace character belongs to
                # no token, so mapping it would yield None for a valid answer.
                leading = len(raw_text) - len(raw_text.lstrip())
                start_char = answer['answer_start'][0] + leading
                end_char = start_char + len(core_text)
                start_token = encodings.char_to_token(i, start_char)
                # end_char is exclusive, so the last answer character is end_char - 1
                end_token = encodings.char_to_token(i, end_char - 1)
        # if None, the answer passage has been truncated (or there is no answer)
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length
        start_positions.append(start_token)
        end_positions.append(end_token)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Label both splits in place with their answer positions.
for split_encodings, split_answers in ((train_encodings, answers), (val_encodings, vanswers)):
    add_token_positions(split_encodings, split_answers)

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset wrapper that serves one encoded example at a time as tensors."""

    def __init__(self, encodings):
        # Keep a reference to the encodings; nothing is copied here.
        self.encodings = encodings

    def __len__(self):
        # The example count is taken from the `input_ids` field.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build a tensor from the idx-th entry of every field in the encodings.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Wrap both encoded splits so a DataLoader can index them.
train_dataset, val_dataset = SquadDataset(train_encodings), SquadDataset(val_encodings)

In [None]:
# `model` is another name for the loaded QA model; both refer to the same object.
model = qa_bert

In [None]:

# Training configuration. The epoch count is named so that the loop and the
# progress message always agree.
NUM_EPOCHS = 10
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optim = AdamW(model.parameters(), lr=LEARNING_RATE)


def _run_epoch(model, loader, device, optimizer=None, desc=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in evaluation mode and gradient
    tracking is disabled.

    ``mean_loss`` is the batch-averaged loss. ``accuracy`` is the fraction of
    examples whose predicted start AND end index both equal the labels. Both
    values are 0.0 for an empty loader instead of raising ZeroDivisionError.
    """
    is_training = optimizer is not None
    model.train(is_training)

    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.set_grad_enabled(is_training):
        for batch in tqdm(loader, desc=desc):
            if is_training:
                optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_loss += loss.item()

            # Exact-match accuracy on the predicted start/end token indices
            start_preds = outputs.start_logits.argmax(dim=1)
            end_preds = outputs.end_logits.argmax(dim=1)
            correct_predictions += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += len(start_positions)

            if is_training:
                loss.backward()
                optimizer.step()

    mean_loss = total_loss / max(len(loader), 1)
    accuracy = correct_predictions / max(total_samples, 1)
    return mean_loss, accuracy


# Lists to store loss and accuracy for plotting
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

for epoch in range(NUM_EPOCHS):
    epoch_train_loss, train_accuracy = _run_epoch(model, train_loader, device, optimizer=optim, desc='train')
    epoch_val_loss, val_accuracy = _run_epoch(model, val_loader, device, desc='val')

    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}:')
    print(f'  Training Loss: {epoch_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}')
    print(f'  Validation Loss: {epoch_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    train_losses.append(epoch_train_loss)
    train_accuracies.append(train_accuracy)
    val_losses.append(epoch_val_loss)
    val_accuracies.append(val_accuracy)



# Plot the learning curves: loss on the left panel, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 5))

loss_ax.plot(train_losses, label='Training Loss')
loss_ax.plot(val_losses, label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(train_accuracies, label='Training Accuracy')
acc_ax.plot(val_accuracies, label='Validation Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

plt.show()


In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_savenew/'

# Create the output directory if needed. `exist_ok=True` replaces the separate
# existence check, so there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
