### **Import Modules and Load/Clean Data**

In [1]:
# Import all the required modules
import requests
import textwrap
import random
from transformers import pipeline
import evaluate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from peft import LoraConfig, get_peft_model, TaskType




In [2]:
# Read in the txt file from the url 
url = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
response = requests.get(url)
text = response.text

In [3]:
# Define starting and ending markers to only focus on the text in the book
start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***"

start_idx = text.find(start_marker)
end_idx = text.find(end_marker)

In [4]:
# Focus on the text in between the start and end markers
if start_idx != -1 and end_idx != - 1:
    romeo_juliet_txt = text[start_idx + len(start_marker):end_idx].strip()

# Display the first 800 characters of the romeo and juliet text
print(romeo_juliet_txt[:800])

THE TRAGEDY OF ROMEO AND JULIET

by William Shakespeare




Contents

THE PROLOGUE.

ACT I
Scene I. A public place.
Scene II. A Street.
Scene III. Room in Capulet’s House.
Scene IV. A Street.
Scene V. A Hall in Capulet’s House.

ACT II
CHORUS.
Scene I. An open place adjoining Capulet’s Garden.
Scene II. Capulet’s Garden.
Scene III. Friar Lawrence’s Cell.
Scene IV. A Street.
Scene V. Capulet’s Garden.
Scene VI. Friar Lawrence’s Cell.

ACT III
Scene I. A public Place.
Scene II. A Room in Capulet’s House.
Scene III. Friar Lawrence’s cell.
Scene IV. A Room in Capulet’s House.
Scene V. An open Gallery to Juliet’s Chamber, overlooking the Garden.

ACT IV
Scene I. Friar Lawrence’s Cell.
Scene II. Hall in Capulet’s House.
Scene III. Juliet’s Chamber.
Scene IV.


In [5]:
# Chunk the text into 400 character chunks
words = romeo_juliet_txt.split()
chunks = []
current = []
max_char = 400

# Traverse through all words in the text
for word in words:
    # if the new word added exceeds chunk size
    if len(" ".join(current + [word])) > max_char:
        # store chunk in chunks
        chunks.append(" ".join(current))
        # start the next chunk (current) with the word
        current = [word]
    else:
        current.append(word) # add word to the current chunk if it does not exceed chunk size

# if there are leftover words in current
if current:
    # append the remaining words as a chunk to chunks
    chunks.append(" ".join(current))

In [6]:
# Display the number of chunks
len(chunks)

355

### **Creating Q&A Pairs**

In [6]:
# Helper method to view the chunks when creating Q&A pairs
print(textwrap.fill(chunks[70], width=100))

BENVOLIO. This wind you talk of blows us from ourselves: Supper is done, and we shall come too late.
ROMEO. I fear too early: for my mind misgives Some consequence yet hanging in the stars, Shall
bitterly begin his fearful date With this night’s revels; and expire the term Of a despised life,
clos’d in my breast By some vile forfeit of untimely death. But he that hath the steerage of my
course


In [11]:
# Act 1

# Create Q&A Pairs from Chunk 10 (2)
qas_chunk10 = [
    {
        "id": "chunk10-1",
        "question": "Which characters from the house of Montagues enter?",
        "answers": [
            {
                "text": "Abram and Balthasar",
                "answer_start": chunks[10].index("Abram and Balthasar")
            }
        ]
    },
    {
        "id": "chunk10-2",
        "question": "What does Sampson say is out?",
        "answers": [
            {
                "text": "naked weapon",
                "answer_start": chunks[10].index("naked weapon")
            }
        ]
    }, 
    {
        "id": "chunk10-3",
        "question": "How does Gregory respond when Sampson says he will back him",
        "answers": [
            {
                "text": "How? Turn thy back and run?",
                "answer_start": chunks[10].index("How? Turn thy back and run?")
            }
        ]
    },
    {
        "id": "chunk10-4",
        "question": "What does Gregory state to convey that he is worried about Sampson?",
        "answers": [
            {
                "text": "No, marry; I fear thee!",
                "answer_start": chunks[10].index("No, marry; I fear thee!")
            }
        ]
    }, 
    {
        "id": "chunk10-5",
        "question": "Which Montague family members make an entrance in the scene?",
        "answers": [
            {
                "text": "Abram and Balthasar",
                "answer_start": chunks[10].index("Abram and Balthasar")
            }
        ]
    },
    {
        "id": "chunk10-6",
        "question": "What does Sampson claim is ready to use?",
        "answers": [
            {
                "text": "naked weapon",
                "answer_start": chunks[10].index("naked weapon")
            }
        ]
    },
    {
        "id": "chunk10-7",
        "question": "How does Gregory react when Sampson says he will support him?",
        "answers": [
            {
                "text": "How? Turn thy back and run?",
                "answer_start": chunks[10].index("How? Turn thy back and run?")
            }
        ]
    },
    {
        "id": "chunk10-8",
        "question": "What does Gregory say to show he is anxious about Sampson?",
        "answers": [
            {
                "text": "No, marry; I fear thee!",
                "answer_start": chunks[10].index("No, marry; I fear thee!")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 30 (2)
qas_chunk30 = [
    {
        "id": "chunk30-1",
        "question": "Who says 'she'll not be hit with Cupid's arrow'?",
        "answers": [
            {
                "text": "ROMEO",
                "answer_start": chunks[30].index("ROMEO")
            }
        ]
    },
    {
        "id": "chunk30-2",
        "question": "Which goddess's wit is Rosaline said to have?",
        "answers": [
            {
                "text": "Dian’s wit",
                "answer_start": chunks[30].index("Dian’s wit")
            }
        ]
    }, 
    {
        "id": "chunk30-3",
        "question": "How does Romeo describe that Juliet won't listen to words of love?",
        "answers": [
            {
                "text": "She will not stay the siege of loving terms",
                "answer_start": chunks[30].index("She will not stay the siege of loving terms")
            }
        ]
    },
    {
        "id": "chunk30-4",
        "question": "Who does Romeo says misses based on their assessement of beautiful targets?",
        "answers": [
            {
                "text": "BENVOLIO",
                "answer_start": chunks[30].index("BENVOLIO")
            }
        ]
    }, 
    {
        "id": "chunk30-5",
        "question": "Who claims that Cupid’s arrow will not strike her?",
        "answers": [
            {
                "text": "ROMEO",
                "answer_start": chunks[30].index("ROMEO")
            }
        ]
    },
    {
        "id": "chunk30-6",
        "question": "Whose wit is Rosaline described as having?",
        "answers": [
            {
                "text": "Dian’s wit",
                "answer_start": chunks[30].index("Dian’s wit")
            }
        ]
    },
    {
        "id": "chunk30-7",
        "question": "How does Romeo say Juliet reacts to words of love?",
        "answers": [
            {
                "text": "She will not stay the siege of loving terms",
                "answer_start": chunks[30].index("She will not stay the siege of loving terms")
            }
        ]
    },
    {
        "id": "chunk30-8",
        "question": "According to Romeo, who misjudges the beauty of targets?",
        "answers": [
            {
                "text": "BENVOLIO",
                "answer_start": chunks[30].index("BENVOLIO")
            }
        ]
    }

]

# Create Q&A Pairs from Chunk 40 (2)
qas_chunk40 = [
    {
        "id": "chunk40-1",
        "question": "Who asks Romeo if he is mad?",
        "answers": [
            {
                "text": "BENVOLIO",
                "answer_start": chunks[40].index("BENVOLIO")
            }
        ]
    },
    {
        "id": "chunk40-2",
        "question": "Who asks Romeo if he can read?",
        "answers": [
            {
                "text": "SERVANT",
                "answer_start": chunks[40].index("SERVANT")
            }
        ]
    },
    {
        "id": "chunk40-3",
        "question": "How does Romeo describe his torment?",
        "answers": [
            {
                "text": "Shut up in prison, kept without my food, Whipp’d and tormented",
                "answer_start": chunks[40].index("Shut up in prison, kept without my food, Whipp’d and tormented")
            }
        ]
    },
    {
        "id": "chunk40-4",
        "question": "What does the servant ask Romeo regarding reading?",
        "answers": [
            {
                "text": "I pray, sir, can you read?",
                "answer_start": chunks[40].index("I pray, sir, can you read?")
            }
        ]
    }, 
    {
        "id": "chunk40-5",
        "question": "Who questions Romeo about his sanity?",
        "answers": [
            {
                "text": "BENVOLIO",
                "answer_start": chunks[40].index("BENVOLIO")
            }
        ]
    },
    {
        "id": "chunk40-6",
        "question": "Who inquires whether Romeo is able to read?",
        "answers": [
            {
                "text": "SERVANT",
                "answer_start": chunks[40].index("SERVANT")
            }
        ]
    },
    {
        "id": "chunk40-7",
        "question": "How does Romeo explain the way he is being tormented?",
        "answers": [
            {
                "text": "Shut up in prison, kept without my food, Whipp’d and tormented",
                "answer_start": chunks[40].index("Shut up in prison, kept without my food, Whipp’d and tormented")
            }
        ]
    },
    {
        "id": "chunk40-8",
        "question": "What question does the servant pose about reading?",
        "answers": [
            {
                "text": "I pray, sir, can you read?",
                "answer_start": chunks[40].index("I pray, sir, can you read?")
            }
        ]
    }

]

# Create Q&A Pairs from Chunk 70 (2)
qas_chunk70 = [
    {
        "id": "chunk70-1",
        "question": "What does Romeo talk of which makes both Benvolio and Romeo blow themselves from?",
        "answers": [
            {
                "text": "wind",
                "answer_start": chunks[70].index("wind")
            }
        ]
    },
    {
        "id": "chunk70-2",
        "question": "According to Romeo, where are the consequences hanging?",
        "answers": [
            {
                "text": "the stars",
                "answer_start": chunks[70].index("the stars")
            }
        ]
    },
    {
        "id": "chunk70-3",
        "question": "What does Romeo say might bitterly begin?",
        "answers": [
            {
                "text": "Some consequence yet hanging in the stars, Shall bitterly begin his fearful date",
                "answer_start": chunks[70].index("Some consequence yet hanging in the stars, Shall bitterly begin his fearful date")
            }
        ]
    },
    {
        "id": "chunk70-4",
        "question": "How does Romeo describe the life that may end?",
        "answers": [
            {
                "text": "clos’d in my breast By some vile forfeit of untimely death",
                "answer_start": chunks[70].index("clos’d in my breast By some vile forfeit of untimely death")
            }
        ]
    },
    {
        "id": "chunk70-5",
        "question": "What natural element does Romeo mention that affects both him and Benvolio?",
        "answers": [
            {
                "text": "wind",
                "answer_start": chunks[70].index("wind")
            }
        ]
    },
    {
        "id": "chunk70-6",
        "question": "Where does Romeo say the outcomes of events are determined?",
        "answers": [
            {
                "text": "the stars",
                "answer_start": chunks[70].index("the stars")
            }
        ]
    },
    {
        "id": "chunk70-7",
        "question": "What does Romeo warn may start bitterly in the future?",
        "answers": [
            {
                "text": "Some consequence yet hanging in the stars, Shall bitterly begin his fearful date",
                "answer_start": chunks[70].index("Some consequence yet hanging in the stars, Shall bitterly begin his fearful date")
            }
        ]
    },
    {
        "id": "chunk70-8",
        "question": "How does Romeo describe a life that could be cut short?",
        "answers": [
            {
                "text": "clos’d in my breast By some vile forfeit of untimely death",
                "answer_start": chunks[70].index("clos’d in my breast By some vile forfeit of untimely death")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 80 (2)
qas_chunk80 = [
    {
        "id": "chunk80-1",
        "question": "Who is Tybalt's uncle?",
        "answers": [
            {
                "text": "CAPULET",
                "answer_start": chunks[80].index("CAPULET")
            }
        ]
    },
    {
        "id": "chunk80-2",
        "question": "How does Capulet describe Tybalt to embark humor?",
        "answers": [
            {
                "text": "saucy boy",
                "answer_start": chunks[80].index("saucy boy")
            }
        ]
    },
    {
        "id": "chunk80-3",
        "question": "According to Capulet, what will Tybalt make among his guests?",
        "answers": [
            {
                "text": "make a mutiny",
                "answer_start": chunks[80].index("make a mutiny")
            }
        ]
    },
    {
        "id": "chunk80-4",
        "question": "Who asks if they are the master here?",
        "answers": [
            {
                "text": "CAPULET",
                "answer_start": chunks[80].index("CAPULET")
            }
        ]
    },
    {
        "id": "chunk80-5",
        "question": "Who is the uncle of Tybalt?",
        "answers": [
            {
                "text": "CAPULET",
                "answer_start": chunks[80].index("CAPULET")
            }
        ]
    },
    {
        "id": "chunk80-6",
        "question": "What term does Capulet use to describe Tybalt's attitude?",
        "answers": [
            {
                "text": "saucy boy",
                "answer_start": chunks[80].index("saucy boy")
            }
        ]
    },
    {
        "id": "chunk80-7",
        "question": "What does Capulet say Tybalt might cause among the guests?",
        "answers": [
            {
                "text": "make a mutiny",
                "answer_start": chunks[80].index("make a mutiny")
            }
        ]
    },
    {
        "id": "chunk80-8",
        "question": "Who questions if they are in charge here?",
        "answers": [
            {
                "text": "CAPULET",
                "answer_start": chunks[80].index("CAPULET")
            }
        ]
    }
]

In [12]:
# Act 2

# Create Q&A Pairs from Chunk 90 (2)
qas_chunk90 = [
    {
        "id": "chunk90-1",
        "question": "When Romeo runs away what does he leap??",
        "answers": [
            {
                "text": "orchard wall",
                "answer_start": chunks[90].index("orchard wall")
            }
        ]
    },
    {
        "id": "chunk90-2",
        "question": "How are Romeo and Benvolio related?",
        "answers": [
            {
                "text": "cousins",
                "answer_start": chunks[90].index("cousin")
            }
        ]
    }, 
    {
        "id": "chunk90-3",
        "question": "Where does Romeo go after climbing the wall?",
        "answers": [
            {
                "text": "within it",
                "answer_start": chunks[90].index("within it")
            }
        ]
    },
    {
        "id": "chunk90-4",
        "question": "Who tries to call Romeo after he runs away?",
        "answers": [
            {
                "text": "Mercutio",
                "answer_start": chunks[90].index("Mercutio")
            }
        ]
    }, 
     {
        "id": "chunk90-5",
        "question": "What does Romeo jump over when he runs away?",
        "answers": [
            {
                "text": "orchard wall",
                "answer_start": chunks[90].index("orchard wall")
            }
        ]
    },
    {
        "id": "chunk90-6",
        "question": "What is the family relationship between Romeo and Benvolio?",
        "answers": [
            {
                "text": "cousins",
                "answer_start": chunks[90].index("cousin")
            }
        ]
    },
    {
        "id": "chunk90-7",
        "question": "After leaping the wall, where does Romeo land?",
        "answers": [
            {
                "text": "within it",
                "answer_start": chunks[90].index("within it")
            }
        ]
    },
    {
        "id": "chunk90-8",
        "question": "Who attempts to get Romeo’s attention after he escapes?",
        "answers": [
            {
                "text": "Mercutio",
                "answer_start": chunks[90].index("Mercutio")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 110 (2)
qas_chunk110 = [
    {
        "id": "chunk110-1",
        "question": "What does Juliet compare her bounty to?",
        "answers": [
            {
                "text": "the sea",
                "answer_start": chunks[110].index("the sea")
            }
        ]
    },
    {
        "id": "chunk110-2",
        "question": "Who calls within when Juliet explains her love?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[110].index("[_Nurse calls within._]")
            }
        ]
    }, 
    {
        "id": "chunk110-3",
        "question": "What does Juliet say about her bounty?",
        "answers": [
            {
                "text": "as boundless as the sea",
                "answer_start": chunks[110].index("as boundless as the sea")
            }
        ]
    },
    {
        "id": "chunk110-4",
        "question": "Who calls within while Juliet is speaking?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[110].index("Nurse")
            }
        ]
    },
    {
        "id": "chunk110-5",
        "question": "Juliet compares her generosity to what?",
        "answers": [
            {
                "text": "the sea",
                "answer_start": chunks[110].index("the sea")
            }
        ]
    },
    {
        "id": "chunk110-6",
        "question": "Who interrupts Juliet by calling from inside?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[110].index("[_Nurse calls within._]")
            }
        ]
    },
    {
        "id": "chunk110-7",
        "question": "How does Juliet describe the depth of her bounty?",
        "answers": [
            {
                "text": "as boundless as the sea",
                "answer_start": chunks[110].index("as boundless as the sea")
            }
        ]
    },
    {
        "id": "chunk110-8",
        "question": "Who calls while Juliet is expressing her love?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[110].index("Nurse")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 140 (2)
qas_chunk140 = [
    {
        "id": "chunk140-1",
        "question": "Who does the nurse call for when talking to Romeo?",
        "answers": [
            {
                "text": "young Romeo",
                "answer_start": chunks[140].index("young Romeo")
            }
        ]
    },
    {
        "id": "chunk140-2",
        "question": "What does Romeo say about Young Romeo?",
        "answers": [
            {
                "text": "young Romeo will be older when you have found him",
                "answer_start": chunks[140].index("young Romeo will be older when you have found him")
            }
        ]
    },
    {
        "id": "chunk140-3",
        "question": "At what time does the Nurse start speaking to Romeo?",
        "answers": [
            {
                "text": "noon",
                "answer_start": chunks[140].index("noon")
            }
        ]
    },
    {
        "id": "chunk140-4",
        "question": "How does Romeo describe himself to the Nurse?",
        "answers": [
            {
                "text": "One, gentlewoman, that God hath made for himself to mar",
                "answer_start": chunks[140].index("One, gentlewoman, that God hath made for himself to mar")
            }
        ]
    },
     {
        "id": "chunk140-5",
        "question": "Who is being called by the Nurse while speaking to Romeo?",
        "answers": [
            {
                "text": "young Romeo",
                "answer_start": chunks[140].index("young Romeo")
            }
        ]
    },
    {
        "id": "chunk140-6",
        "question": "What does Romeo mention about the age of Young Romeo?",
        "answers": [
            {
                "text": "young Romeo will be older when you have found him",
                "answer_start": chunks[140].index("young Romeo will be older when you have found him")
            }
        ]
    },
    {
        "id": "chunk140-7",
        "question": "At what hour does the Nurse begin her conversation with Romeo?",
        "answers": [
            {
                "text": "noon",
                "answer_start": chunks[140].index("noon")
            }
        ]
    },
    {
        "id": "chunk140-8",
        "question": "How does Romeo describe his nature to the Nurse?",
        "answers": [
            {
                "text": "One, gentlewoman, that God hath made for himself to mar",
                "answer_start": chunks[140].index("One, gentlewoman, that God hath made for himself to mar")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 147 (2)
qas_chunk147 = [
    {
        "id": "chunk147-1",
        "question": "Where does Romeo say Juliet will be shrived and married?",
        "answers": [
            {
                "text": "Friar Lawrence’ cell",
                "answer_start": chunks[147].index("Friar Lawrence’ cell")
            }
        ]
    },
    {
        "id": "chunk147-2",
        "question": "Where does Romeo tell the Nurse to stay?",
        "answers": [
            {
                "text": "behind the abbey wall",
                "answer_start": chunks[147].index("behind the abbey wall")
            }
        ]
    },
    {
        "id": "chunk147-3",
        "question": "When is Juliet supposed to go to Friar Lawrence's cell?",
        "answers": [
            {
                "text": "this afternoon",
                "answer_start": chunks[147].index("this afternoon")
            }
        ]
    },
    {
        "id": "chunk147-4",
        "question": "What will Romeo's man bring to the Nurse?",
        "answers": [
            {
                "text": "cords made like a tackled stair",
                "answer_start": chunks[147].index("cords made like a tackled stair")
            }
        ]
    },
    {
        "id": "chunk147-5",
        "question": "Where does Romeo plan for Juliet to be shrived and married?",
        "answers": [
            {
                "text": "Friar Lawrence’ cell",
                "answer_start": chunks[147].index("Friar Lawrence’ cell")
            }
        ]
    },
    {
        "id": "chunk147-6",
        "question": "Where does Romeo instruct the Nurse to wait?",
        "answers": [
            {
                "text": "behind the abbey wall",
                "answer_start": chunks[147].index("behind the abbey wall")
            }
        ]
    },
    {
        "id": "chunk147-7",
        "question": "At what time should Juliet arrive at Friar Lawrence's cell?",
        "answers": [
            {
                "text": "this afternoon",
                "answer_start": chunks[147].index("this afternoon")
            }
        ]
    },
    {
        "id": "chunk147-8",
        "question": "What item is Romeo's man supposed to deliver to the Nurse?",
        "answers": [
            {
                "text": "cords made like a tackled stair",
                "answer_start": chunks[147].index("cords made like a tackled stair")
            }
        ]
    }
]


# Create Q&A Pairs from Chunk 155 (2)
qas_chunk155 = [
    {
        "id": "chunk155-1",
        "question": "What does the nurse state about Romeo’s legs?",
        "answers": [
            {
                "text": "excels all men's",
                "answer_start": chunks[155].index("excels all men’s")
            }
        ]
    },
    {
        "id": "chunk155-2",
        "question": "What are the nurse's thoughts about Romeo's face?",
        "answers": [
            {
                "text": "his face be better than any man’s",
                "answer_start": chunks[155].index("his face be better than any man’s")
            }
        ]
    },
    {
        "id": "chunk155-3",
        "question": "Who does the Nurse suggest is not the right choice for Juliet?",
        "answers": [
            {
                "text": "Romeo",
                "answer_start": chunks[155].index("Romeo")
            }
        ]
    },
    {
        "id": "chunk155-4",
        "question": "What qualities of Romeo does the Nurse praise?",
        "answers": [
            {
                "text": "his face be better than any man’s, yet his leg excels all men’s",
                "answer_start": chunks[155].index("his face be better than any man’s, yet his leg excels all men’s")
            }
        ]
    },
    {
        "id": "chunk155-5",
        "question": "What does the Nurse say about the excellence of Romeo's legs?",
        "answers": [
            {
                "text": "excels all men's",
                "answer_start": chunks[155].index("excels all men’s")
            }
        ]
    },
    {
        "id": "chunk155-6",
        "question": "How does the Nurse describe Romeo's facial appearance?",
        "answers": [
            {
                "text": "his face be better than any man’s",
                "answer_start": chunks[155].index("his face be better than any man’s")
            }
        ]
    },
    {
        "id": "chunk155-7",
        "question": "Who does the Nurse imply is not suitable for Juliet?",
        "answers": [
            {
                "text": "Romeo",
                "answer_start": chunks[155].index("Romeo")
            }
        ]
    },
    {
        "id": "chunk155-8",
        "question": "Which attributes of Romeo does the Nurse commend?",
        "answers": [
            {
                "text": "his face be better than any man’s, yet his leg excels all men’s",
                "answer_start": chunks[155].index("his face be better than any man’s, yet his leg excels all men’s")
            }
        ]
    }
]

In [13]:
# Act 3

# Create Q&A Pairs from Chunk 180 (2)
qas_chunk180 = [
    {
        "id": "chunk180-1",
        "question": "What happens to Tybalt after he fights Romeo?",
        "answers": [
            {
                "text": "Tybalt falls",
                "answer_start": chunks[180].index("Tybalt falls")
            }
        ]
    },
    {
        "id": "chunk180-2",
        "question": "What does Benvolio repeatedly tell Romeo to do after the fight with Tybalt?",
        "answers": [
            {
                "text": "be gone",
                "answer_start": chunks[180].index("be gone")
            }
        ]
    },
    {
        "id": "chunk180-3",
        "question": "Who informs Romeo to leave after Tybalt is slain?",
        "answers": [
            {
                "text": "Benvolio",
                "answer_start": chunks[180].index("BENVOLIO")
            }
        ]
    },
    {
        "id": "chunk180-4",
        "question": "What does Romeo say about his own fate after killing Tybalt?",
        "answers": [
            {
                "text": "O, I am fortune’s fool",
                "answer_start": chunks[180].index("O, I am fortune’s fool")
            }
        ]
    },
    {
        "id": "chunk180-5",
        "question": "What is the outcome for Tybalt after his duel with Romeo?",
        "answers": [
            {
                "text": "Tybalt falls",
                "answer_start": chunks[180].index("Tybalt falls")
            }
        ]
    },
    {
        "id": "chunk180-6",
        "question": "What does Benvolio insist Romeo should do repeatedly after the fight?",
        "answers": [
            {
                "text": "be gone",
                "answer_start": chunks[180].index("be gone")
            }
        ]
    },
    {
        "id": "chunk180-7",
        "question": "Who tells Romeo to leave following Tybalt's death?",
        "answers": [
            {
                "text": "Benvolio",
                "answer_start": chunks[180].index("BENVOLIO")
            }
        ]
    },
    {
        "id": "chunk180-8",
        "question": "How does Romeo comment on his fate after killing Tybalt?",
        "answers": [
            {
                "text": "O, I am fortune’s fool",
                "answer_start": chunks[180].index("O, I am fortune’s fool")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 191 (2)
qas_chunk191 = [
    {
        "id": "chunk191-1",
        "question": "Who is Juliet expecting news from?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[191].index("Nurse")
            }
        ]
    },
    {
        "id": "chunk191-2",
        "question": "How does Juliet explain the feeling of hearing someone mentioning Romeo?",
        "answers": [
            {
                "text": "heavenly eloquence",
                "answer_start": chunks[191].index("heavenly eloquence")
            }
        ]
    },
    {
        "id": "chunk191-3",
        "question": "What is Juliet impatient for in the beginning of the scene?",
        "answers": [
            {
                "text": "possess’d it",
                "answer_start": chunks[191].index("possess’d it")
            }
        ]
    },
    {
        "id": "chunk191-4",
        "question": "Who enters with cords?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[191].index("Nurse")
            }
        ]
    },
    {
        "id": "chunk191-5",
        "question": "From whom is Juliet waiting to receive news?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[191].index("Nurse")
            }
        ]
    },
    {
        "id": "chunk191-6",
        "question": "How does Juliet describe the sound of hearing Romeo’s name?",
        "answers": [
            {
                "text": "heavenly eloquence",
                "answer_start": chunks[191].index("heavenly eloquence")
            }
        ]
    },
    {
        "id": "chunk191-7",
        "question": "At the start of the scene, what is Juliet eager to have?",
        "answers": [
            {
                "text": "possess’d it",
                "answer_start": chunks[191].index("possess’d it")
            }
        ]
    },
    {
        "id": "chunk191-8",
        "question": "Who appears carrying cords?",
        "answers": [
            {
                "text": "Nurse",
                "answer_start": chunks[191].index("Nurse")
            }
        ]
    }
]

# Create Q&A Pairs from Chunk 200 (2)
qas_chunk200 = [
    {
        "id": "chunk200-1",
        "question": "Who does Romeo slay in a sword fight?",
        "answers": [
            {
                "text": "Tybalt",
                "answer_start": chunks[200].index("Tybalt")
            }
        ]
    },
    {
        "id": "chunk200-2",
        "question": "Which word according to Juliet has slain ten thousands Tybalts",
        "answers": [
            {
                "text": "banished",
                "answer_start": chunks[200].index("banished")
            }
        ]
    },
    {
        "id": "chunk200-3",
        "question": "How does Juliet refer to Romeo to the Nurse?",
        "answers": [
            {
                "text": "my husband",
                "answer_start": chunks[200].index("my husband")
            }
        ]
    },
    {
        "id": "chunk200-4",
        "question": "How does Juliet describe the impact of Romeo being banished?",
        "answers": [
            {
                "text": "Hath slain ten thousand Tybalts",
                "answer_start": chunks[200].index("Hath slain ten thousand Tybalts")
            }
        ]
    },
    {
        "id": "chunk200-5",
        "question": "Who is killed by Romeo in the duel?",
        "answers": [
            {
                "text": "Tybalt",
                "answer_start": chunks[200].index("Tybalt")
            }
        ]
    },
    {
        "id": "chunk200-6",
        "question": "Which word causes Juliet to feel as if countless Tybalts have died?",
        "answers": [
            {
                "text": "banished",
                "answer_start": chunks[200].index("banished")
            }
        ]
    },
    {
        "id": "chunk200-7",
        "question": "What term does Juliet use to refer to Romeo to the Nurse?",
        "answers": [
            {
                "text": "my husband",
                "answer_start": chunks[200].index("my husband")
            }
        ]
    },
    {
        "id": "chunk200-8",
        "question": "How does Juliet express the effect of Romeo’s banishment?",
        "answers": [
            {
                "text": "Hath slain ten thousand Tybalts",
                "answer_start": chunks[200].index("Hath slain ten thousand Tybalts")
            }
        ]
    }
]

# Q&A pairs from Chunk 210
qas_chunk210 = [
    {
        "id": "chunk210-1",
        "question": "Who calls Romeo a 'fond mad man'?",
        "answers": [
            {
                "text": "FRIAR LAWRENCE",
                "answer_start": chunks[210].index("FRIAR LAWRENCE")
            }
        ]
    },
    {
        "id": "chunk210-2",
        "question": "What word does Friar Lawrence try to help Romeo keep off?",
        "answers": [
            {
                "text": "banished",
                "answer_start": chunks[210].index("banished")
            }
        ]
    },
    {
        "id": "chunk210-3",
        "question": "What does Friar Lawrence offer Romeo to comfort him?",
        "answers": [
            {
                "text": "Adversity’s sweet milk, philosophy",
                "answer_start": chunks[210].index("Adversity’s sweet milk, philosophy")
            }
        ]
    },
    {
        "id": "chunk210-4",
        "question": "What impossible things does Romeo say philosophy cannot do?",
        "answers": [
            {
                "text": "make a Juliet, Displant a town, reverse a Prince’s doom",
                "answer_start": chunks[210].index("make a Juliet, Displant a town, reverse a Prince’s doom")
            }
        ]
    },
    {
        "id": "chunk210-5",
        "question": "Who calls Romeo a foolish and mad man?",
        "answers": [
            {
                "text": "FRIAR LAWRENCE",
                "answer_start": chunks[210].index("FRIAR LAWRENCE")
            }
        ]
    },
    {
        "id": "chunk210-6",
        "question": "Which word is Friar Lawrence trying to shield Romeo from repeating?",
        "answers": [
            {
                "text": "banished",
                "answer_start": chunks[210].index("banished")
            }
        ]
    },
    {
        "id": "chunk210-7",
        "question": "What remedy does Friar Lawrence provide to console Romeo?",
        "answers": [
            {
                "text": "Adversity’s sweet milk, philosophy",
                "answer_start": chunks[210].index("Adversity’s sweet milk, philosophy")
            }
        ]
    },
    {
        "id": "chunk210-8",
        "question": "What does Romeo say philosophy cannot accomplish?",
        "answers": [
            {
                "text": "make a Juliet, Displant a town, reverse a Prince’s doom",
                "answer_start": chunks[210].index("make a Juliet, Displant a town, reverse a Prince’s doom")
            }
        ]
    }
]

# Q&A pairs from Chunk 220
qas_chunk220 = [
    {
        "id": "chunk220-1",
        "question": "Who would have killed Romeo if he had not acted?",
        "answers": [
            {
                "text": "Tybalt",
                "answer_start": chunks[220].index("Tybalt")
            }
        ]
    },
    {
        "id": "chunk220-2",
        "question": "What becomes Romeo’s friend after Tybalt’s death?",
        "answers": [
            {
                "text": "The law that threaten’d death",
                "answer_start": chunks[220].index("The law that threaten’d death")
            }
        ]
    },
    {
        "id": "chunk220-3",
        "question": "What is said to court Romeo in her best array?",
        "answers": [
            {
                "text": "Happiness",
                "answer_start": chunks[220].index("Happiness")
            }
        ]
    },
    {
        "id": "chunk220-4",
        "question": "What does Romeo do to his Fortune and love according to the passage?",
        "answers": [
            {
                "text": "putt’st up thy Fortune and thy love",
                "answer_start": chunks[220].index("putt’st up thy Fortune and thy love")
            }
        ]
    },
    {
        "id": "chunk220-5",
        "question": "Who would have killed Romeo if he hadn’t acted first?",
        "answers": [
            {
                "text": "Tybalt",
                "answer_start": chunks[220].index("Tybalt")
            }
        ]
    },
    {
        "id": "chunk220-6",
        "question": "What changes from threatening death to being Romeo’s ally after Tybalt dies?",
        "answers": [
            {
                "text": "The law that threaten’d death",
                "answer_start": chunks[220].index("The law that threaten’d death")
            }
        ]
    },
    {
        "id": "chunk220-7",
        "question": "Who is described as approaching Romeo in her best form?",
        "answers": [
            {
                "text": "Happiness",
                "answer_start": chunks[220].index("Happiness")
            }
        ]
    },
    {
        "id": "chunk220-8",
        "question": "What does Romeo do to his own fortune and love in the passage?",
        "answers": [
            {
                "text": "putt’st up thy Fortune and thy love",
                "answer_start": chunks[220].index("putt’st up thy Fortune and thy love")
            }
        ]
    }
]

In [14]:
# List of all your chunk-level Q&A
all_chunks_qas = [
    {"context": chunks[10], "qas": qas_chunk10},
    {"context": chunks[30], "qas": qas_chunk30},
    {"context": chunks[40], "qas": qas_chunk40},
    {"context": chunks[70], "qas": qas_chunk70},
    {"context": chunks[80], "qas": qas_chunk80},
    {"context": chunks[90], "qas": qas_chunk90},
    {"context": chunks[110], "qas": qas_chunk110},
    {"context": chunks[140], "qas": qas_chunk140},
    {"context": chunks[147], "qas": qas_chunk147},
    {"context": chunks[155], "qas": qas_chunk155},
    {"context": chunks[180], "qas": qas_chunk180},
    {"context": chunks[191], "qas": qas_chunk191}, 
    {"context": chunks[200], "qas": qas_chunk200}, 
    {"context": chunks[210], "qas": qas_chunk210}, 
    {"context": chunks[220], "qas": qas_chunk220},
]

### **Baseline Pre-Trained Transformer**

In [None]:
# Split the chunks into training/testing by avoiding data leakage

# 15 IDs total (randomly shuffle the chunk id)
chunks_ids = list(range(15))  # 15 chunks total
random.shuffle(chunks_ids)

train_chunks = chunks_ids[:10] # 80 samples in training
test_chunks =  chunks_ids[10:] # 40 samples in testing

# Set up the training qas by cycling through the qa's of the designated train chunks
train_qas = [
    {
        "id": qa["id"],
        "context": qas_chk["context"],
        "question": qa["question"],
        "answers": qa["answers"]
    }
    for idx, qas_chk in enumerate(all_chunks_qas)
    if idx in train_chunks
    for qa in qas_chk["qas"]
]

# Set up the testing qas by cycling through the qa's of the designated testing chunks
test_qas = [
    {
        "id": qa["id"],
        "context": qas_chk["context"],
        "question": qa["question"],
        "answers": qa["answers"]
    }
    for idx, qas_chk in enumerate(all_chunks_qas)
    if idx in test_chunks
    for qa in qas_chk["qas"]
]

In [23]:
# Load in the squad metrics
squad_metric = evaluate.load("squad")

# Set up the qa_pipeline with the distilbert-base-cased-distilled-squad pretrained transformer
qa_pipeline = pipeline("question-answering", model = "distilbert-base-cased-distilled-squad", framework = "pt")

# Load in semantic model
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

predictions = []
actual = []
semantic_scores = []

for qa in test_qas:
    # Retrieve context, question, ground_truth, and result
    context = qa["context"]
    question = qa["question"]
    ground_truth = qa["answers"][0]["text"]
    result = qa_pipeline(question = question, context = context)

    # Integrate Sentence Transformer to determine semantic similarity
    embeddings = semantic_model.encode([result["answer"], ground_truth])
    semantic_sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    semantic_scores.append(semantic_sim)
    
    # Display the question, result, and ground truth
    print(f"\nID: {qa['id']}")
    print(f"Q: {question}")
    print(f"Prediction: {result['answer']}")
    print(f"Ground truth: {ground_truth}")
    print(f"Semantic Similarity: {semantic_sim:.2%}")
    print("-" * 60)

    # Append predictions and actual answers to calculate the squad metrics
    predictions.append({"id": qa["id"], "prediction_text": result["answer"]})
    actual.append({"id": qa["id"], "answers": {"text": [ground_truth], "answer_start": [qa["answers"][0]["answer_start"]]}})

metrics = squad_metric.compute(predictions = predictions, references = actual)
print(metrics)

avg_semantic_score = sum(semantic_scores) / len(semantic_scores) * 100
print(f"Semantic Similarity Score: {avg_semantic_score:.2f}%")

Device set to use cpu



ID: chunk10-1
Q: Which characters from the house of Montagues enter?
Prediction: Enter Abram and Balthasar
Ground truth: Abram and Balthasar
Semantic Similarity: 91.69%
------------------------------------------------------------

ID: chunk10-2
Q: What does Sampson say is out?
Prediction: My naked weapon
Ground truth: naked weapon
Semantic Similarity: 85.85%
------------------------------------------------------------

ID: chunk10-3
Q: How does Gregory respond when Sampson says he will back him
Prediction: quarrel
Ground truth: How? Turn thy back and run?
Semantic Similarity: 15.49%
------------------------------------------------------------

ID: chunk10-4
Q: What does Gregory state to convey that he is worried about Sampson?
Prediction: Fear me not
Ground truth: No, marry; I fear thee!
Semantic Similarity: 52.55%
------------------------------------------------------------

ID: chunk10-5
Q: Which Montague family members make an entrance in the scene?
Prediction: Enter Abram and Balt

### **Fine-Tuned Transformer with LoRA**

In [None]:
# Set up the train and test datasets from the train and test Q&A's
train_dataset = Dataset.from_list(train_qas)
test_dataset = Dataset.from_list(test_qas)

In [None]:
# Call the tokenizer from the pretrained model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

# Create a preprocessing function to tokenize the Q&A pairs (the idea is to convert raw text into numerical inputs)
def preprocess(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=False,
        return_offsets_mapping=True
    )
    
    # Track the start positions and end positions
    start_positions = []
    end_positions = []
    
    # For every question in the dataset
    for i in range(len(examples["question"])):
        offsets = tokenized["offset_mapping"][i]
        
        # Fetch the answer info in terms of the answer and the answer start/end
        answer = examples["answers"][i][0]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])
        
        # Sequence_ids is the list of identifiers for each token (question or context)
        sequence_ids = tokenized.sequence_ids(i)
        
        # Find the start and end of context in token positions
        context_start = 0
        context_end = len(sequence_ids) - 1
        
        # Find first context token (sequence_id = 1)
        while context_start < len(sequence_ids) and sequence_ids[context_start] != 1:
            context_start += 1
        
        # Find last context token
        while context_end >= 0 and sequence_ids[context_end] != 1:
            context_end -= 1
        
        # If the answer is not found initialize the indices to zero
        token_start_index = 0
        token_end_index = 0
        
        # Check if answer is in the context start and context end range
        if offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char:
            # Find token positions
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            token_start_index = idx - 1
            
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            token_end_index = idx + 1
        # Append the start and end positions for the answer/context
        start_positions.append(token_start_index)
        end_positions.append(token_end_index)
    
    # Store the start positions and end positions of the answer within the context
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    tokenized.pop("offset_mapping")
    
    return tokenized

# Tokenize both the training and testing datasets for model fine-tuning
tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(preprocess, batched=True, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [26]:
# Load base model
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=8,                             
    lora_alpha=32,                    
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"], 
    bias="none",
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("LoRA configured!")

trainable params: 148,994 || all params: 65,341,444 || trainable%: 0.2280
LoRA configured!


In [None]:
from transformers import TrainingArguments, Trainer
import os

# Disable the WANDB request for an API key
os.environ["WANDB_DISABLED"] = "true"

# Set up all of the training arguments
training_args = TrainingArguments(
    # Set an output directory for results from LoRA (if needed)
    output_dir="./results_lora",
    
    # Set up the number of epochs and learning rate
    num_train_epochs=15,
    learning_rate=2e-4,
    
    # Set up the batch settings (small number since we have a small number of samples)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    
    # Regularization parameters to prevent possibilities of overfitting
    weight_decay=0.01,
    warmup_ratio=0.1,
    
    # Evaluation strategies (epoch) and selecting the best model based on eval loss
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    
    # Logging
    logging_steps=10,
    dataloader_num_workers=0,
    fp16=False,
    
    save_total_limit=2,
    report_to="none",
)

# Training with the pre-trained model and the tokenized train/test Q&A's
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
)

print("Starting training...\n")
trainer.train()


 Starting training...



Epoch,Training Loss,Validation Loss
1,2.859,2.288441
2,2.6666,2.054247
3,2.2409,1.888279
4,1.9052,1.869295
5,1.6666,1.895602
6,1.5583,1.928743
7,1.3218,1.958389
8,1.2581,1.981228
9,0.9864,2.020948
10,0.9749,2.050828


TrainOutput(global_step=150, training_loss=1.4346472517649334, metrics={'train_runtime': 968.1231, 'train_samples_per_second': 1.24, 'train_steps_per_second': 0.155, 'total_flos': 117999728640000.0, 'train_loss': 1.4346472517649334, 'epoch': 15.0})

In [None]:
# Loads in the squad metrics for EM and F1
squad_metric = evaluate.load("squad")

# The final trained model is used here to evaluate performance
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
print("Using trained model!\n")

# The sentence transformer model for semantic performance is initialized here
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Store the predictions, actual answers, and semantic scores for each of the test questions
predictions = []
actual = []
semantic_scores = []

# Cycle through all of the questions in test_qas
for qa in test_qas:
    # Track the context, question, ground_truth, and result for each question in test_qas
    context = qa["context"]
    question = qa["question"]
    ground_truth = qa["answers"][0]["text"]

    # Feed the question into the qa_pipeline for the prediction
    result = qa_pipeline(question=question, context=context)

    # Call the semantic model to determine the semantic meaning scores
    embeddings = semantic_model.encode([result["answer"], ground_truth])
    semantic_sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    semantic_scores.append(semantic_sim)
    
    # Display the results for each testing question in the test_qas
    print(f"\nID: {qa['id']}")
    print(f"Q: {question}")
    print(f"Prediction: {result['answer']}")
    print(f"Ground truth: {ground_truth}")
    print(f"Semantic Similarity: {semantic_sim:.2%}")
    print("-" * 60)

    predictions.append({"id": qa["id"], "prediction_text": result["answer"]})
    actual.append({"id": qa["id"], "answers": {"text": [ground_truth], "answer_start": [qa["answers"][0]["answer_start"]]}})

metrics = squad_metric.compute(predictions=predictions, references=actual)
avg_semantic_score = sum(semantic_scores) / len(semantic_scores) * 100

# Display the results fo the final Fine Tuned Modle
print("\n" + "="*60)
print("FINE-TUNED MODEL RESULTS")
print("="*60)
print(f"Exact Match (EM): {metrics['exact_match']:.2f}%")
print(f"F1 Score: {metrics['f1']:.2f}%")
print(f"Semantic Similarity: {avg_semantic_score:.2f}%")
print("="*60)

Device set to use cpu


Using trained model!


ID: chunk10-1
Q: Which characters from the house of Montagues enter?
Prediction: Enter Abram and Balthasar
Ground truth: Abram and Balthasar
Semantic Similarity: 91.69%
------------------------------------------------------------

ID: chunk10-2
Q: What does Sampson say is out?
Prediction: My naked weapon
Ground truth: naked weapon
Semantic Similarity: 85.85%
------------------------------------------------------------

ID: chunk10-3
Q: How does Gregory respond when Sampson says he will back him
Prediction: My naked weapon is out: quarrel
Ground truth: How? Turn thy back and run?
Semantic Similarity: 22.00%
------------------------------------------------------------

ID: chunk10-4
Q: What does Gregory state to convey that he is worried about Sampson?
Prediction: Fear me not
Ground truth: No, marry; I fear thee!
Semantic Similarity: 52.55%
------------------------------------------------------------

ID: chunk10-5
Q: Which Montague family members make an entrance 

### **Hypertuning the Fine-Tuned Transformer with LoRA (Future Steps)**

In [36]:
# First, define a metric function to compute EM and F1 during training
import numpy as np
squad_metric = evaluate.load("squad")

# Create a validation and held out test set for the hypertuning task
test_raw = Dataset.from_list(test_qas)
test_shuffled = test_raw.shuffle(seed=42)
test_size = len(test_qas) // 2

valid_dataset = test_shuffled.select(range(test_size))
heldout_dataset = test_shuffled.select(range(test_size, len(test_qas)))

tokenized_valid = valid_dataset.map(preprocess, batched=True, remove_columns = valid_dataset.column_names)
tokenized_heldout = heldout_dataset.map(preprocess, batched=True, remove_columns = heldout_dataset.column_names)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# Set up hyperparameters for hypertuning
hyperparams = [
    {"learning_rate": 2e-4, "lora_r": 8},
    {"learning_rate": 3e-4, "lora_r": 8},
    {"learning_rate": 2e-4, "lora_r": 16},
    {"learning_rate": 3e-4, "lora_r": 16},
]

best_f1 = 0.0
best_config = None
best_model = None

for param in hyperparams:
    # Define the configuration for LoRA with each configuration
    lora_config = LoraConfig(
        task_type=TaskType.QUESTION_ANS,
        r=param["lora_r"],
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_lin", "v_lin"],
        bias="none",
    )

    # Train the base model with the LoRA configuration
    model = get_peft_model(
        AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad"), 
        lora_config
    )

    # Set up the training arguments for the model 
    training_args = TrainingArguments(
        output_dir="./results_lora_hypertune",
        
        # Training settings
        num_train_epochs=10,
        learning_rate=param['learning_rate'],
        
        # Batch settings
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        
        # Regularization
        weight_decay=0.01,
        warmup_ratio=0.1,
        
        # Evaluation
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,

        # Logging
        logging_steps=10,
        dataloader_num_workers=0,
        fp16=False,
        
        save_total_limit=2,
        report_to="none",
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer
    )

    trainer.train()

    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    
    predictions = []
    actual = []
    semantic_scores = []

    for qa in valid_dataset:
        context = qa["context"]
        question = qa["question"]
        ground_truth = qa["answers"][0]["text"]

        pred = qa_pipeline(question = question, context = context)

        # semantic similarity
        emb = semantic_model.encode([pred["answer"], ground_truth])
        sem_sim = cosine_similarity([emb[0]], [emb[1]])[0][0]
        semantic_scores.append(sem_sim)

        predictions.append({"id": qa["id"], "prediction_text": pred["answer"]})
        actual.append({
            "id": qa["id"], 
            "answers": {
                "text": [ground_truth],
                "answer_start": [qa["answers"][0]["answer_start"]]
            }
        })

    results = squad_metric.compute(predictions = predictions, references = actual)
    f1 = results["f1"]
    em = results["exact_match"]
    avg_sem = np.mean(semantic_scores) * 100

    print(f"\nParams {param}")
    print(f"  EM: {em:.2f}%")
    print(f"  F1: {f1:.2f}%")
    print(f"  Semantic Similarity: {avg_sem:.2f}%")

    if f1 > best_f1:
        best_f1 = f1
        best_config = param
        best_model = model  # save the trained model object

print(f"Best params: {best_config}, Best F1: {best_f1:.2f}")