In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
# Load the 'squad' dataset; the cells below read its 'train' and 'validation' splits.
dataset = load_dataset('squad')

In [None]:
import pandas as pd

# Materialise each split as a pandas DataFrame for column-wise manipulation.
train_dataset = pd.DataFrame(dataset['train'])
# NOTE: the held-out frame is built from the 'validation' split, despite being named "test".
test_dataset = pd.DataFrame(dataset['validation'])

In [None]:
# (rows, columns) of the training frame
train_dataset.shape

(87599, 5)

In [None]:
# (rows, columns) of the held-out (validation) frame
test_dataset.shape

(10570, 5)

In [None]:
# Derive the model input and the regression target on both frames in one pass.
for frame in (test_dataset, train_dataset):
    # Model input: the passage followed by the question, separated by one space.
    frame["combined"] = frame["context"] + " " + frame["question"]

    # Regression target: character offset of the first listed answer in the context.
    frame['answer_start'] = frame['answers'].apply(
        lambda answer: answer['answer_start'][0]
    )

In [None]:
# Preview the first rows, including the derived 'combined' and 'answer_start' columns.
train_dataset.head()

Unnamed: 0,id,title,context,question,answers,combined,answer_start
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ...","Architecturally, the school has a Catholic cha...",515
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe...","Architecturally, the school has a Catholic cha...",188
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'...","Architecturally, the school has a Catholic cha...",279
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...,"Architecturally, the school has a Catholic cha...",381
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...,"Architecturally, the school has a Catholic cha...",92


In [None]:
import string
from nltk.stem import PorterStemmer
import re

def preprocess(text):
    """Normalise raw text into a lowercase, punctuation-free, stemmed string.

    Steps, in order: convert to ``str``, lowercase, replace ``<...>`` tags with
    a space, replace every non-word character with a space, split on
    whitespace, Porter-stem each token, and re-join with single spaces.

    Args:
        text: Value to clean; it is passed through ``str()`` first.

    Returns:
        str: The stemmed tokens joined by single spaces ('' if no token
        survives the cleaning).
    """
    # Lowercase
    text = str(text).lower()

    # Remove html tags (non-greedy, so neighbouring tags are matched one by one)
    text = re.sub(r'<.*?>', ' ', text)

    # Remove other punctuation: everything that is not a letter, digit or underscore
    text = re.sub(r'[^\w]', ' ', text)

    # Stemming. PorterStemmer.stem() expects a single word; given the whole
    # passage it treats it as one long token, leaving almost every word
    # unstemmed. Stem token by token instead. split() with no argument also
    # collapses the runs of spaces introduced by the substitutions above.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

In [None]:
# Clean BOTH frames: the vectorizer is fitted on the training text and then
# applied to the held-out text, so the two must go through the same pipeline.
# Rebuilding from the untouched 'context' / 'question' columns (instead of
# overwriting 'combined' with a cleaned copy of itself) keeps this cell
# idempotent: re-running it never preprocesses already-preprocessed text.
for frame in (train_dataset, test_dataset):
    frame['combined'] = (frame['context'] + " " + frame['question']).apply(preprocess)

In [None]:
# Inputs are the combined "context question" strings; targets are the answer start offsets.
X_train = train_dataset['combined']
X_test = test_dataset['combined']
y_train = train_dataset['answer_start']
y_test = test_dataset['answer_start']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text
# max_features caps the learned vocabulary at 10,000 terms.
vectorizer = TfidfVectorizer(max_features = 10000)
# Learn vocabulary and IDF weights from the training text only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... and reuse them unchanged for the held-out text.
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Train the model
# The target is the answer's character offset, so this is a plain regression
# from TF-IDF features to a position in the context; the seed is fixed.
regressor = XGBRegressor(n_estimators=50, seed=0)
regressor.fit(X_train_tfidf, y_train)

# Predict on the test set
# y_pred holds one predicted offset per held-out row.
y_pred = regressor.predict(X_test_tfidf)

# Evaluate the model
# R2 compares the predicted offsets with the true ones.
r2 = r2_score(y_test, y_pred)
print(f"R2 Score is : {r2:.2f}")


R2 Score is : 0.16


In [None]:
# Helper: given a character index into a context, return the text from the word at that index up to the end of its sentence

def sentence_from_index(num, context):
    """Return the text from the word at a character offset to the end of its sentence.

    Args:
        num: Character offset into ``context``. Accepts an int, a float
            (truncated toward zero by ``int()``), or any object exposing
            ``.item()`` that holds exactly one value, such as a NumPy scalar
            or the single-element array returned by a model's ``predict``.
        context: The passage to slice.

    Returns:
        str: ``context`` from the first character of the word containing the
        offset up to, but not including, the next '.', '?' or '!' (or the end
        of ``context`` when none follows), with surrounding whitespace
        stripped. The literal string "Index out of range" is returned when the
        offset is negative or not smaller than ``len(context)``.

    Raises:
        ValueError: If ``num`` is an array-like holding more than one element.
    """
    # Unwrap NumPy scalars / single-element arrays explicitly. Calling int()
    # directly on an array with ndim > 0 is deprecated in NumPy and emits a
    # DeprecationWarning.
    if hasattr(num, "item"):
        num = num.item()
    index = int(num)

    if index < 0 or index >= len(context):
        return "Index out of range"

    # If the offset falls inside a word, walk back to that word's first
    # character; an offset that sits on whitespace is left where it is.
    while index > 0 and not context[index].isspace() and not context[index - 1].isspace():
        index -= 1

    # Scan forward to the sentence terminator (or the end of the context).
    end = index
    while end < len(context) and context[end] not in '.?!':
        end += 1

    # strip() removes any whitespace at either edge of the slice, so no
    # separate adjustment of the start position is needed.
    return context[index:end].strip()

In [None]:
#Test
# Smoke test: predict an answer offset for a hand-written passage and print
# the text found at that offset.
context = '''
In 1997, a monumental event in the history of artificial intelligence and computing occurred when IBM's supercomputer,
Deep Blue, defeated world chess champion Garry Kasparov. This marked the first time a computer had beaten a reigning
world champion in a match under standard chess tournament conditions.
'''

question = 'Who was defeated by Supercomputer Deep Blue ?'

# Build the query the same way the training rows were built: context and
# question joined by a single space, then run through preprocess().
x_tfidf = vectorizer.transform([preprocess(context + " " + question)])
y = regressor.predict(x_tfidf)

# predict() returns one value per input row; pass the scalar for our single
# row rather than the whole array.
print(sentence_from_index(y[0], context))

Deep Blue, defeated world chess champion Garry Kasparov


  index = int(num)
