In [1]:
import json
import re
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15710\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15710\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\15710\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Loading

Load squad data from json

In [2]:
def load_squad_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['data']

## Data cleaning

In [4]:
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# def lemmatize_text(text):
#     words = word_tokenize(text)
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
#     return ' '.join(lemmatized_words)

In [5]:
def preprocess_squad(data):
    processed_data = []
    
    for article in tqdm(data, desc="Processing articles"):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            
            # Clean and preprocess context
            cleaned_context = clean_text(context)
            cleaned_context = remove_stopwords(cleaned_context)
            # cleaned_context = lemmatize_text(cleaned_context)
            
            for qa in paragraph['qas']:
                question = qa['question']
                
                # Clean and preprocess question
                cleaned_question = clean_text(question)
                cleaned_question = remove_stopwords(cleaned_question)
                # cleaned_question = lemmatize_text(cleaned_question)
                
                # Handle answers
                if qa['is_impossible']:
                    answer_text = 'none'
                    answer_start = -1
                else:
                    answer = qa['answers'][0]  # Consider only the first answer
                    answer_text = answer['text']
                    answer_start = answer['answer_start']
                
                processed_data.append({
                    'id': qa['id'],
                    'context': cleaned_context,
                    'question': cleaned_question,
                    'answer_text': answer_text,
                    'answer_start': answer_start,
                    'is_impossible': qa['is_impossible']
                })
    
    return processed_data

In [6]:
train_data = load_squad_data("./train-v2.0.json")

In [8]:
train_data[0]

{'title': 'Beyoncé',
 'paragraphs': [{'qas': [{'question': 'When did Beyonce start becoming popular?',
     'id': '56be85543aeaaa14008c9063',
     'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
     'is_impossible': False},
    {'question': 'What areas did Beyonce compete in when she was growing up?',
     'id': '56be85543aeaaa14008c9065',
     'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
     'is_impossible': False},
    {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
     'id': '56be85543aeaaa14008c9066',
     'answers': [{'text': '2003', 'answer_start': 526}],
     'is_impossible': False},
    {'question': 'In what city and state did Beyonce  grow up? ',
     'id': '56bf6b0f3aeaaa14008c9601',
     'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
     'is_impossible': False},
    {'question': 'In which decade did Beyonce become famous?',
     'id': '56bf6b0f3aeaaa14008c9602',
     'answers': [{'text

In [7]:
processed_train_data = preprocess_squad(train_data)

Processing articles: 100%|██████████| 442/442 [01:13<00:00,  6.02it/s]


In [8]:
processed_train_data

[{'id': '56be85543aeaaa14008c9063',
  'context': 'beyonc giselle knowlescarter bijnse beeyonsay born september american singer songwriter record producer actress born raised houston texas performed various singing dancing competitions child rose fame late lead singer rb girlgroup destinys child managed father mathew knowles group became one worlds bestselling girl groups time hiatus saw release beyoncs debut album dangerously love established solo artist worldwide earned five grammy awards featured billboard hot numberone singles crazy love baby boy',
  'question': 'beyonce start becoming popular',
  'answer_text': 'in the late 1990s',
  'answer_start': 269,
  'is_impossible': False},
 {'id': '56be85543aeaaa14008c9065',
  'context': 'beyonc giselle knowlescarter bijnse beeyonsay born september american singer songwriter record producer actress born raised houston texas performed various singing dancing competitions child rose fame late lead singer rb girlgroup destinys child managed fa

In [9]:
import pandas as pd
df = pd.DataFrame(processed_train_data)

In [10]:
df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,is_impossible
0,56be85543aeaaa14008c9063,beyonc giselle knowlescarter bijnse beeyonsay ...,beyonce start becoming popular,in the late 1990s,269,False
1,56be85543aeaaa14008c9065,beyonc giselle knowlescarter bijnse beeyonsay ...,areas beyonce compete growing,singing and dancing,207,False
2,56be85543aeaaa14008c9066,beyonc giselle knowlescarter bijnse beeyonsay ...,beyonce leave destinys child become solo singer,2003,526,False
3,56bf6b0f3aeaaa14008c9601,beyonc giselle knowlescarter bijnse beeyonsay ...,city state beyonce grow,"Houston, Texas",166,False
4,56bf6b0f3aeaaa14008c9602,beyonc giselle knowlescarter bijnse beeyonsay ...,decade beyonce become famous,late 1990s,276,False


In [11]:
df.to_csv("train-v2.0.csv", index=False)