In [1]:
import json
import re
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (word_tokenize), the stop-word lists (stopwords.words) and the
# WordNet data (WordNetLemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15710\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15710\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\15710\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Loading

Load the SQuAD data from a JSON file.

In [2]:
# Shared WordNet lemmatizer instance; lemmatize_text() reads this global.
lemmatizer = WordNetLemmatizer()

In [3]:
def load_squad_data(file_path):
    """Read a SQuAD-style JSON file and return its article list.

    Args:
        file_path: Path to a UTF-8 encoded JSON file whose top-level object
            contains a 'data' key.

    Returns:
        The value stored under the top-level 'data' key.

    Raises:
        KeyError: If the parsed JSON object has no 'data' key.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        squad = json.loads(handle.read())
    return squad['data']

## Data Cleaning

In [4]:
def clean_text(text):
    """Strip everything except ASCII letters and whitespace, then lowercase.

    Characters that are neither an ASCII letter nor whitespace (digits,
    punctuation, accented letters, ...) are deleted outright rather than
    replaced by a space, so the fragments on either side are joined:
    'Roman-Gaulish' becomes 'romangaulish' and '10th' becomes 'th'.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_spaces.lower()

def remove_stopwords(text):
    """Tokenize ``text`` and drop tokens found in NLTK's English stop-word list.

    Tokens are lowercased only for the membership check; the tokens that
    survive keep their original casing.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def lemmatize_text(text):
    """Tokenize ``text`` and lemmatize each token with the shared lemmatizer.

    Each token is passed to ``lemmatizer.lemmatize`` with no extra arguments.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

In [5]:
def preprocess_squad(data):
    """Flatten SQuAD articles into one cleaned record per question.

    Each paragraph's context and each question are passed through
    clean_text() and then remove_stopwords(). Lemmatization is not applied.

    Args:
        data: Iterable of article dicts. Each article has a 'paragraphs'
            list; each paragraph has a 'context' string and a 'qas' list;
            each qa has 'id' and 'question', and optionally 'answers' and
            'is_impossible'.

    Returns:
        A list of dicts with the keys 'id', 'context', 'question',
        'answer_text', 'answer_start' and 'is_impossible'. Questions that
        are flagged impossible, or that carry no answers, get
        answer_text='' and answer_start=-1.

    Note:
        'answer_text' and 'answer_start' are copied unchanged from the
        input. 'answer_start' therefore refers to the ORIGINAL paragraph
        context and is not a valid index into the cleaned 'context' string
        stored in the record.
    """
    processed_data = []

    for article in tqdm(data, desc="Processing articles"):
        for paragraph in article['paragraphs']:
            # Clean the context once per paragraph; all its questions share it.
            cleaned_context = remove_stopwords(clean_text(paragraph['context']))

            for qa in paragraph['qas']:
                cleaned_question = remove_stopwords(clean_text(qa['question']))

                # Files without the v2.0 'is_impossible' flag are treated as
                # answerable instead of raising KeyError.
                is_impossible = qa.get('is_impossible', False)
                answers = qa.get('answers') or []

                if is_impossible or not answers:
                    # No usable answer: either flagged impossible or the
                    # answers list is missing/empty.
                    answer_text = ''
                    answer_start = -1
                else:
                    first_answer = answers[0]  # Consider only the first answer
                    answer_text = first_answer['text']
                    answer_start = first_answer['answer_start']

                processed_data.append({
                    'id': qa['id'],
                    'context': cleaned_context,
                    'question': cleaned_question,
                    'answer_text': answer_text,
                    'answer_start': answer_start,
                    'is_impossible': is_impossible
                })

    return processed_data

In [6]:
# NOTE: despite the variable name, the file loaded here is the SQuAD v2.0
# *dev* split ("dev-v2.0.json").
train_data = load_squad_data("./dev-v2.0.json")

In [7]:
# Flatten the loaded articles into one cleaned record per question.
processed_train_data = preprocess_squad(train_data)

Processing articles: 100%|██████████| 35/35 [00:05<00:00,  5.92it/s]


In [8]:
# Show only the first two records; displaying the whole list floods the
# notebook output.
processed_train_data[:2]

[{'id': '56ddde6b9a695914005b9628',
  'context': 'normans norman nourmands french normands latin normanni people th th centuries gave name normandy region france descended norse norman comes norseman raiders pirates denmark iceland norway leader rollo agreed swear fealty king charles iii west francia generations assimilation mixing native frankish romangaulish populations descendants would gradually merge carolingianbased cultures west francia distinct cultural ethnic identity normans emerged initially first half th century continued evolve succeeding centuries',
  'question': 'country normandy located',
  'answer_text': 'France',
  'answer_start': 159,
  'is_impossible': False},
 {'id': '56ddde6b9a695914005b9629',
  'context': 'normans norman nourmands french normands latin normanni people th th centuries gave name normandy region france descended norse norman comes norseman raiders pirates denmark iceland norway leader rollo agreed swear fealty king charles iii west francia generatio

In [9]:
import pandas as pd
# Build a DataFrame with one row per processed question record.
df = pd.DataFrame(processed_train_data)

In [10]:
# Preview the first rows of the processed data.
df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,is_impossible
0,56ddde6b9a695914005b9628,normans norman nourmands french normands latin...,country normandy located,France,159,False
1,56ddde6b9a695914005b9629,normans norman nourmands french normands latin...,normans normandy,10th and 11th centuries,94,False
2,56ddde6b9a695914005b962a,normans norman nourmands french normands latin...,countries norse originate,"Denmark, Iceland and Norway",256,False
3,56ddde6b9a695914005b962b,normans norman nourmands french normands latin...,norse leader,Rollo,308,False
4,56ddde6b9a695914005b962c,normans norman nourmands french normands latin...,century normans first gain separate identity,10th century,671,False


In [11]:
# Save the processed records to CSV; index=False leaves out the row index.
df.to_csv("dev-v2.0.csv", index=False)