In [24]:
import numpy as np 
import pandas as pd
import torch
from datasets import load_dataset

newsqa = load_dataset("StellarMilk/newsqa")
print(newsqa)

train = newsqa["train"]

DatasetDict({
    train: Dataset({
        features: ['paragraph', 'questions', 'answers', 'questions_answers'],
        num_rows: 10327
    })
    validation: Dataset({
        features: ['paragraph', 'questions', 'answers', 'questions_answers'],
        num_rows: 574
    })
    test: Dataset({
        features: ['paragraph', 'questions', 'answers', 'questions_answers'],
        num_rows: 574
    })
})


In [25]:
df = train.to_pandas()
df.head()
df.isnull().sum()

paragraph            0
questions            0
answers              0
questions_answers    0
dtype: int64

## Data Preprocessing(cleaning)

#### we use a contraction map to map contractions to their respective full forms: to reduce extra single/double letter tokens which would, by themselves, have no meaning
#### convert all text to lower case; extract only letters as we only care about the words and their general- static meaning; also replace all whitespace characters with a single space

In [26]:
import pandas as pd
import re

CONTRACTION_MAP = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot",
    "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    "he'll've": "he will have", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you all have", "you're": "you are", "you've": "you have"
}
# above is generated by gpt
# Regex pattern to find contractions quickly
CONTRACTIONS_RE = re.compile('({})'.format('|'.join(re.escape(key) for key in CONTRACTION_MAP.keys())), re.IGNORECASE)

def expand_contractions(text, CONTRACTION_MAP):
    def replace(match):
        try:
            return CONTRACTION_MAP[match.group(0).lower()]
        except KeyError:
            return
    return CONTRACTIONS_RE.sub(replace, text)

#above was done just to reduce number of tokens; as certain letters like 's', 'll' etc would also be counted.

def clean_text(text):
    text = expand_contractions(text)
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['paragraph'] = df['paragraph'].apply(clean_text)

## Model initialization

In [27]:
import gensim

model = gensim.models.Word2Vec(
    window = 10,                               # Radius of the sliding window
    min_count = 2,                             # ignores all words with less than min_count frequency
    vector_size = 200,                         # vector embeddings size
    epochs = 10           
)


### Creating the corpus

In [36]:
corpus = [i.split() for i in list(df['paragraph'])]         # list of sentences which has been split

### Building the Vocab

In [30]:
model.build_vocab(corpus)

### Training the model

In [31]:
model.train(corpus, total_examples = model.corpus_count, epochs = model.epochs)

(39736239, 51318650)

## Example to show the use of this model:
#### we can find the nearest words to a particular word in terms of its meaning learnt by training the model; this measure of how near a word is to a given word is calculated by cosine similarity between the given word's embeddings and other words' embeddings

In [40]:
model.wv.most_similar('apple')

[('apples', 0.8100547194480896),
 ('app', 0.7732735872268677),
 ('microsoft', 0.7679699659347534),
 ('google', 0.7653537392616272),
 ('iphone', 0.7572395205497742),
 ('samsung', 0.7503407597541809),
 ('android', 0.7503293752670288),
 ('iphones', 0.7369444966316223),
 ('apps', 0.7294312119483948),
 ('software', 0.7188068628311157)]