In [1]:
from pathlib import Path
import pandas as pd
from unidecode import unidecode
from bs4 import BeautifulSoup
import torch
from fastai.text import *
import dill as pickle
from sklearn.model_selection import train_test_split

# Directory layout, relative to the notebook's working directory
DATA = Path('../data/')
LM = DATA/'language-model/'
CLS = DATA/'classifier/'
TMP = DATA/'tmp/'

# Create each output directory unless it is already there
for out_dir in (LM, CLS, TMP):
    out_dir.mkdir(exist_ok=True)

#### 1. Read Data

In [2]:
# Load the Kaggle train and test CSVs and stack them into a single frame.
# NaNs are blanked per file *before* concatenation, so afterwards the only NaNs
# left in `id` / `test_id` are the ones concat introduces for rows that came from
# the other file -- the dropna calls below rely on exactly that.
# ignore_index=True gives every row a unique 0..n-1 label. Without it each file
# keeps its own 0-based labels, so the two index arrays below would overlap and
# could not be used to positionally index the per-row arrays built further down.
data = pd.concat([
    pd.read_csv('../data/kaggle-dataset/train.csv', encoding='utf-8').fillna(''),
    pd.read_csv('../data/kaggle-dataset/test.csv', encoding='utf-8').fillna('')
], ignore_index=True)

# Row positions of the rows that have an `id` and of the rows that have a `test_id`
trn_idx = np.array(data.dropna(subset=['id']).index)
tst_idx = np.array(data.dropna(subset=['test_id']).index)

data.head()

Unnamed: 0,id,is_duplicate,qid1,qid2,question1,question2,test_id
0,0.0,0.0,1.0,2.0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,
1,1.0,0.0,3.0,4.0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,
2,2.0,0.0,5.0,6.0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,
3,3.0,0.0,7.0,8.0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,
4,4.0,0.0,9.0,10.0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,


#### 2. Preprocess

In [3]:
def fixup(x):
    """Transliterate `x` with `unidecode`; return a single space if nothing is left."""
    # NOTE: an HTML-stripping step (BeautifulSoup(x, "lxml").get_text()) is
    # currently disabled and intentionally not applied here.
    cleaned = unidecode(x)
    return cleaned if cleaned else ' '

# Clean both question columns and store the results back as plain strings
for question_col in ('question1', 'question2'):
    data[question_col] = data[question_col].apply(fixup).values.astype(str)

#### 3. Tokenize

In [4]:
# Tokenize each question column (presumably spread over CPU cores by the fastai
# helpers `partition_by_cores` / `proc_all_mp` -- confirm against the fastai version in use).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; both hand back the column as a NumPy array.
tokq1 = Tokenizer().proc_all_mp(partition_by_cores(data.question1.values))
tokq2 = Tokenizer().proc_all_mp(partition_by_cores(data.question2.values))

# Target column as a NumPy array, one entry per row of `data`
labels = data['is_duplicate'].values

#### 4. Make vocabulary

In [5]:
# Vocabulary limits: how many of the most frequent tokens to consider (max_vocab)
# and the occurrence count a token has to exceed to be kept (min_freq)
max_vocab, min_freq = 60000, 2

In [6]:
# Collect every token from both question columns (question1 first, then question2)
all_toks = []
for tokenized_col in (tokq1, tokq2):
    for toklst in tokenized_col:
        all_toks.extend(toklst)

# Integer -> string table: the two special tokens take ids 0 and 1; after them come
# those of the `max_vocab` most frequent tokens that occur more than `min_freq` times
tok_counts = collections.Counter(all_toks)
itos = ['_unk_', '_pad_']
itos += [tok for tok, count in tok_counts.most_common(max_vocab) if count > min_freq]

# String -> integer table; any token missing from the vocabulary maps to 0 ('_unk_')
tok_to_id = {tok: i for i, tok in enumerate(itos)}
stoi = collections.defaultdict(lambda: 0, tok_to_id)

In [7]:
# Save vocabulary
# `pickle` is dill here (see the imports cell); `stoi` carries a lambda default
# factory, which the standard-library pickle cannot serialize.
# The files are opened in `with` blocks so each handle is flushed and closed
# deterministically instead of being left to garbage collection.
with (DATA/'itos.p').open('wb') as f:
    pickle.dump(itos, f)
with (DATA/'stoi.p').open('wb') as f:
    pickle.dump(stoi, f)

#### 5. Save data

In [8]:
# Map token strings to vocab ids.
# Questions have different token counts, so the nested lists are ragged.
# dtype=object is passed explicitly because NumPy >= 1.24 raises ValueError when
# asked to build an array from ragged nested sequences without it (earlier
# versions warned or silently produced the same object array).
tokid_q1 = np.array([[stoi[i] for i in toklst] for toklst in tokq1], dtype=object)
tokid_q2 = np.array([[stoi[i] for i in toklst] for toklst in tokq2], dtype=object)

A. Language Model

In [9]:
# Language-model corpus: every question1 entry followed by every question2 entry
lm = np.concatenate((tokid_q1, tokid_q2))

# Hold out 10% of the questions for validation (fixed seed, so the split is repeatable)
lm_trn, lm_val = train_test_split(lm, test_size=0.1, random_state=0)

# Write both splits to the language-model directory
for fname, split in (('lm_trn.npy', lm_trn), ('lm_val.npy', lm_val)):
    np.save(LM/fname, split)

B. Classifier

In [11]:
# Classifier
# Pair up the two questions of every row. Assuming tokid_q1 / tokid_q2 are 1-D
# arrays with one entry per row, `[None]` turns each into a single-row 2-D view,
# concatenation stacks them to (2, n_rows), and `.T` gives (n_rows, 2) with the
# columns ordered [question1, question2].
cls = np.concatenate([tokid_q1[None], tokid_q2[None]]).T

# Get train and test sets
cls_trn = cls[trn_idx]
cls_tst = cls[tst_idx]

# Save to disk
np.save(CLS/'cls_tst.npy', cls_tst)
np.save(CLS/'cls_trn.npy', cls_trn)
# The target array is named `labels` (defined in the tokenize cell); the previous
# `label` was an undefined name and raised NameError.
np.save(CLS/'cls_trn_lbl.npy', labels[trn_idx])