In [1]:
import pandas as pd
import numpy as np
import re
import os
import json

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
train_data = pd.read_csv('data_in/quora_questions_pairs/train.csv', encoding='utf-8', error_bad_lines=False)

b'Skipping line 27290: expected 6 fields, saw 9\nSkipping line 47638: expected 6 fields, saw 7\nSkipping line 54392: expected 6 fields, saw 7\nSkipping line 74782: expected 6 fields, saw 8\nSkipping line 81569: expected 6 fields, saw 7\nSkipping line 88323: expected 6 fields, saw 7\nSkipping line 108520: expected 6 fields, saw 8\nSkipping line 115307: expected 6 fields, saw 7\n'
b'Skipping line 148847: expected 6 fields, saw 7\nSkipping line 182440: expected 6 fields, saw 10\nSkipping line 215926: expected 6 fields, saw 9\nSkipping line 229285: expected 6 fields, saw 7\nSkipping line 242574: expected 6 fields, saw 7\n'
b'Skipping line 262519: expected 6 fields, saw 9\nSkipping line 275798: expected 6 fields, saw 7\nSkipping line 295754: expected 6 fields, saw 9\nSkipping line 322347: expected 6 fields, saw 7\nSkipping line 348986: expected 6 fields, saw 7\n'
b'Skipping line 395697: expected 6 fields, saw 7\nSkipping line 435779: expected 6 fields, saw 9\nSkipping line 449060: expected 

In [3]:
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0


In [4]:
train_data.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [5]:
len(train_data)

788058

### 중복인 데이터와 아닌 데이터를 나눈 후 라벨 갯수의 균형을 맞춘다.

In [9]:
train_pos_data = train_data.loc[train_data['is_duplicate'] == 1]
train_neg_data = train_data.loc[train_data['is_duplicate'] == 0]

# class_difference = len(train_neg_data) - len(train_pos_data)
# sample_frac = 1 - (class_difference / len(train_neg_data))

# train_neg_data = train_neg_data.sample(frac=sample_frac)

In [12]:
(len(train_neg_data) - len(train_pos_data)) / len(train_neg_data)

206332

In [6]:
class_difference/len(train_neg_data)

0.7094298622619841

In [7]:
len(train_neg_data), len(train_pos_data)

(290842, 290842)

In [8]:
train_data = pd.concat([train_neg_data, train_pos_data])

In [9]:

type(train_neg_data)

pandas.core.frame.DataFrame

In [10]:
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
489088,261494,377652,377653,I am a student of instrumentation and control ...,Marvel aw boss node?,0.0
591923,297722,420114,420115,How can I better organize my thoughts before s...,Can primates speak?,0.0
126119,65313,113373,113374,Is it okay to offer someone who doesn't like y...,What do you call someone who is attracted to w...,0.0
720437,379541,511076,511077,What should I know as I am going to start new ...,I am an electrical graduate can any one sugges...,0.0
310467,156229,244545,244546,What is the equivalent resistance between the ...,What is the equivalent resistance between A an...,0.0


In [11]:
train_data.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [12]:
FILTERS = "([~.,!?\"':;)(])"

In [13]:
change_filter = re.compile(FILTERS)

In [14]:
questions1 = [str(s) for s in train_data['question1']]
questions2 = [str(s) for s in train_data['question2']]

filtered_questions1 = list()
filtered_questions2 = list()

for q in questions1:
    filtered_questions1.append(re.sub(change_filter, '', q).lower())

for q in questions2:
    filtered_questions2.append(re.sub(change_filter, '', q).lower())

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_questions1 + filtered_questions2)

In [16]:
questions1_sequence = tokenizer.texts_to_sequences(filtered_questions1)
questions2_sequence = tokenizer.texts_to_sequences(filtered_questions2)

In [17]:
max_sequence_length = 31

q1_data = pad_sequences(questions1_sequence, maxlen = max_sequence_length, padding='post')
q2_data = pad_sequences(questions2_sequence, maxlen = max_sequence_length, padding='post')

In [18]:
word_vocab = tokenizer.word_index

In [19]:
labels = np.array(train_data['is_duplicate'], dtype=int)

In [20]:
labels

array([0, 0, 0, ..., 1, 1, 1])

In [21]:
q1_data.shape, q2_data.shape, labels.shape, len(word_vocab)

((581684, 31), (581684, 31), (581684,), 87774)

In [22]:
data_configs = {}
data_configs['vocab'] = word_vocab
data_configs['vocab_size'] = len(word_vocab) + 1

In [23]:
data_configs['vocab_size']

87775

In [24]:
np.save(open('data_in/quora_questions_pairs/q1_train.npy', 'wb'), q1_data)
np.save(open('data_in/quora_questions_pairs/q2_train.npy', 'wb'), q2_data)
np.save(open('data_in/quora_questions_pairs/label_train.npy', 'wb'), labels)

json.dump(data_configs, open('data_in/quora_questions_pairs/data_configs.json', 'w'))



# Test data preprocessing

In [25]:
test_data = pd.read_csv('data_in/quora_questions_pairs/test.csv', encoding='utf-8')

In [26]:
valid_ids = [type(x) == int for x in test_data.test_id]
test_data = test_data[valid_ids].drop_duplicates()

In [27]:
list(filter(lambda x: x == False, valid_ids))

[]

In [28]:
len(test_data)

2345796

In [29]:
test_questions1 = [str(s) for s in test_data['question1']]
test_questions2 = [str(s) for s in test_data['question2']]

filtered_test_questions1 = list()
filtered_test_questions2 = list()

for q in test_questions1:
    filtered_test_questions1.append(re.sub(change_filter, '', q).lower())

for q in test_questions2:
    filtered_test_questions2.append(re.sub(change_filter, '', q).lower())

In [30]:
change_filter

re.compile(r'([~.,!?"\':;)(])', re.UNICODE)

In [31]:
test_questions1_sequence = tokenizer.texts_to_sequences(filtered_test_questions1)
test_questions2_sequence = tokenizer.texts_to_sequences(filtered_test_questions2)

In [32]:
test_q1_data = pad_sequences(test_questions1_sequence, maxlen=max_sequence_length, padding='post')
test_q2_data = pad_sequences(test_questions2_sequence, maxlen=max_sequence_length, padding='post')

### 케글 제출용 id 값

In [33]:
test_id = np.array(test_data['test_id'])
test_q1_data.shape, test_q2_data.shape, test_id.shape

((2345796, 31), (2345796, 31), (2345796,))

In [34]:
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'
TEST_ID_DATA = 'test_id.npy'

np.save(open('data_in/' + TEST_Q1_DATA, 'wb'), test_q1_data)
np.save(open('data_in/' + TEST_Q2_DATA , 'wb'), test_q2_data)
np.save(open('data_in/' + TEST_ID_DATA , 'wb'), test_id)