# Splitting data into training and testing

 Total: 1306122

In [None]:
import pandas as pd

filepath_train = '../data/train.csv'
df_train = pd.read_csv(filepath_train,index_col=0)
print('training dataset size: ',df_train.shape)

# Save new split training and testing data

In [None]:
df_train['qid_base_hex'] = df_train.index
df_train['qid_base_ten'] = df_train['qid_base_hex'].apply(lambda x : int(x, 16))

msk = np.random.rand(len(df_train)) < 0.8

train = df_train[msk][['question_text','target']]
test = df_train[~msk][['question_text','target']]

train.to_csv('../data/mytrain.csv')
test.to_csv('../data/mytest.csv')

# Read from train and test data and visualize them

In [None]:
df_train = pd.read_csv('../data/mytrain.csv')
df_test = pd.read_csv('../data/mytest.csv')

In [None]:
df_train['train_test'] = 'train'
df_test['train_test'] = 'test'
df = pd.concat([df_train, df_test])
df['qid_base_ten'] = df['qid'].apply(lambda x : int(x, 16))


min_qid = df['qid_base_ten'].min()
df['qid_base_ten_normalized'] = df['qid_base_ten'].apply(lambda x : (x - min_qid)/min_qid)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.clf()
plt.figure(figsize=(18, 8));
plt.scatter(x=df[df['train_test']=='train']['qid_base_ten_normalized'], y=[1]*df[df['train_test']=='train'].shape[0], label='Train', s=300);
plt.scatter(x=df[df['train_test']=='test']['qid_base_ten_normalized'], y=[1]*df[df['train_test']=='test'].shape[0], label='Test',s=2);
plt.xlabel('qid_base_ten_normalized');
plt.ylabel('N/A');
plt.title('qid_base_ten_normalized for train and test datasets')
plt.legend()
plt.show()

# Finally the reading of training and testing data would be:

In [None]:
import pandas as pd
df_train = pd.read_csv('../data/mytrain.csv')
df_test = pd.read_csv('../data/mytest.csv')
# or read qid as index:
# df_train = pd.read_csv('../../data/mytrain.csv',index_col=0)
# df_test = pd.read_csv('../../data/mytest.csv',index_col=0)

# Stats over corpus

In [None]:
sentences = df_train['question_text'].values
sen_len = [len(line.split()) for line in sentences]

In [None]:
print('Min len:',min(sen_len))
print('Max len:',max(sen_len))

In [None]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt



plt.hist(sen_len, bins =  np.arange(0,140,2)) 
plt.title("histogram") 
plt.xlabel('Sentence lens')
plt.ylabel('Count')
plt.show()

In [None]:
import re
tokenized_sen = [re.findall(r"\w+|[^\w\s]", text, re.UNICODE) for text in sentences]
token_len = [len(e) for e in tokenized_sen]
print('Min len:',min(token_len))
print('Max len:',max(token_len))

In [None]:
token_len.sort()
x = range(600)
y = []
prev = 0
for e in x:
    try:
        prev = token_len.index(e)
        y.append(token_len.index(e))
    except:
        y.append(prev)

y_percent = [e/len(token_len) for e in y]

In [None]:
plt.plot(x, y_percent, 'ro')
plt.title("Percentage") 
plt.xlabel('Number of tokens')
plt.ylabel('Percentages')
plt.show()

`(1-y_percentage[100])*len(token_len) = 22`, which means only about 22 sentences are longer than 100 tokens

<br/><br/><br/><br/><br/><br/><br/><br/><br/><br/>


# Starting with mytrain.csv and mytest.csv

## labels of samples with \[math\] that need to be modified

1. Proper math questions that were classified as insincere by Quora:  
`[[22402,0],
[30914,0],
[101048,0],
[131075,0],
[134731,0],
[185318,0],
[224464,0],
[262046,0],
[267327,0],
[354833,0],
[405552,0],
[407980,0],
[422950,0],
[583645,0], 
[584827,0],
[649125,0],
[731531,0],
[848889,0],
[875208,0], 
[930275,0], 
[972559,0], 
[976850,0], 
[1007192,0], 
[1012853,0], 
[1044032,0]]`

2. To change the label accordingly

In [None]:
import pandas as pd
import numpy as np
df_train = pd.read_csv('../data/mytrain.csv')
list_to_be_handled = [[22402,0],
[30914,0],
[101048,0],
[131075,0],
[134731,0],
[185318,0],
[224464,0],
[262046,0],
[267327,0],
[354833,0],
[405552,0],
[407980,0],
[422950,0],
[583645,0], 
[584827,0],
[649125,0],
[731531,0],
[848889,0],
[875208,0], 
[930275,0], 
[972559,0], 
[976850,0], 
[1007192,0], 
[1012853,0], 
[1044032,0]]
for [ind, target] in list_to_be_handled:
    print('%d orginal target %d changing to ' % (ind,df_train.iloc[ind].target),end='')
    df_train.loc[ind,'target'] = target
    print(df_train.iloc[ind].target)

3. Remove sentences with newline characters

In [None]:
sentences = df_train['question_text'].values
is_newline = [1 if '\n' in e else 0 for e in sentences] # there are 6 sentences containing \n
df_train_newline_removed = df_train[~df_train.question_text.str.contains('\n')]

4. Since there are around 20 sentences with more than 100 tokens, to speed up training process, remove them

In [None]:
import re
########################### following code compares regex vs spacy in terms of tokenization
tokens = [re.findall(r"\w+|[^\w\s]", text, re.UNICODE) for text in df_train_newline_removed.question_text.values]

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

def tokenizer(x: str):
    return [w.text for w in
        SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

tokens_spacy=[]
for i in range(len(df_train_newline_removed.question_text.values)):
    if i % 10000 == 0:
        print(i)
    tokens_spacy.append(tokenizer(df_train_newline_removed.question_text.values[i]))

import numpy as np
tokens_np = np.array(tokens)
tokens_spacy_np = np.array(tokens_spacy)


for i in range(10000):
    if tokens_np[i] != tokens_spacy_np[i]:
        print(i,tokens_np[i],tokens_spacy_np[i])

######################### Apparently spacy makes more sense        
        
        
        


df_train_newline_removed['tokens'] = tokens_spacy
token_len = [len(e) for e in tokens]
df_train_newline_removed['token_len'] = token_len
df_train_filtered = df_train_newline_removed[df_train_newline_removed['token_len']<=100]

In [21]:
print('Originally file contains %d rows' % 1044886)
print('There are %d samples' % df_train.shape[0])
print('After removing data containing newline, there are %d samples' % df_train_newline_removed.shape[0])
print('After removing data with more than 100 tokens, there are %d samples' % df_train_filtered.shape[0])

Originally file contains 1044886 rows
There are 1044865 samples
After removing data containing newline, there are 1044859 samples
After removing data with more than 100 tokens, there are 1044839 samples


5. Join tokens so that each word and punctuations are separated

In [None]:
def token_join(row):
    return ' '.join(row['tokens'])

df_train_filtered['tokenized'] = df_train_filtered.apply(lambda row: token_join(row),axis=1)

6. Write question texts into file for embedding

In [None]:
# "insincere" has a value of 1, otherwise 0
sincere = [1 if e == 0 else 0 for e in df_train_filtered.target]
insincere = [0 if e == 0 else 1 for e in df_train_filtered.target]
df_train_filtered['sincere'], df_train_filtered['insincere'] = sincere, insincere
df_train_filtered.to_csv('filtered_train_data_all.csv')
np.savetxt('train_no_newline_no_quote_tokenized',df_train_filtered.tokenized.values,fmt='%s')