# Remove duplicates and split train/test sets

For the truevoice-intent dataset:
- See the issue at https://github.com/PyThaiNLP/truevoice-intent/issues/2
- Based on https://github.com/PanthonImem/truevoice-intent/blob/master/truevoice-intent_remove-dup_sample-label.ipynb


In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from IPython.display import display

In [2]:
# --- configuration ---
RANDOM_SEED = 0       # seed passed to every shuffle and train/test split in this notebook
TEST_PORTION = 0.20   # fraction of each sampled group that goes to the test set

# seed Python's built-in random module
random.seed(RANDOM_SEED)

# load the original train and test splits
old_train_df, old_test_df = (
    pd.read_csv(csv_path) for csv_path in ('mari_train.csv', 'mari_test.csv')
)

# Merge the old train and test sets into a single frame. Each input keeps its
# own row index (no ignore_index), so index labels can repeat after the merge.
merged_df = pd.concat([old_train_df, old_test_df])

# Drop rows that are identical across all text and label columns. Rows that
# share the same 'texts' but differ in any of the other listed columns are kept.
undup_df = merged_df.drop_duplicates(subset = ['texts','texts_deepcut','action','object','destination'])

print('Got rid of %d duplicated rows' % (len(merged_df) - len(undup_df)))
# Use '%' formatting here: passing the count as a second print() argument
# leaves the literal '%d' in the output.
print('%d rows remaining' % len(undup_df))

# shuffle data
undup_df = shuffle(undup_df, random_state = RANDOM_SEED)

Got rid of 2746 duplicated rows
%d rows remaining 13429


In [3]:
# Text length (number of characters) per row; the summary below is used to
# decide on BUCKET_SIZE for the length buckets.
text_lengths = undup_df["texts"].str.len()
undup_df["len"] = text_lengths
undup_df.describe()

Unnamed: 0,len
count,13429.0
mean,49.870132
std,30.505069
min,4.0
25%,30.0
50%,42.0
75%,61.0
max,440.0


In [4]:
# Group texts by length so that the train/test split can be sampled within
# each length bucket, keeping the text-length distribution similar in both sets.
#
# Texts are put into 9 buckets of these lengths:
# 0-15, 16-30, 31-45, 46-60, 61-75, 76-90, 91-105, 106-120, 121+
BUCKET_SIZE = 15  # bucket width, based on dataset characteristics
BUCKET_NUM = 9    # number of buckets; the last one is open-ended
def bucket(size):
    """Return the 0-based length-bucket index for a text of length `size`.

    Lengths 1..BUCKET_SIZE map to bucket 0, the next BUCKET_SIZE lengths to
    bucket 1, and so on. Anything past the last boundary is clamped into
    bucket BUCKET_NUM - 1. A length of 0 also lands in bucket 0.
    """
    raw_index = int((size - 1) / BUCKET_SIZE)
    return min(raw_index, BUCKET_NUM - 1)

# Tag every row with its length bucket, then build one DataFrame per bucket
# value present in the data.
bucket_ids = undup_df["len"].apply(bucket)
undup_df["size"] = bucket_ids
undup_bysize_ls = [
    pd.DataFrame(bucket_rows)
    for _bucket_id, bucket_rows in undup_df.groupby("size", as_index=False)
]
# the full frame is released here; the cells below work on the per-bucket frames
del undup_df

In [5]:
# Split train/test separately for every (length bucket, label) group so that
# both sets end up with similar label and text-length distributions.
train_ls, test_ls = [], []

# Iterate over the per-bucket DataFrames that actually exist. groupby() emits
# no frame for an empty bucket, so indexing the list with range(BUCKET_NUM)
# would raise IndexError (or mislabel buckets) if any bucket had no rows.
for i, df in enumerate(undup_bysize_ls):
    # Report the bucket id stored in the frame; fall back to the list
    # position when the frame is empty.
    bucket_id = i if df.empty else int(df["size"].iloc[0])
    print("\nLength Bucket: %d" % bucket_id)
    print("Samples   Label")
    for label in df['destination'].unique():
        # filter once and reuse for both the report line and the split
        label_df = df[df.destination == label]
        print('%7d   %s' % (label_df.shape[0], label))
        if len(label_df) < 2:
            # train_test_split raises ValueError when a group is too small to
            # leave at least one row on the train side; keep such rows for training.
            train_ls.append(label_df)
            continue
        _train_df, _test_df = train_test_split(label_df, test_size=TEST_PORTION, random_state=RANDOM_SEED)
        train_ls.append(_train_df)
        test_ls.append(_test_df)

# Stitch the per-group samples back together, drop the helper "size" column,
# and shuffle so the rows are no longer ordered by bucket and label.
train_df = shuffle(pd.concat(train_ls).drop(columns=["size"]), random_state=RANDOM_SEED)
test_df = shuffle(pd.concat(test_ls).drop(columns=["size"]), random_state=RANDOM_SEED)

print()
print('Train Set Size', train_df.shape[0])
print('Test Set Size', test_df.shape[0])

# Multi-intent case (presumably the same text labeled with more than one intent):
# any test row whose text also occurs in the train set is moved to train,
# so that no text is shared between the two sets.
train_texts = set(train_df['texts'].values)
test_texts = set(test_df['texts'].values)
movels = train_texts & test_texts

# compute the membership mask once and use it for both the move and the removal
is_shared = test_df['texts'].isin(movels)
train_df = pd.concat([train_df, test_df[is_shared]])
test_df = test_df[~is_shared]

print()
print('post-moving:')
print('Train Set Size', train_df.shape[0])
print('Test Set Size', test_df.shape[0])

display(train_df.head(3))


Length Bucket: 0
Samples   Label
    133   billing and payment
     55   promotions
     71   other queries
      6   true money
     50   internet
     11   international dialing
     27   lost and stolen

Length Bucket: 1
Samples   Label
    666   other queries
   1048   billing and payment
    738   promotions
    447   internet
     81   lost and stolen
     63   true money
    128   international dialing

Length Bucket: 2
Samples   Label
   1408   billing and payment
    673   other queries
    977   promotions
     68   true money
    567   internet
    162   international dialing
     58   lost and stolen

Length Bucket: 3
Samples   Label
    424   other queries
    417   internet
   1028   billing and payment
    542   promotions
     51   true money
     30   lost and stolen
     72   international dialing

Length Bucket: 4
Samples   Label
    218   other queries
     34   true money
    613   billing and payment
    238   internet
    305   promotions
     37   international

Unnamed: 0,texts,texts_deepcut,action,object,destination,len
12072,สาบถาม ช๊อป ที่สามารถติดต่อเช็คเครื่องมือถือมี...,สาบถาม ช๊อป ที่ สามารถ ติดต่อ เช็ค เครื่องมือถ...,request,information,other queries,54
12764,อยากถามว่ามีซิมอยู่แล้ว จะสมัครเล่นเน็ตได้ไหม,อยาก ถาม ว่า มี ซิม อยู่แล้ว จะ สมัคร เล่น เน็...,buy,package,promotions,45
11499,สอบถามแพจแกจ อินเตอร์เน็ต ที่จะใช้ในต่างประเทศ,สอบถาม แพจแกจ อินเตอร์เน็ต ที่ จะ ใช้ ใน ต่าง ...,enquire,roaming,international dialing,46


In [6]:
# Sanity-check the split: compare text-length statistics of train and test
print()
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print("Text length distribution in %s Set" % split_name)
    display(split_df.describe())

# remove the helper "len" column before counting and exporting
train_df, test_df = (split.drop(columns=["len"]) for split in (train_df, test_df))

# Per-label counts in each set. Note that nunique() counts distinct texts
# per label, not rows.
print('Label Count in Train Set')
count_train = train_df.groupby('destination')['texts'].nunique().reset_index(name='count')
display(count_train)

print('Label Count in Test Set')
count_test = test_df.groupby('destination')['texts'].nunique().reset_index(name='count')
display(count_test)

# write both sets to CSV without the index column
train_df.to_csv('mari-train-undup-balanced-label.csv', index=False)
test_df.to_csv('mari-test-undup-balanced-label.csv', index=False)


Text length distribution in Train Set


Unnamed: 0,len
count,10734.0
mean,49.895565
std,30.702717
min,4.0
25%,30.0
50%,42.0
75%,61.0
max,440.0


Text length distribution in Test Set


Unnamed: 0,len
count,2695.0
mean,49.768831
std,29.710175
min,5.0
25%,30.0
50%,42.0
75%,61.0
max,310.0


Label Count in Train Set


Unnamed: 0,destination,count
0,billing and payment,4039
1,international dialing,357
2,internet,1650
3,lost and stolen,181
4,other queries,1876
5,promotions,2380
6,true money,195


Label Count in Test Set


Unnamed: 0,destination,count
0,billing and payment,1009
1,international dialing,95
2,internet,417
3,lost and stolen,50
4,other queries,475
5,promotions,588
6,true money,53
