# Process `wisesight-sentiment` for [Huggingface Datasets](https://github.com/huggingface/datasets)

This notebook processes the [`wisesight-sentiment`](https://github.com/PyThaiNLP/wisesight-sentiment) dataset, which was provided by **Wisesight (Thailand) Co., Ltd.** The original training set contains 24,063 texts in 4 categories (`q`uestion, `neg`ative, `neu`tral, and `pos`itive), and the test set contains 2,674 texts. We perform a uniformly random 90/10 train-validation split of the original training set.

In [18]:
import re
import pandas as pd
from pathlib import Path
from pythainlp.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

In [19]:
# Directory holding the raw input files read below (train.txt, train_label.txt,
# test.txt, test_label.txt); the path is relative to the notebook's working directory.
data_folder = Path("kaggle-competition/")

In [20]:
# Build the training DataFrame from train.txt (one text per line) and
# train_label.txt (one label per line, aligned with train.txt by line number).
# The texts are Thai, so the files are opened explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(data_folder / "train.txt", encoding="utf-8") as f:
    texts = [line.strip() for line in f]

with open(data_folder / "train_label.txt", encoding="utf-8") as f:
    labels = [line.strip() for line in f]

# pd.DataFrame raises ValueError if the two lists differ in length,
# which guards against misaligned text/label files.
df = pd.DataFrame({ "category": labels, "texts": texts })
# Drop the temporary lists so they do not linger in the kernel namespace.
del texts
del labels
df.shape

(24063, 2)

In [33]:
# Build the test DataFrame from test.txt (one text per line) and
# test_label.txt (one label per line, aligned with test.txt by line number).
# The texts are Thai, so the files are opened explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(data_folder / "test.txt", encoding="utf-8") as f:
    texts = [line.strip() for line in f]

with open(data_folder / "test_label.txt", encoding="utf-8") as f:
    labels = [line.strip() for line in f]

# pd.DataFrame raises ValueError if the two lists differ in length,
# which guards against misaligned text/label files.
test_df = pd.DataFrame({ "category": labels, "texts": texts })
# Drop the temporary lists so they do not linger in the kernel namespace.
del texts
del labels
test_df.shape

(2674, 2)

In [34]:
# Remove rows whose text is exactly '#ERROR!' from both the training and test
# frames, then renumber the index from 0.
df = df.loc[df["texts"].ne('#ERROR!')].reset_index(drop=True)
test_df = test_df.loc[test_df["texts"].ne('#ERROR!')].reset_index(drop=True)
(df.shape, test_df.shape)

((24032, 2), (2671, 2))

In [35]:
# Hold out 10% of the filtered training frame as a validation set.
# The fixed random_state makes the split repeatable across runs.
train_df, valid_df = train_test_split(
    df,
    test_size=0.1,
    random_state=1412,
)
(train_df.shape, valid_df.shape)

((21628, 2), (2404, 2))

In [36]:
# Per-column summary of the training split: row count, number of distinct
# values, most frequent value and how often it occurs.
train_df.describe()

Unnamed: 0,category,texts
count,21628,21628
unique,4,21612
top,neu,พรบ.คู่ชีวิตไทยนี่ถึงไหนแล้วคะ ไต้หวันไปไกลแล้...
freq,11795,2


In [37]:
# Share of each category in the training split (counts divided by row count).
train_df["category"].value_counts() / len(train_df)

neu    0.545358
neg    0.253884
pos    0.178750
q      0.022009
Name: category, dtype: float64

In [38]:
# Per-column summary of the validation split: row count, number of distinct
# values, most frequent value and how often it occurs.
valid_df.describe()

Unnamed: 0,category,texts
count,2404,2404
unique,4,2403
top,neu,สวัสดีค่ะ เราขอสอบถามเรื่องการขึ้นรถไฟฟ้าTHSR ...
freq,1291,2


In [42]:
# Share of each category in the validation split (counts divided by row count).
valid_df["category"].value_counts() / len(valid_df)

neu    0.537022
neg    0.264975
pos    0.180532
q      0.017471
Name: category, dtype: float64

In [44]:
# Per-column summary of the test split: row count, number of distinct
# values, most frequent value and how often it occurs.
test_df.describe()

Unnamed: 0,category,texts
count,2671,2671
unique,4,2671
top,neu,น่าสนนน
freq,1453,1


In [40]:
# Share of each category in the test split (counts divided by row count).
test_df["category"].value_counts() / len(test_df)

neu    0.543991
neg    0.255709
pos    0.178959
q      0.021340
Name: category, dtype: float64

In [53]:
# Save each split as JSON Lines (one JSON record per line).
# DataFrame.to_json does not create missing parent directories, so make sure
# the output folder exists before writing.
output_dir = Path('huggingface')
output_dir.mkdir(parents=True, exist_ok=True)
train_df.to_json(output_dir / 'train.json', orient='records', lines=True)
valid_df.to_json(output_dir / 'valid.json', orient='records', lines=True)
test_df.to_json(output_dir / 'test.json', orient='records', lines=True)