In [None]:
%%capture
!pip install gdown
!pip install pythainlp
!pip install emoji

In [None]:
!gdown --id 1I6FkY-wppSCt3eB1czmP0hHfcScwMc3s
!unzip 'sentiment-assignment.zip'
!rm 'sentiment-assignment.zip'

In [12]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pythainlp import word_tokenize
from pythainlp.ulmfit import process_thai
from pythainlp.tag.named_entity import ThaiNameTagger

In [49]:
def load_data(path):
  """Load a one-text-per-line file into a single-column DataFrame.

  The file is read line by line instead of through ``pd.read_csv(sep="\\n")``:
  current pandas versions reject a newline separator, and CSV parsing would
  also turn lines such as "NA" or "null" into NaN (which then breaks
  ``.lower()``) and give special meaning to quote characters.

  Parameters
  ----------
  path : str or path-like
      Location of a UTF-8 text file with one text per line.

  Returns
  -------
  pandas.DataFrame
      A frame with a single ``'texts'`` column holding every non-empty line,
      lower-cased, in file order, with a default integer index.
  """
  with open(path, encoding="utf-8") as handle:
    # Strip only the line terminator; inner and leading whitespace are kept.
    lines = [line.rstrip("\r\n") for line in handle]

  # Empty lines are dropped, as pandas.read_csv does by default.
  texts = [line.lower() for line in lines if line]

  return pd.DataFrame({'texts': texts})

In [50]:
# Paths of the four text files extracted from the assignment archive.
neg_file_path = os.path.join(os.getcwd(), 'sentiment-assignment/neg.txt')
neu_file_path = os.path.join(os.getcwd(), 'sentiment-assignment/neu.txt')
pos_file_path = os.path.join(os.getcwd(), 'sentiment-assignment/pos.txt')
q_file_path = os.path.join(os.getcwd(), 'sentiment-assignment/q.txt')

# Combine all files into one frame. pd.concat replaces DataFrame.append,
# which was deprecated and later removed from pandas.
# NOTE(review): only the 'texts' column is kept, so the information about which
# file (neg/neu/pos/q) a row came from is not retained - confirm this is intended.
df = pd.concat(
    [load_data(p) for p in (neg_file_path, neu_file_path, pos_file_path, q_file_path)],
    ignore_index=True,
)

# Shuffle the rows with a fixed seed so the order (and therefore any later
# seeded split) is reproducible across kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(df.shape)
df.head()

(28055, 1)


Unnamed: 0,texts
0,งืม
1,คิดว่าไม่เเพงนะเพราะกินตั้ง13คนคิดเเล้วก็คนประ...
2,ช้าง1..สิงห์1..ไฮเนเก้น1..😭😭😭😭😭
3,เบียร์ช้างดื่มแล้วพูดภาษาอังกฤษคล่องด้วยครับ
4,แพ้ผ้าอนามัย t t


# Preprocessing

In [51]:
# Hold out 15% of the rows as a test set (fixed seed) and renumber both parts from 0.
from sklearn.model_selection import train_test_split

df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.15, random_state=0)
)

# Row counts of the train and test partitions.
len(df_train), len(df_test)

(23846, 4209)

## tagging

In [55]:
def tag_name(text):
    """Tag `text` with the module-level `ner` tagger and split the result by field.

    Every item produced by ``ner.get_ner(text, pos=True)`` is indexed as
    (word, POS tag, NER tag); the three fields are collected into parallel lists.

    Returns
    -------
    tuple of (list, list, list)
        ``(words, pos_tags, ner_tags)``, each with one entry per tagged item.
    """
    triples = list(ner.get_ner(text, pos=True))
    tokens = [item[0] for item in triples]
    pos_labels = [item[1] for item in triples]
    ner_labels = [item[2] for item in triples]
    return tokens, pos_labels, ner_labels

def tag_df(df):
    """Apply `tag_name` to every entry of ``df['texts']`` and tabulate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'texts'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``'words'``, ``'pos'`` and ``'ner'``; each cell
        holds the corresponding list returned by `tag_name` for that row's text.
        Rows follow the order of ``df['texts']`` and get a default integer index.
    """
    # One [words, pos_tags, ner_tags] record per input text.
    records = [list(tag_name(entry)) for entry in df['texts']]
    return pd.DataFrame(data=records, columns=['words', 'pos', 'ner'])

In [56]:
# Tagger instance read as a global by tag_name().
ner = ThaiNameTagger()

print('shape before tag :', df_train.shape)
# df_train / df_test are replaced by their tagged versions, which no longer
# have a 'texts' column. Guarding on that column keeps this cell re-runnable:
# a second execution skips the already-tagged frames instead of raising KeyError.
if 'texts' in df_train.columns:
    df_train = tag_df(df_train)
if 'texts' in df_test.columns:
    df_test = tag_df(df_test)
print('shape after tag :', df_train.shape)

shape before tag : (23846, 1)
shape after tag : (23846, 3)


In [62]:
# Display 10 randomly sampled rows of df_train to inspect the tagging output.
df_train.sample(10)

Unnamed: 0,words,pos,ner
772,"[ตะ, เตือน, ไต]","[NN, VV, NN]","[O, O, O]"
14300,"[คุณ, ลูกค้า, สามารถ, เข้าไป, อ่าน, รายละเอียด...","[NN, NN, AX, VV, VV, NN, NN, AV, VV, PS, PU, N...","[B-PERSON, I-PERSON, O, O, O, O, O, O, O, O, O..."
2655,"[25, นะ]","[NU, PA]","[B-ORGANIZATION, I-ORGANIZATION]"
17235,"[มา, ทาน, ที่, สาขา, เซ, นท, รัล, รัตนาธิเบศร์...","[AV, VV, PS, NN, NN, NN, NN, NN, PU, AX, NN, N...","[O, O, O, B-LOCATION, I-LOCATION, I-LOCATION, ..."
10069,"[ใช้, นาวา, รา, , มา, , 4, , ปี, , ไม, วิ่...","[VV, NN, NN, PU, AV, PU, NU, PU, CL, PU, NN, V...","[O, O, O, O, O, O, B-TIME, I-TIME, I-TIME, O, ..."
1685,"[รถยนต์, โตโยต้า, ไฮลักซ์, รุ่น, ที่, หาย, ไป,...","[NN, NN, NN, NN, CC, VV, AV, VV, PU, NN, PU, N...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5695,"[สงสาร, นาง, นะ, , เจอ, บุหรี่, ไฟฟ้า, ก็, ยึ...","[VV, NN, PA, PU, VV, NN, NN, CC, VV, AV, VV, P...","[O, B-PERSON, I-PERSON, I-PERSON, I-PERSON, I-..."
5821,"[กว่า, จะ, ร็อก, เท่า, วันนี้, , ใน, ทุกวัน, ...","[CC, AX, VV, VV, NN, PU, PS, NN, NN, PR, CC, V...","[O, O, O, O, B-DATE, O, O, O, O, O, O, O, O, O..."
15738,"[อยุธยา, ไม่, เห็น, มี]","[NN, NG, VV, VV]","[B-LOCATION, O, O, O]"
5242,"[ใคร, จะ, ใส่, ชุด, ไทย, ไป, กิน, , ค่า, ชุด,...","[PR, AX, VV, NN, NN, AV, VV, PU, NN, NN, VV, A...","[O, O, O, O, B-LOCATION, O, O, O, O, O, O, O, ..."


## padding

In [None]:
from keras.preprocessing import sequence