In [None]:
# Download the three LST data files from Google Drive into the working directory.
# Per the recorded output the IDs resolve, in order, to
# LST_dev.txt, LST_train.txt and LST_test.txt.
!gdown 1mTfs7CvOba-kbnW1g3RPplEPNRrJR2Vy
!gdown 1hiY6w0k0nzN0gLuhDJbrVWEPhYNeE-kq
!gdown 17tDZEoW7NUikFid-pf5HLap3c_59BOrH

Downloading...
From: https://drive.google.com/uc?id=1mTfs7CvOba-kbnW1g3RPplEPNRrJR2Vy
To: /content/LST_dev.txt
100% 2.96M/2.96M [00:00<00:00, 185MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hiY6w0k0nzN0gLuhDJbrVWEPhYNeE-kq
To: /content/LST_train.txt
100% 32.4M/32.4M [00:00<00:00, 76.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=17tDZEoW7NUikFid-pf5HLap3c_59BOrH
To: /content/LST_test.txt
100% 2.45M/2.45M [00:00<00:00, 106MB/s]


In [None]:
# 200-unit embeddings
!gdown --id 14k1PLN9MVszCK6zUIRcLzWLwEGdR0ABw

Downloading...
From: https://drive.google.com/uc?id=14k1PLN9MVszCK6zUIRcLzWLwEGdR0ABw
To: /content/TNC_embeddings-200.bin
100% 50.6M/50.6M [00:00<00:00, 214MB/s]


In [None]:
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import Image, display_png
from gensim.models import word2vec, KeyedVectors
from keras.models import Sequential
from keras.layers import (Input, Embedding, Dense, Dropout, Flatten, GlobalAveragePooling1D, Conv1D, GlobalMaxPooling1D)
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras import layers

# Load the word embeddings and the data

In [None]:
!pip install pythainlp
from pythainlp.tokenize import word_tokenize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pythainlp
  Downloading pythainlp-3.0.8-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 4.6 MB/s 
Collecting tinydb>=3.0
  Downloading tinydb-4.7.0-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, pythainlp
Successfully installed pythainlp-3.0.8 tinydb-4.7.0


In [None]:
# Read the tab-separated train / dev / test files as UTF-8 DataFrames.
train, dev, test = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in ('LST_train.txt', 'LST_dev.txt', 'LST_test.txt')
)
def convert_df(df, tokenize=None):
    """Turn a raw split into a frame holding the class name and re-tokenized text.

    The `tokens` column is flattened back to plain text (every `|` is removed
    and every `_` becomes a space) and that text is then tokenized again.

    Args:
        df: raw frame with at least the `tokens`, `num` and `Unnamed: 0`
            columns. It is left unmodified.
        tokenize: callable mapping a string to a list of tokens. Defaults to
            pythainlp's `word_tokenize`.

    Returns:
        A new DataFrame without `tokens`, `num` and `Unnamed: 0`, with an added
        `token` column (list of tokens per row) and with the `lable` column,
        if present, renamed to `category`.
    """
    if tokenize is None:
        tokenize = word_tokenize

    text = (
        df['tokens']
        .str.replace('|', '', regex=False)
        .str.replace('_', ' ', regex=False)
    )

    # Work on the copy returned by drop() so the caller's frame is never touched.
    converted = df.drop(columns=['tokens', 'num', 'Unnamed: 0'])
    converted['token'] = text.apply(tokenize)
    # 'lable' (sic) is presumably the label column name in the raw files.
    return converted.rename(columns={'lable': 'category'})


In [None]:
# Replace each raw split with its converted form. The names are rebound, so
# re-running this cell on already-converted frames raises KeyError ('tokens').
train, dev, test = (convert_df(split) for split in (train, dev, test))

In [None]:
# Preview the converted training frame (a `category` and a `token` column).
train

Unnamed: 0,category,token
0,politics,"[สุรยุทธ์, ยัน, ปฏิเสธ, ลงนาม, , MOU, , กับ,..."
1,C&A,"[บุก, ยึด, ไม้, เถื่อน, อดีต, ส.ส., บุรีรัมย์,..."
2,C&A,"[ผกก., แจง, "", สุ, ไฮ, มี, "", ที่, ถูก, วิสามั..."
3,general,"[เฒ่า, วัย, , 72, , ร้อง, ถูก, รัฐ, ยึด, ที่..."
4,C&A,"[เมีย, สาวใหญ่, สุด, แค้น, ผัว, นอกใจ, มอมเหล้..."
...,...,...
3789,politics,"["", เพื่อ, ไทย, "", ชำแหละ, , 6, , เดือน, รัฐ..."
3790,politics,"[พันธมิตร, เล็ง, แจ้ง, จับกลุ่ม, เสื้อ, แดง, ..."
3791,politics,"[ส.ว., ผ่าน, ความ, เห็นชอบ, พ.ร.ก., กู้, , 4,..."
3792,politics,"[นายกฯ, บี้, ผบ., ตร., ลา, ลง, ใต้, ต่อ, นายกฯ..."


In [None]:
# Distinct class names occurring in the training split.
train['category'].unique()

array(['politics', 'C&A', 'general', 'economics', 'culture', 'STE',
       'international', 'environment', 'weather', 'health', 'disaster',
       'entertainment', 'development', 'sports', 'royal'], dtype=object)

# CNN

In [None]:
# Load the pre-trained word2vec vectors (binary format); undecodable bytes in
# the stored vocabulary are skipped.
w2v_model = KeyedVectors.load_word2vec_format('TNC_embeddings-200.bin',
                                              binary=True, unicode_errors='ignore')

# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `.vocab` so the cell keeps working on gensim 3.x as well.
if hasattr(w2v_model, 'key_to_index'):
    vocab_words = list(w2v_model.key_to_index)
else:
    vocab_words = list(w2v_model.vocab)

vocab_size = len(vocab_words)
vector_dim = w2v_model.vector_size

# Weight matrix for the embedding layer: one row per vocabulary word plus
# row 0, which stays all-zero and is used for padding / unknown tokens.
embedding_matrix = np.zeros((vocab_size + 1, vector_dim), dtype="float32")

# word -> row index, starting at 1 because 0 is reserved.
word_to_index = {}
for i, word in enumerate(vocab_words, start=1):
    word_to_index[word] = i
    embedding_matrix[i] = w2v_model[word]
word_to_index['PADDING'] = 0
    
# load data
def convert_words(df, word_to_index, max_length):
    """Encode the `token` column of `df` as a fixed-width matrix of word indices.

    Tokens missing from `word_to_index` are encoded as 0. Each row is then cut
    or zero-filled at its end so that it has exactly `max_length` entries.
    """
    index_rows = []
    for sentence in df['token']:
        index_rows.append([word_to_index.get(tok, 0) for tok in sentence])
    return pad_sequences(index_rows, maxlen=max_length, padding='post',
                         truncating='post', value=0)

# Maximum sequence length in tokens (chosen manually); convert_words truncates
# longer documents and zero-pads shorter ones to this width.
max_len = 500
train_x = convert_words(train, word_to_index, max_len)
dev_x = convert_words(dev, word_to_index, max_len)
test_x = convert_words(test, word_to_index, max_len)
# The data has 15 categories (the one-hot labels and the output layer are both
# 15 wide); the previous value of 3 did not match anything in this notebook.
num_classes = 15

def get_label(df):
    """Encode the `category` column as integer ids and one-hot vectors.

    Side effect: writes the integer ids into a `label` column of `df`.

    Args:
        df: frame with a `category` column of class names.

    Returns:
        The one-hot encoding of the ids, one row per row of `df` and one
        column per known category.

    Raises:
        ValueError: if `category` holds a value that has no id. `df` is left
            untouched in that case.
    """
    category_to_label = {'politics': 0, 'C&A': 1, 'general': 2, 'economics': 3, 'culture': 4, 'STE': 5,
       'international': 6, 'environment': 7, 'weather': 8, 'health': 9 , 'disaster': 10,
       'entertainment': 11 , 'development': 12 , 'sports': 13 , 'royal': 14}

    # map() yields NaN for anything outside the table, which lets us fail with
    # a clear message instead of passing stray strings on to to_categorical.
    label_series = df['category'].map(category_to_label)
    unmapped = label_series.isna()
    if unmapped.any():
        unknown = sorted(set(df.loc[unmapped, 'category'].astype(str)))
        raise ValueError(f"get_label: unknown categories {unknown}")

    label = label_series.to_numpy(dtype='int64')
    df['label'] = label
    # Width of the one-hot vectors follows the table, so the two cannot drift apart.
    return to_categorical(label, num_classes=len(category_to_label))

# One-hot label matrix for every split. get_label also writes an integer
# `label` column into the frame it receives.
train_y, dev_y, test_y = (get_label(split) for split in (train, dev, test))



In [None]:
# Training frame again: get_label has added the integer `label` column.
train

Unnamed: 0,category,token,label
0,politics,"[สุรยุทธ์, ยัน, ปฏิเสธ, ลงนาม, , MOU, , กับ,...",0
1,C&A,"[บุก, ยึด, ไม้, เถื่อน, อดีต, ส.ส., บุรีรัมย์,...",1
2,C&A,"[ผกก., แจง, "", สุ, ไฮ, มี, "", ที่, ถูก, วิสามั...",1
3,general,"[เฒ่า, วัย, , 72, , ร้อง, ถูก, รัฐ, ยึด, ที่...",2
4,C&A,"[เมีย, สาวใหญ่, สุด, แค้น, ผัว, นอกใจ, มอมเหล้...",1
...,...,...,...
3789,politics,"["", เพื่อ, ไทย, "", ชำแหละ, , 6, , เดือน, รัฐ...",0
3790,politics,"[พันธมิตร, เล็ง, แจ้ง, จับกลุ่ม, เสื้อ, แดง, ...",0
3791,politics,"[ส.ว., ผ่าน, ความ, เห็นชอบ, พ.ร.ก., กู้, , 4,...",0
3792,politics,"[นายกฯ, บี้, ผบ., ตร., ลา, ลง, ใต้, ต่อ, นายกฯ...",0


In [None]:
# Test frame with its `category`, `token` and integer `label` columns.
test

Unnamed: 0,category,token,label
0,disaster,"[จีน, -, อินเดีย, เสี่ยง, สูญเสีย, จาก, ภัยธรร...",10
1,disaster,"[เกิดเหตุ, พายุ, พัด, ถล่ม, จีน, , ทำให้, มี,...",10
2,disaster,"[เชื่อ, น, จีน, , 40, %, , เสี่ยง, ถล่ม, , ...",10
3,culture,"[ร่าง, ของ, หลวงปู่, ทิม, , เกจิ, ชื่อดัง, แห...",4
4,culture,"[ก., เกษตร, ฯ, , เตรียม, จัดงาน, ประกวด, โครง...",4
...,...,...,...
478,politics,"[ปทีป, -, จุมพล, , เบียด, กัน, สูสี, เก้าอี้,...",0
479,politics,"["", มาร์ค, "", เร่ง, กฤษฎีกา, , ตรวจแก้, ร่าง,...",0
480,politics,"[พัชร, วาท, ยอม, ให้การ, ปปช., , ปาก, สุดท้าย...",0
481,politics,"[มาร์ค, โยน, เทือก, ดู, ฎีกา, , ซัด, ไอ้, ตู่...",0


In [None]:
# Encoded training documents: one row of word indices per document, where 0
# stands for padding or a token that is not in the embedding vocabulary.
train_x

array([[11876,  3172,  1001, ...,     0,     0,     0],
       [ 2671,   936,   269, ...,     0,     0,     0],
       [ 7567,  6563,   409, ..., 50014,  6084,   121],
       ...,
       [ 9549,   230,    13, ...,    15,   947,    78],
       [36804,  8649,  9504, ..., 19131,     0,  1129],
       [ 2813,  2708,   732, ...,     0,     0,     0]], dtype=int32)

In [None]:
# One-hot training labels produced by get_label.
train_y

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Encoded dev documents (same layout as train_x).
dev_x

array([[ 5404, 35469,  2041, ...,     0,     0,    10],
       [12314,     0,   822, ...,  1804,     0,    14],
       [12314,     0,   822, ..., 13332,  5503,    29],
       ...,
       [15264,    56,   674, ...,     0,     0,     0],
       [  409, 11764,   409, ...,    12, 19131,     6],
       [ 3939,   477,  1120, ...,     0,     0,    36]], dtype=int32)

In [None]:
# Encoded test documents (same layout as train_x).
test_x

array([[  342,   100,  1497, ...,     0,     0,     0],
       [    0,  2963,  1853, ...,     0,     0,     0],
       [  318,  2659,   342, ...,     0,     0,     0],
       ...,
       [    0, 11533,   276, ...,    32,     0,    16],
       [ 4255,  2077,  5147, ...,    43,   567,     6],
       [  409,  5147,   409, ...,   297, 19418,     0]], dtype=int32)

In [None]:
def make_cnn_model(print_model=True):
    """Build and compile the 1-D convolutional text classifier.

    Layers: frozen pre-trained embedding -> Conv1D (250 filters, window of 3
    tokens, ReLU) -> global max pooling over the sequence -> Dense(250, ReLU)
    -> Dropout(0.4) -> Dense(15, softmax).

    Reads the module-level `vocab_size`, `vector_dim`, `max_len` and
    `embedding_matrix`.

    Args:
        print_model: when true, print the layer summary of the new model.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss, Adam
        with learning rate 0.005, accuracy metric).
    """
    filters = 250      # number of convolution filters
    kernel_size = 3    # each filter looks at a window of 3 tokens
    hidden_dims = 250  # units in the fully connected hidden layer

    cnn_model = Sequential()
    # Pre-trained vectors are loaded as-is and kept fixed during training.
    cnn_model.add(Embedding(input_dim=vocab_size+1,
                        input_length=max_len,
                        output_dim=vector_dim,
                        weights=[embedding_matrix],
                        #mask_zero=True,
                        trainable=False))

    # No input_shape here: it is only meaningful on the first layer of a
    # Sequential model, and the embedding above already fixes the input.
    cnn_model.add(Conv1D(filters,
                        kernel_size,
                        #padding='valid',
                        activation='relu',
                        strides=1))
    # Keeps the maximum of every filter over the whole sequence; unlike
    # MaxPooling1D there is no pool-size argument.
    cnn_model.add(GlobalMaxPooling1D())
    cnn_model.add(Dense(hidden_dims, activation='relu'))
    cnn_model.add(Dropout(0.4))
    # 15 output units, matching the 15-wide one-hot labels.
    cnn_model.add(Dense(15, activation='softmax'))

    opt = keras.optimizers.Adam(learning_rate=0.005)
    cnn_model.compile(loss="categorical_crossentropy", metrics=["accuracy"],optimizer=opt)

    if print_model:
        cnn_model.summary()
    return cnn_model

In [None]:
# Build and compile a fresh CNN classifier (architecture defined in make_cnn_model).
cnn_model = make_cnn_model()

In [None]:
# Train for 30 epochs with mini-batches of 128 documents, evaluating on the
# dev split after every epoch; the returned History object is kept in cnn_history.
# NOTE(review): no random seed is set in the visible cells, so the scores
# below will differ between runs.
cnn_history = cnn_model.fit(train_x, train_y, batch_size=128, epochs=30, validation_data=(dev_x, dev_y))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Predicted class id per dev document = position of the largest model output.
prediction = list(np.argmax(cnn_model.predict(dev_x), axis=1))
print(classification_report(y_true=dev['label'], y_pred=prediction))

              precision    recall  f1-score   support

           0       0.85      0.97      0.90       147
           1       0.72      0.79      0.75        70
           2       0.00      0.00      0.00         8
           3       0.77      0.73      0.75        60
           4       0.00      0.00      0.00         4
           5       0.41      0.44      0.43        27
           6       0.64      0.89      0.75        28
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         4
           9       0.62      0.80      0.70        10
          10       0.00      0.00      0.00         5
          11       0.85      0.87      0.86        53
          12       0.00      0.00      0.00         5
          13       0.97      0.86      0.91        43
          14       0.00      0.00      0.00         6

    accuracy                           0.78       474
   macro avg       0.39      0.42      0.40       474
weighted avg       0.72   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Predicted class id per test document = position of the largest model output.
prediction = list(np.argmax(cnn_model.predict(test_x), axis=1))
print(classification_report(y_true=test['label'], y_pred=prediction))

              precision    recall  f1-score   support

           0       0.73      0.97      0.83       148
           1       0.83      0.84      0.84        70
           2       0.00      0.00      0.00         8
           3       0.70      0.66      0.68        61
           4       0.00      0.00      0.00         5
           5       0.50      0.61      0.55        28
           6       0.71      0.73      0.72        30
           7       0.00      0.00      0.00         5
           8       0.67      0.50      0.57         4
           9       0.00      0.00      0.00        10
          10       0.00      0.00      0.00         6
          11       0.86      0.83      0.85        53
          12       1.00      0.17      0.29         6
          13       1.00      0.79      0.88        43
          14       1.00      0.33      0.50         6

    accuracy                           0.75       483
   macro avg       0.53      0.43      0.45       483
weighted avg       0.72   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Max-ent (maximum-entropy / logistic-regression classifier)

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
def featurize(text):
  """Return a binary bag-of-words dict for a sequence of tokens.

  Every distinct token becomes a key whose value is 1; repeated tokens
  collapse into a single entry.
  """
  return dict.fromkeys(text, 1)

# Turn every training document into a sparse binary bag-of-words vector; the
# vectorizer learns its feature space from the training split only.
vectorizer = DictVectorizer(sparse=True)
train_features = train['token'].apply(featurize)
feature_vectors = vectorizer.fit_transform(train_features)

# Train the model. The default cap of 100 iterations made the solver stop
# before converging (see the recorded ConvergenceWarning), so raise it;
# increase further if the warning still appears.
lr_text_classifier = LogisticRegression(max_iter=1000)
lr_text_classifier.fit(feature_vectors, train['category'])

# Evaluation on the dev split, reusing the feature space fitted on train.
dev_features = dev['token'].apply(featurize)
dev_feature_vectors = vectorizer.transform(dev_features)
dev_predictions = lr_text_classifier.predict(dev_feature_vectors)
# classification_report takes (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports support per predicted class.
print(classification_report(dev['category'], dev_predictions))

               precision    recall  f1-score   support

          C&A       0.73      0.84      0.78        61
          STE       0.56      0.68      0.61        22
      culture       0.00      0.00      0.00         0
  development       0.00      0.00      0.00         1
     disaster       0.00      0.00      0.00         0
    economics       0.87      0.80      0.83        65
entertainment       0.96      0.85      0.90        60
  environment       0.25      1.00      0.40         1
      general       0.00      0.00      0.00         0
       health       0.90      0.90      0.90        10
international       0.89      0.78      0.83        32
     politics       0.99      0.82      0.90       178
        royal       0.50      1.00      0.67         3
       sports       0.91      0.97      0.94        40
      weather       0.25      1.00      0.40         1

     accuracy                           0.83       474
    macro avg       0.52      0.64      0.54       474
 weighte

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Evaluation on the test split, reusing the feature space fitted on train.
test_features = test['token'].apply(featurize)
test_feature_vectors = vectorizer.transform(test_features)
test_predictions = lr_text_classifier.predict(test_feature_vectors)
print(classification_report(y_true=test['category'], y_pred=test_predictions))

               precision    recall  f1-score   support

          C&A       0.94      0.87      0.90        70
          STE       0.59      0.79      0.68        28
      culture       1.00      0.20      0.33         5
  development       1.00      0.33      0.50         6
     disaster       0.00      0.00      0.00         6
    economics       0.67      0.56      0.61        61
entertainment       0.92      0.91      0.91        53
  environment       0.00      0.00      0.00         5
      general       0.00      0.00      0.00         8
       health       0.80      0.40      0.53        10
international       0.79      0.77      0.78        30
     politics       0.73      0.99      0.84       148
        royal       0.00      0.00      0.00         6
       sports       1.00      0.88      0.94        43
      weather       1.00      0.50      0.67         4

     accuracy                           0.79       483
    macro avg       0.63      0.48      0.51       483
 weighte

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Number of test documents per category.
test.groupby('category')['category'].count()

category
C&A               70
STE               28
culture            5
development        6
disaster           6
economics         61
entertainment     53
environment        5
general            8
health            10
international     30
politics         148
royal              6
sports            43
weather            4
Name: category, dtype: int64