In [1]:
%run data_read.py

Data path: C:\Users\Pop\Documents\GitHub\utcc_independent_study\training\..\data\300_data_pop.xlsx
data_size: 318
variable: data train_data test_data


In [2]:
import numpy as np
import pandas as pd
import re
from collections import Counter
import string
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

def thai_tokenizer(text):
    text = normalize(text)
    tokens = word_tokenize(text, engine='newmm')
    return tokens

counts = Counter()
for text in list(data['comment']):
    counts.update(thai_tokenizer(text))

print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

# Creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
    
def encode_sentence(text, vocab2index, N=200):
    tokenized = thai_tokenizer(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded

data['encoded'] = data['comment'].apply(lambda x: np.array(encode_sentence(x, vocab2index)))
data.head()

  from .autonotebook import tqdm as notebook_tqdm


num_words before: 1326
num_words after: 645


Unnamed: 0,comment,score_norm,encoded
0,ตัวหูฟังมีรอยแตกทั้ง2ข้าง ตอนนี้ยังไม่มีผลต่อก...,0.444444,"[2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 8,..."
1,ซื้อสินค้าไปดูไม่ค่อยแข็งแรง ใช้ไม่ได้ด้วยค่ะ ...,1.0,"[22, 23, 1, 24, 25, 8, 26, 27, 28, 8, 1, 8, 29..."
2,สินค้าใช้ไม่ได้ ร้านค้าแจ้งไม่ต้องกดในระบบ พอเ...,0.888889,"[23, 26, 8, 47, 48, 49, 50, 51, 52, 8, 53, 35,..."
3,ของพึ่งได้มาเมื่อวานใช้ตอนเช้าฟังได้แค่ข้างเดี...,0.666667,"[58, 59, 60, 61, 62, 63, 64, 20, 65, 66, 67, 5..."
4,"ประสิทธิภาพ: ใช้พูดคุยไม่ได้เสียงเบามาก ,คนฟัง...",0.777778,"[68, 69, 8, 62, 1, 11, 70, 71, 72, 73, 8, 74, ..."


In [3]:
class CommonLitReadabiltyDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx].astype(np.int32)), self.y[idx], self.X[idx][1] 

X = list(data['encoded'])
y = list(data['score_norm'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

train_ds = CommonLitReadabiltyDataset(X_train, y_train)
valid_ds = CommonLitReadabiltyDataset(X_valid, y_valid)

In [16]:
def train_model_regr(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        # print('epochs:', i, end=' ')
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics_regr(model, val_dl)
        if i % 5 == 1:
            print("train mse %.4f val rmse %.4f" % (sum_loss/total, val_loss))

def validation_metrics_regr (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.float()
        y_hat = model(x, l)
        loss = np.sqrt(F.mse_loss(y_hat, y.unsqueeze(-1)).item())
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total

batch_size = 16
vocab_size = len(words)
embedding_dim = 300
hidden_dim = 200
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [17]:
class LSTM_regr(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [18]:
model =  LSTM_regr(vocab_size, embedding_dim, hidden_dim)

In [19]:
train_model_regr(model, epochs=20, lr=5e-5)

train mse 0.2794 val rmse 0.5146
train mse 0.1055 val rmse 0.2553
train mse 0.0771 val rmse 0.2421
train mse 0.0770 val rmse 0.2428


In [None]:
model = torch.load(checkpoint_path)
model.eval()

# Read the test excerpts
test_data = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
print(test_data.head())

# Apply the same encoding as the train texts
test_data['encoded'] = test_data['excerpt'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
idx, excerpts_test = test_data['id'], test_data['encoded']

X_test = [excerpts_test[i][0] for i in range(len(test_data))]
l_test = [excerpts_test[i][1] for i in range(len(test_data))]
X_test = torch.LongTensor(X_test)