In [13]:
# Execute data_read.py; %run copies the names the script defines into the notebook namespace
# (the script's own printed output lists `data`, `train_data` and `test_data`).
%run data_read.py

Data path: C:\Users\Pop\Documents\GitHub\utcc_independent_study\training\..\data\300_data_pop.xlsx
data_size: 318
variable: data train_data test_data


Unnamed: 0,comment,score
281,พลังเสียงชัดเจน โอเคเลย,3
236,คุณภาพ: ดีให้ฟิล์มไม่ตรง สั่ง14 happy birthday...,5
245,คุณภาพ: ภาพรวมดี แข็งแรงขาตั้งแข็งแรง ปรับระดั...,6
90,เชื่อมต่อกับคอมไม่ได้ ทางร้านไม่แก้ปัญหาให้เลย,6
51,พึ่งได้หยิบมาใช้งาน สังเกตุว่ามีรอยร้าว แต่ไม่...,7
...,...,...
87,กล่องเหมือนโดนแกะแล้ว สตกเผยอๆนิดหน่อย แต่ของข...,7
184,สินค้าใช้การไม่ได้,3
168,สินค้ามีคุณภาพ คุณภาพดี การจัดส่งค่อนข้างรวดเร...,6
15,ก็ตามราคาแหละ แต่เคยซื้อยี่ห้อนี้ มันเคยดีกว่านี้,1


In [None]:
# Preprocess the loaded frame; the helper returns the processed frame plus `vocab2index`
# (presumably a token -> integer id mapping; verify against preprocess_data).
# NOTE(review): `data` is rebound here, so re-running this cell feeds already-processed
# data back into preprocess_data -- confirm that is safe or use a new name.
data, vocab2index = preprocess_data(data)
# Keep only the columns used below ('encoded' is presumably added by preprocess_data -- verify).
data = data[['comment', 'score', 'encoded']]

# Model inputs and regression targets as plain Python lists.
X = list(data['encoded'])
Y = list(data['score'])
batch_size = 16  # batch size handed to create_datasets

# Build the train/validation batch iterables; create_datasets also returns the model size settings.
train_dl, val_dl, vocab_size, embedding_dim, hidden_dim = create_datasets(X, Y, vocab2index, batch_size)

# Preview the first rows of the processed frame.
data.head()

In [5]:
def train_model_regr(model, epochs=10, lr=0.001, train_loader=None, valid_loader=None):
    """Train a regression model with Adam on the mean-absolute-error (L1) loss.

    Args:
        model: Module called as ``model(x, l)`` whose output is compared against
            ``y.unsqueeze(-1)``.
        epochs: Number of full passes over the training batches.
        lr: Learning rate passed to ``torch.optim.Adam``.
        train_loader: Optional iterable yielding ``(x, y, l)`` batches. When
            omitted, the notebook-level ``train_dl`` is used (original behaviour).
        valid_loader: Optional iterable yielding ``(x, y, l)`` batches for
            validation. When omitted, the notebook-level ``val_dl`` is used.

    Returns:
        None. The sample-weighted train and validation MAE are printed on the
        epochs where ``i % 5 == 1`` (i.e. epoch indices 1, 6, 11, ...).
    """
    # Only parameters that require gradients are handed to the optimizer.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        # validation_metrics_regr switches the model to eval mode, so training
        # mode has to be restored at the start of every epoch.
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in (train_dl if train_loader is None else train_loader):
            x = x.long()
            y = y.float()
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = F.l1_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            # l1_loss is a batch mean; weight it by batch size so the epoch
            # figure is a per-sample average even with a short last batch.
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics_regr(
            model, val_dl if valid_loader is None else valid_loader
        )
        if i % 5 == 1:
            print("train mae %.4f val mae %.4f" % (sum_loss/total, val_loss))
            
def validation_metrics_regr(model, valid_dl):
    """Return the sample-weighted mean absolute error of ``model`` over ``valid_dl``.

    Args:
        model: Module called as ``model(x, l)``; it is switched to eval mode and
            left in eval mode when this function returns.
        valid_dl: Iterable yielding ``(x, y, l)`` batches.

    Returns:
        float: Sum of per-batch L1 losses weighted by batch size, divided by the
        total number of samples.

    Raises:
        ZeroDivisionError: If ``valid_dl`` yields no samples.
    """
    model.eval()
    total = 0
    sum_loss = 0.0
    # Evaluation needs no gradients; disabling autograd avoids building a graph
    # for every validation batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.float()
            y_hat = model(x, l)
            loss = F.l1_loss(y_hat, y.unsqueeze(-1))
            total += y.shape[0]
            sum_loss += loss.item() * y.shape[0]
    return sum_loss / total


In [6]:
class LSTM_regr(torch.nn.Module):
    """LSTM regressor: embedding -> dropout -> LSTM -> linear head on the last hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size (input size of the linear head).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is the padding id: its embedding row is zero-initialised and
        # receives no gradient updates.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x, l):
        """Return one prediction per row of ``x``.

        Args:
            x: Integer index tensor, batch dimension first (``batch_first=True``).
            l: Accepted for interface parity with the training loop; not used
                by this model.

        Returns:
            Tensor whose last dimension has size 1 (output of the linear head).
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (ht, _) = self.lstm(x)
        # ht[-1] selects the hidden state of the last (here: only) LSTM layer.
        return self.linear(ht[-1])

class RNN_regr(torch.nn.Module):
    """Plain-RNN regressor: embedding -> dropout -> RNN -> linear head on the final hidden state."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is reserved as the padding id.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_dim, out_features=1)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, x, l):
        """Map a batch of index sequences ``x`` to one value each; ``l`` is accepted but unused."""
        embedded = self.dropout(self.embeddings(x))
        # The per-step outputs are not needed, only the final hidden state.
        _, final_hidden = self.rnn(embedded)
        last_layer_state = final_hidden[-1]
        return self.linear(last_layer_state)

class CNN_regr(torch.nn.Module):
    """Convolutional regressor.

    Parallel 1-D convolutions (one per entry of ``filter_sizes``) run over the
    embedded sequence; each is ReLU-activated and max-pooled over its full
    length, the pooled features are concatenated, and a two-layer head produces
    one output value.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        conv_branches = [
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=width)
            for width in filter_sizes
        ]
        self.convs = nn.ModuleList(conv_branches)
        # Every branch contributes num_filters pooled features.
        self.linear1 = nn.Linear(len(filter_sizes) * num_filters, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, x, l):
        """Return one prediction per row of ``x``; ``l`` is accepted but unused."""
        # Conv1d wants channels first: (batch_size, embedding_dim, sequence_length).
        channels_first = self.embeddings(x).transpose(1, 2)
        activated_maps = (F.relu(branch(channels_first)) for branch in self.convs)
        # Pool each feature map over its whole length, then drop the length-1 axis.
        pooled = [
            F.max_pool1d(fmap, fmap.shape[2]).squeeze(-1)
            for fmap in activated_maps
        ]
        features = self.dropout(torch.cat(pooled, dim=1))
        hidden = F.relu(self.linear1(features))
        return self.linear2(hidden)
    
class DNN_regr(nn.Module):
    """Feed-forward regressor over the mean of the (dropped-out) token embeddings.

    Pipeline: embedding -> dropout -> mean over dim 1 -> two ReLU hidden layers,
    each followed by dropout -> linear output of size 1.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=1)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, x, l):
        """Return one prediction per row of ``x``; ``l`` is accepted but unused."""
        # Average over dim 1 after dropout; every position along that axis is included.
        hidden = self.dropout(self.embeddings(x)).mean(dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.fc3(hidden)


In [7]:
# Build the LSTM regressor from the size settings and train it for 20 epochs at lr=5e-5.
lstm_model = LSTM_regr(vocab_size, embedding_dim, hidden_dim)
train_model_regr(lstm_model, epochs=20, lr=5e-5)

train mae 4.9043 val mae 5.0868
train mae 4.2082 val mae 2.9374
train mae 2.0928 val mae 1.8208
train mae 2.0956 val mae 1.8255


In [8]:
# Build the plain-RNN regressor and train it with the same schedule (20 epochs, lr=5e-5).
rnn_model = RNN_regr(vocab_size, embedding_dim, hidden_dim)
train_model_regr(rnn_model, epochs=20, lr=5e-5)

train mae 4.7792 val mae 4.8716
train mae 2.3496 val mae 2.1875
train mae 2.2489 val mae 2.1324
train mae 2.2302 val mae 1.9782


In [9]:
# CNN settings: 100 filters for each of the kernel widths 3, 4 and 5.
num_filters = 100
filter_sizes = [3, 4, 5]
# Build the CNN regressor and train it for 20 epochs at lr=5e-5.
cnn_model = CNN_regr(vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim)
train_model_regr(cnn_model, epochs=20, lr=5e-5)

train mae 4.0723 val mae 3.6759
train mae 1.2372 val mae 1.0450
train mae 1.0450 val mae 0.9666
train mae 0.8474 val mae 0.9335


In [10]:
# Build the feed-forward (mean-of-embeddings) regressor and train it for 20 epochs at lr=5e-5.
dnn_model = DNN_regr(vocab_size, embedding_dim, hidden_dim)
train_model_regr(dnn_model, epochs=20, lr=5e-5)

train mae 4.8941 val mae 5.0823
train mae 4.7682 val mae 4.9340
train mae 4.4411 val mae 4.5771
train mae 3.6692 val mae 3.6969


In [None]:
# NOTE(review): `checkpoint_path` is not assigned in any cell visible in this notebook -- confirm where it is defined.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model = torch.load(checkpoint_path)
model.eval()

# Read the test excerpts
# NOTE(review): this rebinds `test_data` to a CommonLit CSV read through its 'excerpt'/'id' columns,
# whereas the data shown at the top of the notebook has 'comment'/'score' columns -- this cell
# looks like a leftover from another notebook; confirm before running.
test_data = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
print(test_data.head())

# Apply the same encoding as the train texts
test_data['encoded'] = test_data['excerpt'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
idx, excerpts_test = test_data['id'], test_data['encoded']

# Each encoded entry is split by position: [0] goes to X_test, [1] to l_test
# (presumably token ids and sequence length -- verify against encode_sentence).
X_test = [excerpts_test[i][0] for i in range(len(test_data))]
l_test = [excerpts_test[i][1] for i in range(len(test_data))]
X_test = torch.LongTensor(X_test)