In [1]:
%run data_read.py

Data path: C:\Users\POP PC\Documents\GitHub\utcc_independent_study\training\..\data\300_data_pop.xlsx
data_size: 318
variable: data train_data test_data


In [2]:
data, vocab2index = preprocess_data(data)
data.head()

num_words before: 1322
num_words after: 641


Unnamed: 0,comment,score_norm,encoded
0,ตัวหูฟังมีรอยแตกทั้ง2ข้าง ตอนนี้ยังไม่มีผลต่อก...,0.444444,"[2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 8,..."
1,ซื้อสินค้าไปดูไม่ค่อยแข็งแรง ใช้ไม่ได้ด้วยค่ะ ...,1.0,"[22, 23, 1, 24, 25, 8, 26, 27, 28, 8, 1, 8, 29..."
2,สินค้าใช้ไม่ได้ ร้านค้าแจ้งไม่ต้องกดในระบบ พอเ...,0.888889,"[23, 26, 8, 47, 48, 49, 50, 51, 52, 8, 53, 35,..."
3,ของพึ่งได้มาเมื่อวานใช้ตอนเช้าฟังได้แค่ข้างเดี...,0.666667,"[58, 59, 60, 61, 62, 63, 64, 20, 65, 66, 67, 5..."
4,"ประสิทธิภาพ: ใช้พูดคุยไม่ได้เสียงเบามาก ,คนฟัง...",0.777778,"[68, 69, 8, 62, 1, 11, 70, 71, 72, 73, 8, 74, ..."


In [3]:
X = list(data['encoded'])
Y = list(data['score_norm'])
batch_size = 16
train_dl, val_dl, vocab_size, embedding_dim, hidden_dim = create_datasets(X, Y, vocab2index, batch_size)

In [4]:
def train_model_regr(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        # print('epochs:', i, end=' ')
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss = validation_metrics_regr(model, val_dl)
        if i % 5 == 1:
            print("train mse %.4f val rmse %.4f" % (sum_loss/total, val_loss))

def validation_metrics_regr (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.float()
        y_hat = model(x, l)
        loss = np.sqrt(F.mse_loss(y_hat, y.unsqueeze(-1)).item())
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total

In [5]:
class LSTM_regr(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

class RNN_regr(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        rnn_out, ht = self.rnn(x)
        return self.linear(ht[-1])

class CNN_regr(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, fs)
            for fs in filter_sizes
        ])
        self.linear1 = nn.Linear(num_filters * len(filter_sizes), hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = x.permute(0, 2, 1) # switch to (batch_size, embedding_dim, sequence_length)
        conv_outputs = []
        for conv in self.convs:
            conv_output = conv(x)
            conv_output = F.relu(conv_output)
            max_pool_output = F.max_pool1d(conv_output, conv_output.size()[2])
            conv_outputs.append(max_pool_output.squeeze(-1))
        x = torch.cat(conv_outputs, dim=1)
        x = self.dropout(x)
        x = self.linear1(x)
        x = F.relu(x)
        return self.linear2(x)
    
class DNN_regr(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        x = torch.mean(x, dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        return self.fc3(x)


In [6]:
lstm_model = LSTM_regr(vocab_size, embedding_dim, hidden_dim)
train_model_regr(lstm_model, epochs=20, lr=5e-5)

train mse 0.3014 val rmse 0.5358
train mse 0.1090 val rmse 0.2552
train mse 0.0775 val rmse 0.2419
train mse 0.0769 val rmse 0.2415


In [7]:
rnn_model = RNN_regr(vocab_size, embedding_dim, hidden_dim)
train_model_regr(rnn_model, epochs=20, lr=5e-5)

train mse 0.2945 val rmse 0.4883
train mse 0.0773 val rmse 0.2425
train mse 0.0782 val rmse 0.2420
train mse 0.0769 val rmse 0.2420


In [10]:
num_filters = 100
filter_sizes = [3, 4, 5]
cnn_model = CNN_regr(vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim)
train_model_regr(cnn_model, epochs=20, lr=5e-5)

train mse 0.0797 val rmse 0.1827
train mse 0.0439 val rmse 0.1419
train mse 0.0342 val rmse 0.1391
train mse 0.0238 val rmse 0.1432


In [11]:
dnn_model = DNN_regr(vocab_size, embedding_dim, hidden_dim)
train_model_regr(dnn_model, epochs=20, lr=5e-5)

train mse 0.4187 val rmse 0.6414
train mse 0.2830 val rmse 0.5157
train mse 0.1121 val rmse 0.3058
train mse 0.0527 val rmse 0.2135


In [None]:
model = torch.load(checkpoint_path)
model.eval()

# Read the test excerpts
test_data = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
print(test_data.head())

# Apply the same encoding as the train texts
test_data['encoded'] = test_data['excerpt'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
idx, excerpts_test = test_data['id'], test_data['encoded']

X_test = [excerpts_test[i][0] for i in range(len(test_data))]
l_test = [excerpts_test[i][1] for i in range(len(test_data))]
X_test = torch.LongTensor(X_test)