In [1]:
# PyTorch imports
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torch import nn
torch.manual_seed(123)

# NLTK imports
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.nist_score import sentence_nist

# sklearn imports
import sklearn
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

# others
import optuna
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### Build Custom Dataset

In [2]:
class MTCheckDataset(Dataset):
    
    def __init__(self, data_path):
        self.data_path = data_path
        self.references, self.candidates, self.scores, self.labels = self.import_data(self.data_path)
        self.samples = self.generate_features(self.references, self.candidates)

    def __getitem__(self, idx):
        sample = torch.tensor(self.samples.loc[idx]).float()
        label = self.labels[idx]
        return (sample, label)
        
    def __len__(self):
        return len(self.samples)
    
    def import_data(self, file):
        # English references
        references = []
        # Translated English sentences
        candidates = []
        # Bleu scores for translations
        scores = []
        # Labels indicating human (H-->0) or machine translation (M-->1)
        labels = []
        with open(file) as f:
            lines = f.readlines()
            for i, line in enumerate(lines):
                line = line.strip('\n')
                if i % 6 == 1:
                    references.append(line)
                elif i % 6 == 2:
                    candidates.append(line)
                elif i % 6 == 3:
                    scores.append(float(line))
                elif i % 6 == 4:
                    if line == 'H':
                        labels.append(0)
                    else:
                        labels.append(1)
        return references, candidates, scores, labels
    
    def generate_features(self, references, candidates):
        # define empty arrays for metrics
        blue_1_ind, blue_2_ind, blue_3_ind, blue_4_ind, blue_2_cumu, blue_3_cumu, blue_4_cumu = [], [], [], [], [], [], []
        gleu_default, nist_defualt = [], []

        # compute 9 different metrics
        for ref, cand in zip(references, candidates):
            ref_split = ref.split(' ')
            ref_input = [ref_split]
            cand_input = cand.split(' ')
            # individual bleu scores
            blue_1_ind.append(sentence_bleu(ref_input, cand_input, weights=(1, 0, 0, 0)))
            blue_2_ind.append(sentence_bleu(ref_input, cand_input, weights=(0, 1, 0, 0)))
            blue_3_ind.append(sentence_bleu(ref_input, cand_input, weights=(0, 0, 1, 0)))
            blue_4_ind.append(sentence_bleu(ref_input, cand_input, weights=(0, 0, 0, 1)))
            # cumulative bleu scores
            blue_2_cumu.append(sentence_bleu(ref_input, cand_input, weights=(0.5, 0.5, 0, 0)))
            blue_3_cumu.append(sentence_bleu(ref_input, cand_input, weights=(0.33, 0.33, 0.33, 0)))
            blue_4_cumu.append(sentence_bleu(ref_input, cand_input, weights=(0.25, 0.25, 0.25, 0.25)))
            # other scores
            gleu_default.append(sentence_gleu(ref_input, cand_input))
            nist_defualt.append(sentence_nist(ref_input, cand_input))
            #ribes_defualt.append(sentence_ribes(ref_input, cand_input))

        # pass metrics to dataframe and return
        feature_dict = {"blue_1_ind": blue_1_ind, "blue_2_ind": blue_2_ind, "blue_3_ind": blue_3_ind,
                        "blue_4_ind": blue_4_ind, "blue_2_cumu": blue_2_cumu, "blue_3_cumu": blue_3_cumu,
                        "blue_4_cumu": blue_4_cumu, "gleu_default": gleu_default, "nist_defualt": nist_defualt}
        features = pd.DataFrame(feature_dict)
        return features

In [3]:
# define train and test data
train_data = MTCheckDataset(data_path="train.txt")
test_data = MTCheckDataset(data_path="test.txt")

In [4]:
# define train and test dataloaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=8, shuffle=True)

### SVM Classifier

In [63]:
def svm_optimize(trial):
    svc_c = trial.suggest_categorical('svc_c', [0.1, 1.0, 10])
    svc_kernel = trial.suggest_categorical('svc_kernel', ['poly', 'rbf', 'sigmoid'])
    classifier_obj = sklearn.svm.SVC(C=svc_c, kernel=svc_kernel, gamma='auto')
    classifier_obj.fit(train_data.samples, train_data.labels)
    predictions = classifier_obj.predict(test_data.samples)
    f1 = f1_score(test_data.labels, predictions)
    return f1
    
study = optuna.create_study(direction='maximize', study_name="SVM")
study.optimize(svm_optimize, n_trials=10)

[32m[I 2021-02-05 19:29:51,367][0m A new study created in memory with name: SVM[0m
[32m[I 2021-02-05 19:29:51,394][0m Trial 0 finished with value: 0.7500000000000001 and parameters: {'svc_c': 1.0, 'svc_kernel': 'rbf'}. Best is trial 0 with value: 0.7500000000000001.[0m
[32m[I 2021-02-05 19:29:51,427][0m Trial 1 finished with value: 0.15384615384615385 and parameters: {'svc_c': 0.1, 'svc_kernel': 'sigmoid'}. Best is trial 0 with value: 0.7500000000000001.[0m
[32m[I 2021-02-05 19:29:51,453][0m Trial 2 finished with value: 0.7291666666666666 and parameters: {'svc_c': 1.0, 'svc_kernel': 'poly'}. Best is trial 0 with value: 0.7500000000000001.[0m
[32m[I 2021-02-05 19:29:51,491][0m Trial 3 finished with value: 0.15384615384615385 and parameters: {'svc_c': 0.1, 'svc_kernel': 'sigmoid'}. Best is trial 0 with value: 0.7500000000000001.[0m
[32m[I 2021-02-05 19:29:51,531][0m Trial 4 finished with value: 0.7303370786516854 and parameters: {'svc_c': 0.1, 'svc_kernel': 'rbf'}. Best 

In [64]:
df = study.trials_dataframe()
df.to_csv('svm.csv', index=False)
df.head(5)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_svc_c,params_svc_kernel,state
0,0,0.75,2021-02-05 19:29:51.369699,2021-02-05 19:29:51.394366,00:00:00.024667,1.0,rbf,COMPLETE
1,1,0.153846,2021-02-05 19:29:51.395310,2021-02-05 19:29:51.427685,00:00:00.032375,0.1,sigmoid,COMPLETE
2,2,0.729167,2021-02-05 19:29:51.428618,2021-02-05 19:29:51.453020,00:00:00.024402,1.0,poly,COMPLETE
3,3,0.153846,2021-02-05 19:29:51.454012,2021-02-05 19:29:51.491612,00:00:00.037600,0.1,sigmoid,COMPLETE
4,4,0.730337,2021-02-05 19:29:51.494283,2021-02-05 19:29:51.530801,00:00:00.036518,0.1,rbf,COMPLETE


### Feed-Forward Neural Network

In [57]:
class FFNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, out_dim, activation, dropout_val):
        super().__init__()
        # params
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.droupout_val = dropout_val

        # layers
        self.hidden = nn.Linear(self.input_dim, self.hidden_dim)
        self.out = nn.Linear(self.hidden_dim, self.out_dim)
        if activation == "tanh":
            self.activation = F.tanh
        elif activation == "relu":
            self.activation = F.relu
        elif activation == "sigmoid":
            self.activation = F.sigmoid
        self.droput = nn.Dropout(self.droupout_val)
    
    def forward(self, x):
        x = self.hidden(x)
        x = self.activation(x)
        x = self.droput(x)
        x = F.sigmoid(self.out(x))
        return x

In [58]:
ffnn = FFNN(input_dim=9, hidden_dim=256, out_dim=2, activation="tanh", dropout_val=0.1)

### The Training Loop

In [59]:
def optimze_ffnn(trial):
    
    # define the loss function
    loss = nn.CrossEntropyLoss()
        
    # set the params for the model
    ff_hidden_dim = trial.suggest_categorical('ff_hidden_dim', [64, 128])
    ff_activation = trial.suggest_categorical('ff_activation', ['tanh', 'relu', 'sigmoid'])
    ff_droput = trial.suggest_categorical('ff_droput', [0.05, 0.1]) 
    model = FFNN(input_dim=9, hidden_dim=ff_hidden_dim, out_dim=2, activation=ff_activation, 
                 dropout_val=ff_droput)
    
    # set the optimizer
    opt = trial.suggest_categorical('opt', ['Adam', 'AdamW'])
    opt_lr = trial.suggest_categorical('opt_lr', [1e-1, 1e-2])
    if opt == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=opt_lr)
    elif opt == 'AdamW':
        optimizer = torch.optim.AdamW(model.parameters(), lr=opt_lr)
    
    # training loop
    num_epochs = 5
    total = len(test_data)
    all_f1s = list()
    for epoch in range(num_epochs):
        losses = list()
        for batch in train_loader:
            X, y = batch
            # 1. forward
            predicted_vector = model(X)
            # 2. compute objective function
            J = loss(predicted_vector, y)
            # 3. clear the gradients
            optimizer.zero_grad()
            # 4. accumulate partial derivatives of J w.r.t to params
            J.backward()
            # 5. step in opp direction of gradient
            optimizer.step()     
            losses.append(J.item())

        train_loss = torch.tensor(losses).mean()

        all_labels, all_preds = list(), list()
        for batch in test_loader:
            X, y = batch
            # 1. forward
            predicted_vector = model(X)
            # for accuracy
            predicted_labels = torch.argmax(predicted_vector, dim=1)
            # for f1 score
            all_labels.append(y.tolist())
            all_preds.append(predicted_labels.tolist())

        # flatten the lists
        all_labels = [item for sublist in all_labels for item in sublist]
        all_preds = [item for sublist in all_preds for item in sublist]
        f1 = f1_score(all_labels, all_preds)
        acc = accuracy_score(all_labels, all_preds) 
        all_f1s.append(f1)
    
    return max(all_f1s)

In [60]:
study = optuna.create_study(direction='maximize', study_name="FFNN")
study.optimize(optimze_ffnn, n_trials=50)

[32m[I 2021-02-05 19:26:38,339][0m A new study created in memory with name: FFNN[0m
[32m[I 2021-02-05 19:26:39,650][0m Trial 0 finished with value: 0.7011494252873564 and parameters: {'ff_hidden_dim': 128, 'ff_activation': 'relu', 'ff_droput': 0.05, 'opt': 'Adam', 'opt_lr': 0.1}. Best is trial 0 with value: 0.7011494252873564.[0m
[32m[I 2021-02-05 19:26:41,085][0m Trial 1 finished with value: 0.718232044198895 and parameters: {'ff_hidden_dim': 64, 'ff_activation': 'sigmoid', 'ff_droput': 0.1, 'opt': 'Adam', 'opt_lr': 0.1}. Best is trial 1 with value: 0.718232044198895.[0m
[32m[I 2021-02-05 19:26:42,502][0m Trial 2 finished with value: 0.7362637362637363 and parameters: {'ff_hidden_dim': 64, 'ff_activation': 'relu', 'ff_droput': 0.05, 'opt': 'AdamW', 'opt_lr': 0.01}. Best is trial 2 with value: 0.7362637362637363.[0m
[32m[I 2021-02-05 19:26:44,089][0m Trial 3 finished with value: 0.708994708994709 and parameters: {'ff_hidden_dim': 128, 'ff_activation': 'relu', 'ff_droput':

[32m[I 2021-02-05 19:27:19,755][0m Trial 32 finished with value: 0.7272727272727273 and parameters: {'ff_hidden_dim': 64, 'ff_activation': 'sigmoid', 'ff_droput': 0.05, 'opt': 'Adam', 'opt_lr': 0.01}. Best is trial 11 with value: 0.7555555555555555.[0m
[32m[I 2021-02-05 19:27:20,942][0m Trial 33 finished with value: 0.7340425531914894 and parameters: {'ff_hidden_dim': 64, 'ff_activation': 'sigmoid', 'ff_droput': 0.05, 'opt': 'Adam', 'opt_lr': 0.01}. Best is trial 11 with value: 0.7555555555555555.[0m
[32m[I 2021-02-05 19:27:22,204][0m Trial 34 finished with value: 0.739884393063584 and parameters: {'ff_hidden_dim': 64, 'ff_activation': 'sigmoid', 'ff_droput': 0.05, 'opt': 'Adam', 'opt_lr': 0.01}. Best is trial 11 with value: 0.7555555555555555.[0m
[32m[I 2021-02-05 19:27:23,416][0m Trial 35 finished with value: 0.7023809523809523 and parameters: {'ff_hidden_dim': 64, 'ff_activation': 'relu', 'ff_droput': 0.05, 'opt': 'Adam', 'opt_lr': 0.01}. Best is trial 11 with value: 0.75

In [62]:
df = study.trials_dataframe()
df.to_csv('ffnn.csv', index=False)
df.head(5)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_ff_activation,params_ff_droput,params_ff_hidden_dim,params_opt,params_opt_lr,state
0,0,0.701149,2021-02-05 19:26:38.342414,2021-02-05 19:26:39.650145,00:00:01.307731,relu,0.05,128,Adam,0.1,COMPLETE
1,1,0.718232,2021-02-05 19:26:39.651578,2021-02-05 19:26:41.084959,00:00:01.433381,sigmoid,0.1,64,Adam,0.1,COMPLETE
2,2,0.736264,2021-02-05 19:26:41.085991,2021-02-05 19:26:42.501775,00:00:01.415784,relu,0.05,64,AdamW,0.01,COMPLETE
3,3,0.708995,2021-02-05 19:26:42.503044,2021-02-05 19:26:44.089175,00:00:01.586131,relu,0.05,128,Adam,0.01,COMPLETE
4,4,0.722222,2021-02-05 19:26:44.090435,2021-02-05 19:26:45.561304,00:00:01.470869,sigmoid,0.1,128,Adam,0.01,COMPLETE
