In [1]:
# Team Members: JIANWEI Luo, WEIFENG Luo

# Preliminaries

In [2]:
import numpy as np
import pandas as pd
import os
import warnings
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification,AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,recall_score, confusion_matrix
import numpy as np
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import copy

In [3]:
warnings.filterwarnings('ignore')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# load datasets
def load_datasets(Task_1_folder, Task_2_folder):
    # read csv file
    Task_1_train_file_path = os.path.join(Task_1_folder, "data/train.csv")
    Task_1_test_file_path = os.path.join(Task_1_folder, "data/test.csv")
    Task_2_data_file_path = os.path.join(Task_2_folder, "data/Tweets.csv")
    Task_1_train = pd.read_csv(Task_1_train_file_path)
    Task_1_test = pd.read_csv(Task_1_test_file_path)
    Task_2_data = pd.read_csv(Task_2_data_file_path)
    # nan text processing and label mapping
    Task_2_data['text'] = Task_2_data['text'].replace(np.nan," ")
    label2index = {
        "negative":0,
        "neutral":1,
        "positive":2
    }
    Task_2_data['sentiment'] = Task_2_data['sentiment'].map(label2index)
    return Task_1_train, Task_1_test, Task_2_data
Task_1_folder = "./Task_1"
Task_2_folder = "./Task_2"
Task_1_train, Task_1_test, Task_2_data = load_datasets(Task_1_folder, Task_2_folder)


# Data preprocessing

In [5]:
# combine the text from task1 and task2
new_corpus = pd.DataFrame()
new_corpus['text'] = pd.concat([Task_1_train['text'], Task_1_test['text'], Task_2_data['text']], axis=0)

In [26]:
# Tokenize the words and prepare a pipeline for tensor representation of text.
class Text_Dataset(Dataset):
    def __init__(self, dataset, pretrain_path):
        super(Text_Dataset, self).__init__()
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
    def __getitem__(self, index):
        text = self.dataset['text'].iloc[index]
        encode_dict = self.tokenizer.encode_plus(text=text, max_length=512,padding='max_length', truncation=True)
        return encode_dict['input_ids'], encode_dict['attention_mask']
    def __len__(self):
        return len(self.dataset)


In [7]:
pretrain_path = "./roberta/roberta_base/roberta_base"
text_dataset = Text_Dataset(new_corpus, pretrain_path)

In [8]:
# Split the training data
Task_1_train_X, Task_1_train_Y = Task_1_train['text'], Task_1_train['target']
Task_1_train_X, Task_1_val_X, Task_1_train_Y, Task_1_val_Y = train_test_split( Task_1_train_X, Task_1_train_Y, test_size=0.2, random_state=42)
Task_2_train_X, Task_2_train_Y = Task_2_data['text'], Task_2_data['sentiment']
Task_2_train_X, Task_2_val_X, Task_2_train_Y, Task_2_val_Y = train_test_split( Task_2_train_X, Task_2_train_Y, test_size=0.2, random_state=42)


## Model building: we choose roberta as our pretrained model and we will explain it in our report.

In [9]:
# Model Building
pretrain_path = "./roberta/roberta_base/roberta_base"
Task_1_Bert = AutoModelForSequenceClassification.from_pretrained(pretrain_path,num_labels=2)
Task_2_Bert = AutoModelForSequenceClassification.from_pretrained(pretrain_path,num_labels=3)

Some weights of the model checkpoint at ./roberta/roberta_base/roberta_base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./roberta/roberta_base/roberta_base and are newly initi

## III. Neural Multi-Task Learning

1. Train the model $M$ only for $T_1$ on $D_1^{train}$. Call the trained model $M_{D_1}$. Evaluate $M_{D_1}$ on $D_1^{val}$ and report its performance metric (F1 score) for the first task, ${Perf}_{T_1}(M_{D_1}|D_1^{val})$. Also print the confusion matrix.

In [10]:
# Train the model M1/M2 only for T1/T2 on D1_train/D2_train.T1 calculate F1 on D1_val, T2 calculate ACC on D1_val.
# bulid dataset
class Task_Dataset(Dataset):
    def __init__(self, dataset_X, dataset_Y, pretrain_path):
        super(Task_Dataset, self).__init__()
        self.dataset_X = dataset_X
        self.dataset_Y = dataset_Y
        self.tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
    def __getitem__(self, index):
        text = self.dataset_X.iloc[index]
        target = self.dataset_Y.iloc[index]
        encode_dict = self.tokenizer.encode_plus(text=text, max_length=512,padding='max_length', truncation=True)
        return encode_dict['input_ids'], encode_dict['attention_mask'], target
    def __len__(self):
        return len(self.dataset_X)
pretrain_path = "./roberta/roberta_base/roberta_base"
Task_1_train_dataset = Task_Dataset(Task_1_train_X, Task_1_train_Y, pretrain_path)
Task_1_val_dataset = Task_Dataset(Task_1_val_X, Task_1_val_Y, pretrain_path)
Task_2_train_dataset = Task_Dataset(Task_2_train_X, Task_2_train_Y, pretrain_path)
Task_2_val_dataset = Task_Dataset(Task_2_val_X, Task_2_val_Y, pretrain_path)
# bulid dataloader
def get_dataloader(dataset, batch_size, shuffle):
    def padding_collate(batch):
        (input_ids, attention_masks, labels) = zip(*batch)
        input_ids = torch.tensor(input_ids).to(device)
        attention_masks = torch.tensor(attention_masks).to(device)
        labels = torch.tensor(labels).to(device)
        return {"input_ids":input_ids, "attention_mask":attention_masks, "labels":labels}
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=padding_collate)
Task_1_train_Dataloader = get_dataloader(Task_1_train_dataset, 8, False)
Task_1_val_Dataloader = get_dataloader(Task_1_val_dataset, 8, False)
Task_2_train_Dataloader = get_dataloader(Task_2_train_dataset, 8, False)
Task_2_val_Dataloader = get_dataloader(Task_2_val_dataset, 8, False)
# train_val
def train(bert_model, train_Dataloader, val_Dataloader, task_type, metric_type, epoch, lr, weight_decay):
    bert_model.to(device)
    optimizer = AdamW(bert_model.parameters(), lr=lr, weight_decay=weight_decay)
    bert_model.train()
    for e in range(epoch):
        loss_all = 0
        for index, batch in enumerate(train_Dataloader):
            output = bert_model(**batch)
            loss = output.loss
            loss_all += loss.data
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        print("Epoch:{}/{}, Loss:{}".format(e+1,epoch,loss_all/len(train_Dataloader)))
    bert_model.eval()
    prediction_label_list, true_label_list  = [], []
    with torch.no_grad():
        for index, batch in enumerate(val_Dataloader):
            output = bert_model(**batch)
            prediction_label_list.extend(output.logits.argmax(1).cpu().numpy().tolist())
            true_label_list.extend(batch['labels'].cpu().numpy().tolist())
    save_path = os.path.join(task_type, "trained_model")
    bert_model.save_pretrained(save_path)
    print("Save Model in", save_path)
    if metric_type=="F1":
        print(task_type, "F1 on val Dataset:{:.2f}".format(f1_score(true_label_list, prediction_label_list)))
    else:
        print(task_type, "Acc on val Dataset:{:.2f}".format(accuracy_score(true_label_list, prediction_label_list)))
    print(task_type, "confusion_matrix:")
    print(confusion_matrix(true_label_list, prediction_label_list))

2. Train the model $M$ only for $T_2$ on $D_2^{train}$. Call the trained model $M_{D_2}$. Evaluate $M_{D_2}$ on $D_2^{val}$ and report its performance metric (Accuracy) for the first task, ${Perf}_{T_2}(M_{D_2}|D_2^{val})$. Also print the confusion matrix.

In [11]:
task_type, metric_type, epoch, lr, weight_decay = "Task_1", "F1", 5, 2e-5, 1e-4
train(Task_1_Bert, Task_1_train_Dataloader, Task_1_val_Dataloader, task_type, metric_type, epoch, lr, weight_decay)

Epoch:1/5, Loss:0.44708535075187683
Epoch:2/5, Loss:0.35330304503440857
Epoch:3/5, Loss:0.28106847405433655
Epoch:4/5, Loss:0.2198081612586975
Epoch:5/5, Loss:0.18798157572746277
Save Model in Task_1/trained_model
Task_1 F1 on val Dataset:0.80
Task_1 confusion_matrix:
[[766 108]
 [146 503]]


2. Train the model $M$ only for $T_2$ on $D_2^{train}$. Call the trained model $M_{D_2}$. Evaluate $M_{D_2}$ on $D_2^{val}$ and report its performance metric (Accuracy) for the first task, ${Perf}_{T_2}(M_{D_2}|D_2^{val})$. Also print the confusion matrix.

In [12]:
task_type, metric_type, epoch, lr, weight_decay  = "Task_2", "ACC", 5, 2e-5, 1e-4
train(Task_2_Bert, Task_2_train_Dataloader, Task_2_val_Dataloader, task_type, metric_type, epoch, lr, weight_decay)

Epoch:1/5, Loss:0.5994905829429626
Epoch:2/5, Loss:0.4645318388938904
Epoch:3/5, Loss:0.37755507230758667
Epoch:4/5, Loss:0.3003614544868469
Epoch:5/5, Loss:0.2376321703195572
Save Model in Task_2/trained_model
Task_2 Acc on val Dataset:0.78
Task_2 confusion_matrix:
[[1182  324   56]
 [ 232 1680  318]
 [  34  254 1417]]


3. Estimate the sentiment labels for the tweets in $D_1$: For all text in $D_1$, create $\^{label^2}$ from the ${pred}^2$ output of $M_{D_2}(text)$. This will give you an augmented dataset $\^{D_1} = {({text}_i, {label}_i^1, \^{label}^2_1)}$

In [15]:
print("Loadding Model's Paramater which Trained on D1!")
trained_model_path = "Task_2/trained_model/pytorch_model.bin"
Task_2_Bert.load_state_dict(torch.load(trained_model_path))
def Prediction(model, Dataloader):
    model.to(device)
    model.eval()
    prediction_label_list = []
    for index, batch in enumerate(Dataloader):
        output = model(**batch)
        prediction_label = output.logits.argmax(1)
        prediction_label_list.extend(prediction_label.cpu().numpy().tolist())
    print("Finish the Predcition on Task_1 by Task_2_Bert!")
    return prediction_label_list
# import copy
Task_1_train_X, Task_1_train_Y, pretrain_path = Task_1_train['text'], Task_1_train['target'], "./roberta/roberta_base/roberta_base"
Task_1_train_all_Dataset = Task_Dataset(Task_1_train_X, Task_1_train_Y, pretrain_path)
Task_1_train_all_Dataloader = get_dataloader(Task_1_train_all_Dataset, 8, False)
Task_1_train_all_label_based_on_Task_2_Bert = Prediction(Task_2_Bert, Task_1_train_all_Dataloader)
Task_1_Augmented_Dataset = copy.deepcopy(Task_1_train)
Task_1_Augmented_Dataset['sentiment'] = Task_1_train_all_label_based_on_Task_2_Bert
Task_1_Augmented_Dataset.to_csv("Task_1_Augmented_Dataset/data.csv")


Loadding Model's Paramater which Trained on D1!
Finish the Predcition on Task_1 by Task_2_Bert!


4. Create another dataset by combining $D_1^{train}$ and $D_2^{train}$.

Noted: this dataset has missing labels and is different from the augmented dataset $\hat{D}_1$.

In [16]:
Task_12_train = pd.DataFrame()
Task_12_train['text'] = Task_1_train['text'].tolist() + Task_2_data['text'].tolist()
Task_1_train_label = Task_1_train['target'].tolist() + [None] * len(Task_2_data)
Task_2_train_label = [None] * len(Task_1_train) + Task_2_data['sentiment'].tolist()
Task_12_train['target']  = Task_1_train_label
Task_12_train['sentiment'] = Task_2_train_label

5. We want to train $M$ for both $T_1$ and $T_2$ on $D_{12}$ by minimizing a weighted loss $\lambda_1 l_1 + \lambda_2 l_2$, where $\lambda$ s are positive scaler weights between 0 and 1: higher the $\lambda$ more the emphasis on corresponding task while training; $l_1$, $l_2$ are task-specific loss functions.

In [20]:
# train model for both T1 and T2 by minimizing a weighted loss
class Combined_T1_T2_Dataset(Dataset):
    def __init__(self, dataset, pretrain_path):
        super(Combined_T1_T2_Dataset, self).__init__()
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
    def __getitem__(self, index):
        text = self.dataset['text'].iloc[index]
        if not pd.isnull(self.dataset['target'].iloc[index]):
            target = self.dataset['target'].iloc[index]
        else:
            target = -100
        if not pd.isnull(self.dataset['sentiment'].iloc[index]):
            sentiment = self.dataset['sentiment'].iloc[index]
        else:
            sentiment = -100
        encode_dict = self.tokenizer.encode_plus(text=text, max_length=512,padding='max_length', truncation=True)
        return encode_dict['input_ids'], encode_dict['attention_mask'], target, sentiment
    def __len__(self):
        return len(self.dataset)
pretrain_path = "./roberta/roberta_base/roberta_base"
Task_12_train_dataset = Combined_T1_T2_Dataset(Task_12_train, pretrain_path)
Task_1_Augmented_df = pd.read_csv('Task_1_Augmented_Dataset/data.csv')
Task_1_Augmented_val_df = Task_1_Augmented_df[int(len(Task_1_Augmented_df)*0.8):]
Task_1_Augmented_val_dataset = Combined_T1_T2_Dataset(Task_1_Augmented_val_df, pretrain_path)
def get_dataloader(dataset, batch_size, shuffle):
    def padding_collate(batch):
        (input_ids, attention_mask, targets, sentiments) = zip(*batch)
        input_ids = torch.tensor(input_ids).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        targets = torch.tensor(targets).to(device)
        sentiments = torch.tensor(sentiments).to(device)
        return {"input_ids":input_ids, "attention_mask":attention_mask, "targets":targets, "sentiments":sentiments}
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=padding_collate)
Task_12_train_dataloader = get_dataloader(Task_12_train_dataset, 8, False)
Task_1_Augmented_val_dataloader = get_dataloader(Task_1_Augmented_val_dataset, 8, False)
class Task_12_model(nn.Module):
    def __init__(self, pretrain_path, hidden_dim=768, task_1_label_num=2, task_2_label_num=3):
        super(Task_12_model, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrain_path)
        self.disaster_classification = nn.Linear(hidden_dim, task_1_label_num)
        self.sentiment_classification = nn.Linear(hidden_dim, task_2_label_num)
    def forward(self, batch):
        out = self.bert(input_ids=batch["input_ids"], attention_mask= batch["attention_mask"])
        disaster_pred = self.disaster_classification(out["pooler_output"])
        sentiment_pred = self.sentiment_classification(out["pooler_output"])
        return disaster_pred, sentiment_pred
def train(train_Dataloader, val_Dataloader, pretrain_path, save_path, lambdas_1=0.5, lambdas_2=0.5,epoch=1):
    model = Task_12_model(pretrain_path)
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)  # AdamW优化器
    crit_1, crit_2 = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()
    model.train()
    for e in range(epoch):
        loss_all = 0
        for index, batch in enumerate(train_Dataloader):
            Task_1_prediction, Task_2_prediction = model(batch)
            Task_1_true_label = batch["targets"].long()
            Task_2_true_label = batch["sentiments"].long()
            Task_1_loss = lambdas_1 * crit_1(Task_1_prediction, Task_1_true_label)
            Task_2_loss = lambdas_2 * crit_2(Task_2_prediction, Task_2_true_label)
            loss = Task_1_loss + Task_2_loss
            loss_all += loss.data
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        print("Epoch:{}/{}, Loss:{}".format(e + 1, epoch, loss_all / len(train_Dataloader)))
    model.eval()
    prediciton_label_Task_1_list = []
    true_label_Task_1_list = []
    with torch.no_grad():
        for index, batch in enumerate(val_Dataloader):
            Task_1_prob, Task_2_prob = model(batch)
            prediciton_label_Task_1_list.extend(torch.argmax(Task_1_prob, dim=1).cpu().numpy().tolist())
            true_label_Task_1_list.extend(batch["targets"].cpu().numpy().tolist())
    print("F1 on Task_1_val_Dataset:{:.2f}".format(f1_score(true_label_Task_1_list, prediciton_label_Task_1_list)))
    print("confusion_matrix:")
    print(confusion_matrix(true_label_Task_1_list, prediciton_label_Task_1_list))
    print("Save Model!")
    torch.save(model.state_dict(), os.path.join(save_path, "pytorch_model.pth"))

6. Disaster classification is our primary task. Obtain the best values of the hyper-parameters $\lambda_1$, $\lambda_2$ using any hyperparameter tuning method to optimise the metric ${Perf}_{T_1}(M_{D_{12}}|D_1^{val})$ for task 1. Note, $\hat{D}_1^{val}$ and ${D}_1^{val}$ have the same labels for task 1.

In [21]:
pretrain_path = "./roberta/roberta_base/roberta_base"
save_path = "./Task_12/trained_model"
train(Task_12_train_dataloader, Task_1_Augmented_val_dataloader, pretrain_path, save_path, 0.5, 0.3, 5)
train(Task_12_train_dataloader, Task_1_Augmented_val_dataloader, pretrain_path, save_path, 0.5, 0.5, 5)
train(Task_12_train_dataloader, Task_1_Augmented_val_dataloader, pretrain_path, save_path, 0.5, 0.7, 5)

Some weights of the model checkpoint at ./roberta/roberta_base/roberta_base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:1/5, Loss:0.19117502868175507
Epoch:2/5, Loss:0.15167516469955444
Epoch:3/5, Loss:0.12343809008598328
Epoch:4/5, Loss:0.10026174783706665
Epoch:5/5, Loss:0.08251768350601196
F1 on Task_1_val_Dataset:0.92
confusion_matrix:
[[756  58]
 [ 55 654]]
Save Model!


Some weights of the model checkpoint at ./roberta/roberta_base/roberta_base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:1/5, Loss:0.3175782561302185
Epoch:2/5, Loss:0.2452906221151352
Epoch:3/5, Loss:0.20399875938892365
Epoch:4/5, Loss:0.17204737663269043
Epoch:5/5, Loss:0.14348483085632324
F1 on Task_1_val_Dataset:0.76
confusion_matrix:
[[575 239]
 [124 585]]
Save Model!


Some weights of the model checkpoint at ./roberta/roberta_base/roberta_base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:1/5, Loss:0.37667176127433777
Epoch:2/5, Loss:0.2971390187740326
Epoch:3/5, Loss:0.2372390180826187
Epoch:4/5, Loss:0.19906166195869446
Epoch:5/5, Loss:0.16163714230060577
F1 on Task_1_val_Dataset:0.90
confusion_matrix:
[[787  27]
 [102 607]]
Save Model!


7. Train the model $M$ on $D_{12}^{train}$ with the best $\lambda$ s obtained in step 6. Call it $M_{{D^{\ast}}_{12}}$. Evaluate it on $D_1^{test}$ for the tasks and report ${Perf}_{T_1}(M^{\ast}_{D_{12}}|D_1^{test})$ by participating in the challenge. Submit a screenshot of your leaderboard score and position.

In [27]:
# eval Task_1_test by using the model M on T_12_train with the best lambdas obtained in pre-step.
print("Loading Paramater of the model M on T_12_train!")
pretrain_path = "./roberta/roberta_base/roberta_base"
model = Task_12_model(pretrain_path)
trained_model_path = "./Task_12/trained_model"
model.load_state_dict(torch.load(os.path.join(trained_model_path,"pytorch_model.pth")))
def get_dataloader(dataset, batch_size, shuffle):
    def pad_collate(batch):
        (input_ids, attention_mask) = zip(*batch)
        input_ids = torch.tensor(input_ids).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        return {"input_ids":input_ids, "attention_mask":attention_mask}
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate)
print("Finish Loading!")
Task_1_test_dataset = Text_Dataset(Task_1_test, pretrain_path)
Task_1_test_dataloader = get_dataloader(Task_1_test_dataset, 8, False)
def test(model, Dataloader):
    res = pd.DataFrame()
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    prediciton_label_Task_1_list = []
    for index, batch in enumerate(Dataloader):
        Task_1_prob, Task_2_prob = model(batch)
        prediciton_label_Task_1_list.extend(torch.argmax(Task_1_prob, dim=1).cpu().numpy().tolist())
    res['prediciton'] = prediciton_label_Task_1_list
    res.to_csv("./submit/test_predict.csv")
    print("Finish Prediction!")
test(model, Task_1_test_dataloader)

Loading Paramater of the model M on T_12_train!


Some weights of the model checkpoint at ./roberta/roberta_base/roberta_base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Finish Loading!
Finish Prediction!


8. In step 7, you performed multi-task learning, where data for the second task is externally sourced. Is the best value of hyper-parameter $\lambda_2$ from step 6 zero or postive, i.e. is $\lambda_2 = 0$ or $\lambda_2 > 0$? What does the value of $\lambda_2$ convey? Does the externally sourced sentiment data improve the model accuracy for our primary task of disaster classification?

9. For multi-task learning, why is the augmented dataset $\hat{D_1}$ used only for testing but not for training?

## Random Forest

1. How can the MTL strategy of loss combination in question 5 (part III) be used with a Random Forest? Explain.

2. Try the stated approach. Evaluate the model on $\hat{D}_1^{val}$ and report the performance for both the tasks. Print confusion matrics for both.

In [8]:
# Try the approach based on Random Forest
Task_1_Augmented_df = pd.read_csv("./Task_1_Augmented_Dataset/data.csv")
Task_1_Augmented_train_data = Task_1_Augmented_df[:int(len(Task_1_Augmented_df)*0.8)]
Task_1_Augmented_val_data = Task_1_Augmented_df[int(len(Task_1_Augmented_df)*0.8):]
def sentence_tokenize(datasets, pretrain_path):
    tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
    tokenize_sentences= []
    for i in range(len(datasets)):
        tokenize_sentences.append(tokenizer.tokenize(datasets['text'].iloc[i].strip()))
    return tokenize_sentences
pretrain_path = "./roberta/roberta_base/roberta_base"
print("tokenize train text and val text!")
train_tokenize_sentences = sentence_tokenize(Task_1_Augmented_train_data, pretrain_path)
val_tokenize_sentences = sentence_tokenize(Task_1_Augmented_val_data, pretrain_path)
print("train word2vec!")
word2vec_model = word2vec.Word2Vec(train_tokenize_sentences + val_tokenize_sentences, \
                                   workers=4, \
                                   vector_size=300, min_count=40, \
                                   window=10, sample=1e-3)
word2vec_model.init_sims(replace=True)
index2word_set = set(word2vec_model.wv.index_to_key)
def get_avg_feat(tokenize_sentences, nums_features, model, index2word_set):
    sentences_embedding = []
    for sentences in tokenize_sentences:
        embedding = np.zeros((len(sentences), nums_features), dtype="float32")
        for index, word in enumerate(sentences):
            if word in index2word_set:
                embedding[index] = model.wv[word]
        if len(sentences)!=0:
            sentence_embedding = np.sum(embedding, axis=0) / len(sentences)
        
        sentences_embedding.append(sentence_embedding)
    return sentences_embedding
trainVec = get_avg_feat(train_tokenize_sentences, 300, word2vec_model, index2word_set)
valVec = get_avg_feat(val_tokenize_sentences, 300, word2vec_model, index2word_set)
forest = RandomForestClassifier(n_estimators=1)
print("Training random forest!")
def get_sample_weight(df):
    sample_weight = {0 : 0.5, 1 : 0.5, 2 : 0.5}
    return sample_weight[df['sentiment']]
Task_1_Augmented_train_data['sample_weight'] = Task_1_Augmented_train_data.apply(get_sample_weight, axis=1)
forest = forest.fit(trainVec, Task_1_Augmented_train_data["target"], sample_weight=Task_1_Augmented_train_data["sample_weight"])
Task_1_val_prediction = forest.predict(valVec)
print("F1 on Task_1_Augmented_train_data_Task_1 Using random forest:{:.2f}".format(f1_score(Task_1_Augmented_val_data["target"], Task_1_val_prediction)))
print("confusion_matrix:")
print(confusion_matrix(Task_1_Augmented_val_data["target"], Task_1_val_prediction))


tokenize train text and val text!
train word2vec!
Training random forest!
F1 on Task_1_Augmented_train_data_Task_1 Using random forest:0.57
confusion_matrix:
[[532 282]
 [312 397]]


3. Use any hyper parameter tuning method to find the best values of the $\lambda$ s to maiximize ${Perf}_{T_1}(M_{\hat{D}_1^{train}}|D_1^{val})$. Is the optimal value result in $\lambda_p = \lambda_n = \lambda_0$? If not, what is the ordering of $\lambda$ s, and what can we infer from this ordering?

4. Train the model with the best $\lambda$ s obtained in step 3. Evaluate it on $\hat{D}_1^{test}$ for the tasks and report ${Perf}_{T_1}(M_{\hat{D}_1^{train}}|D_1^{test})$ by participating in the challenge. Submit a screenshot of your leaderboard score and position.

In [12]:
# Evaluate it on Task_1_test_data
# load datasets
def load_datasets(Task_1_folder, Task_2_folder):
    # read csv file
    Task_1_train_file_path = os.path.join(Task_1_folder, "data/train.csv")
    Task_1_test_file_path = os.path.join(Task_1_folder, "data/test.csv")
    Task_2_data_file_path = os.path.join(Task_2_folder, "data/Tweets.csv")
    Task_1_train = pd.read_csv(Task_1_train_file_path)
    Task_1_test = pd.read_csv(Task_1_test_file_path)
    Task_2_data = pd.read_csv(Task_2_data_file_path)
    # nan text processing and label mapping
    Task_2_data['text'] = Task_2_data['text'].replace(np.nan," ")
    label2index = {
        "negative":0,
        "neutral":1,
        "positive":2
    }
    Task_2_data['sentiment'] = Task_2_data['sentiment'].map(label2index)
    return Task_1_train, Task_1_test, Task_2_data
Task_1_folder = "./Task_1"
Task_2_folder = "./Task_2"
Task_1_train, Task_1_test, Task_2_data = load_datasets(Task_1_folder, Task_2_folder)

test_tokenize_sentences = sentence_tokenize(Task_1_test, pretrain_path)
testVec = get_avg_feat(test_tokenize_sentences, 300, word2vec_model, index2word_set)
Task_1_prediction = forest.predict(testVec)
res = pd.DataFrame()
res['prediciton'] = Task_1_prediction
res.to_csv("./submit/test_predict.csv")
print("Predict Finish!")

Predict Finish!


## Further discussion

In [13]:
# approach_1 SVM
svc = SVC(kernel = 'linear')
svc.fit(trainVec, Task_1_Augmented_train_data["target"], sample_weight=Task_1_Augmented_train_data["sample_weight"])
Task_1_val_prediction = svc.predict(valVec)
print("F1 on Task_1_Augmented_train_data_Task_1 using SVM:{:.2f}".format(f1_score(Task_1_Augmented_val_data["target"], Task_1_val_prediction)))
print("confusion_matrix:")
print(confusion_matrix(Task_1_Augmented_val_data["target"], Task_1_val_prediction))

F1 on Task_1_Augmented_train_data_Task_1 using SVM:0.67
confusion_matrix:
[[579 235]
 [229 480]]


In [14]:
# approach_2 gbdt
GBDT = GradientBoostingClassifier()  # adaboost
GBDT = GBDT.fit(trainVec, Task_1_Augmented_train_data["target"], sample_weight=Task_1_Augmented_train_data["sample_weight"])  # 拟合训练集
Task_1_val_prediction = GBDT.predict(valVec)
print("F1 on Task_1_Augmented_train_data_Task_1 using GBDT:{:.2f}".format(f1_score(Task_1_Augmented_val_data["target"], Task_1_val_prediction)))
print("confusion_matrix:")
print(confusion_matrix(Task_1_Augmented_val_data["target"], Task_1_val_prediction))

F1 on Task_1_Augmented_train_data_Task_1 using GBDT:0.64
confusion_matrix:
[[634 180]
 [292 417]]
