In [1]:
%%time
import json
import numpy as np
import pandas as pd
import torch
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
import re
import altair as alt
import krippendorff

ModuleNotFoundError: No module named 'krippendorff'

### 1. Load the data

In [2]:
%%time
# 2min

# Read the notebook's word lists: an acronym table (used by drop_stopwords)
# and a stopword list.
with open("Words.json") as f:
    words = json.load(f)

acronym = words["acronym"]
stopwords = words["stopwords"]
    
def drop_stopwords(sentence: str, acronyms=None):
    """Normalise a sentence into a space-separated token string.

    Despite its name, this function does not remove stopwords. It:
      * lower-cases the text and replaces the typographic apostrophe with "'",
      * splits on single spaces,
      * replaces every chunk containing "http" with the placeholder "LINK",
      * otherwise keeps words (optionally with one inner apostrophe) and the
        punctuation marks "?" and "!" as separate tokens,
      * expands any token found in the acronym table into that entry's tokens.

    Args:
        sentence: raw text to normalise.
        acronyms: optional mapping token -> iterable of replacement tokens.
            Defaults to the module-level ``acronym`` table.

    Returns:
        The normalised tokens joined by single spaces.
    """
    if acronyms is None:
        acronyms = acronym

    tokens = []
    for chunk in sentence.lower().replace("’", "'").split(" "):
        if "http" in chunk:
            tokens.append("LINK")
            continue
        # Raw string: "\w" and "\?" are regex escapes, not Python string escapes.
        for w in re.findall(r"\w*'?\w+|\?|!", chunk):
            if w in acronyms:
                tokens.extend(acronyms[w])
            else:
                tokens.append(w)
    return " ".join(tokens)

# Question/reply texts indexed by question_id.
data = pd.read_csv("Data/Training/si630w22-hw3-data.csv", index_col="question_id")

# "features" is the normalised question and reply joined by a literal " [SEP] ".
data["features"] = [
    " [SEP] ".join([drop_stopwords(question), drop_stopwords(reply)])
    for question, reply in zip(data["question_text"], data["reply_text"])
]

def load_df(path: str):
    """Load a ratings CSV and return one row per question with its mean rating.

    The CSV is read with "id" as index and inner-joined on index with the
    module-level ``data`` frame. If the file has no "rating" column, a
    constant rating of 0 is filled in first.

    Returns:
        A frame indexed by "question_id" with the columns "features",
        "question_text", "reply_text" and "rating" (mean over all rows that
        share the same id).
    """
    ratings = pd.read_csv(path, index_col="id")
    if "rating" not in ratings.columns:
        ratings["rating"] = 0

    # After the index-on-index merge, reset_index() exposes the key as "index".
    merged = ratings.merge(data, left_index=True, right_index=True).reset_index()

    mean_rating = merged.groupby(["index"])["rating"].mean()
    text_columns = ["features", "index", "question_text", "reply_text"]
    one_row_per_question = merged.drop_duplicates(["index"])[text_columns]

    result = one_row_per_question.merge(
        mean_rating,
        right_index=True,
        left_on="index",
    ).set_index(["index"])
    result.index.name = "question_id"
    return result

# Build the per-question train and dev frames (mean rating per question_id);
# both are module-level names used by the later training/validation cells.
train_df = load_df("Data/Training/si630w22-hw3-train.csv")
dev_df = load_df("Data/Training/si630w22-hw3-dev.csv")
# Preview the first rows of the training frame.
train_df.head()

CPU times: user 398 ms, sys: 21.1 ms, total: 419 ms
Wall time: 461 ms


Unnamed: 0_level_0,features,question_text,reply_text,rating
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
t3_n2714y,is there someone you turned down in the past a...,"Is there someone you turned down in the past, ...",Idk if this counts but my when I was younger m...,4.75
t3_n27873,what is in your opinion the saddest villain ba...,"What is, in your opinion, the saddest villain ...",My man Dr Heinz Doofenschmirts was born withou...,3.75
t3_n27b1e,explain like i'm five how do we still not know...,ELI5: How do we still not know how eels reprod...,"For a long time, it wasn't known how eels mate...",3.5
t3_n27qop,explain like i'm five why can't freshwater fis...,ELI5: Why can’t freshwater fish live in saltwa...,A living cell is designed to work at specific ...,4.0
t3_n27vu3,what's something nice you like to do just to b...,What's something nice you like to do just to b...,Give compliments. It’s extremely easy to do an...,4.4


### 2. Load the model and prepare the data set

In [3]:
%%time 
# 10s
# Tokenizer loaded from a local checkpoint directory.
# NOTE(review): the models below are loaded from "MiniLM" while the tokenizer
# comes from "../../MiniLM" — confirm both paths refer to the same checkpoint.
tokenizer = BertTokenizer.from_pretrained("../../MiniLM")
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Per-device batch size used for both training and evaluation in train().
batch_size = 24
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output with optional per-example labels.

    Args:
        encodings: mapping of field name -> per-example sequences; must
            contain "input_ids", which defines the dataset length.
        labels: optional indexable of per-example labels (list, numpy array,
            pandas Series, ...). When omitted or empty, items carry no
            "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus "labels" if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if labels:` raises
        # for array-like labels (numpy arrays, pandas Series, tensors).
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

CPU times: user 19.3 ms, sys: 11.6 ms, total: 30.9 ms
Wall time: 62 ms


In [4]:
# Encode each (question, reply) pair as one two-segment input, padded and
# truncated to at most 512 tokens, then wrap the encodings with their ratings.
train_tokenized = tokenizer(
    list(train_df["question_text"]),
    list(train_df["reply_text"]),
    padding=True,
    truncation=True,
    max_length=512,
)
train_dataset = Dataset(train_tokenized, list(train_df["rating"]))

dev_tokenized = tokenizer(
    list(dev_df["question_text"]),
    list(dev_df["reply_text"]),
    padding=True,
    truncation=True,
    max_length=512,
)
dev_dataset = Dataset(dev_tokenized, list(dev_df["rating"]))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

### 3. Start training

In [5]:
def train(model, train_dataset, finetune_batch_size=None):
    """Fine-tune ``model`` in two phases and return it.

    Phase 1 freezes ``model.bert`` and trains the remaining parameters with a
    learning rate of 5e-3. Phase 2 unfreezes the encoder and trains everything
    with a learning rate of 1e-5. Each phase runs 2 epochs, evaluates on the
    module-level ``dev_dataset`` every 100 steps and uses early stopping with
    a patience of 3 evaluations.

    Checkpoints are written every 100 steps so that
    ``load_best_model_at_end`` has something to restore; with the default
    ``save_steps`` of 500 a run shorter than 500 steps never saves a
    checkpoint. Each phase writes to its own sub-directory of "output".

    Args:
        model: model exposing its encoder as ``model.bert``.
        train_dataset: dataset handed to ``Trainer`` as training data.
        finetune_batch_size: optional per-device batch size for phase 2 only.
            When smaller than the module-level ``batch_size``, gradient
            accumulation keeps the effective batch size at roughly
            ``batch_size``. Use this when full fine-tuning does not fit in GPU
            memory. Defaults to ``batch_size``.

    Returns:
        The same ``model`` object, trained in place.
    """
    import gc

    eval_every = 100

    def set_encoder_trainable(trainable):
        for param in model.bert.parameters():
            param.requires_grad = trainable

    def run_phase(name, learning_rate, train_batch_size):
        # A fresh TrainingArguments per phase instead of mutating a shared one.
        args = TrainingArguments(
            output_dir="output/" + name,
            evaluation_strategy="steps",
            eval_steps=eval_every,
            save_strategy="steps",
            save_steps=eval_every,
            save_total_limit=2,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=max(1, batch_size // train_batch_size),
            learning_rate=learning_rate,
            num_train_epochs=2,
            seed=123,
            load_best_model_at_end=True,
        )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        )
        trainer.train()

    # `device` is a torch.device, so comparing it with the string "cpu" is not
    # a reliable check; moving the model unconditionally is always safe.
    model.to(device)

    # Phase 1: freeze the bert parameters and train only the head.
    set_encoder_trainable(False)
    run_phase("head_only", 5e-3, batch_size)

    # Drop the first trainer's optimizer state and cached blocks before the
    # much more memory-hungry full fine-tuning phase.
    gc.collect()
    torch.cuda.empty_cache()

    # Phase 2: unfreeze the encoder and fine-tune the whole model.
    set_encoder_trainable(True)
    if finetune_batch_size is None:
        finetune_batch_size = batch_size
    run_phase("full_finetune", 1e-5, finetune_batch_size)
    return model

def predict(model, df_pred):
    """Return ``model``'s predictions for every row of ``df_pred`` as a flat array.

    ``df_pred`` must provide "question_text", "reply_text" and "rating"
    columns; the ratings are passed along as labels of the prediction dataset.
    """
    encoded = tokenizer(
        list(df_pred["question_text"]),
        list(df_pred["reply_text"]),
        padding=True,
        truncation=True,
        max_length=512,
    )
    prediction_set = Dataset(encoded, list(df_pred["rating"]))
    output = Trainer(model=model).predict(prediction_set)
    return output.predictions.flatten()
    
def validate(model, df_val):
    """Return the mean squared error of ``model``'s predictions on ``df_val``.

    Args:
        model: model passed through to ``predict``.
        df_val: frame with "question_text", "reply_text" and "rating" columns.

    Returns:
        Mean of the squared differences between ``df_val["rating"]`` and the
        predictions.
    """
    pred = predict(model, df_val)
    # Score against the frame that was actually predicted, not the global
    # dev_df, so the function is correct for any validation frame.
    errors = df_val["rating"].to_numpy() - pred
    mse = np.sum(errors ** 2) / len(df_val)
    return mse

In [12]:
# Load the local "MiniLM" checkpoint with a single-output head (num_labels=1).
# NOTE(review): the tokenizer is loaded from "../../MiniLM" but the model from
# "MiniLM" — confirm both paths refer to the same checkpoint.
model = BertForSequenceClassification.from_pretrained("MiniLM", num_labels=1)
# Two-phase training; train() also returns the model, which is trained in place.
train(model, train_dataset)
# Mean squared error on the dev frame.
mse = validate(model, dev_df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at MiniLM and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 3779
  Num Epochs = 2
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 316


Step,Training Loss,Validation Loss
100,No log,0.748431
200,No log,0.727368
300,No log,0.722907


***** Running Evaluation *****
  Num examples = 811
  Batch size = 24
***** Running Evaluation *****
  Num examples = 811
  Batch size = 24
***** Running Evaluation *****
  Num examples = 811
  Batch size = 24


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running training *****
  Num examples = 3779
  Num Epochs = 2
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 316


RuntimeError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 0; 8.00 GiB total capacity; 6.99 GiB already allocated; 0 bytes free; 7.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the model into a directory named after the dev MSE in hundredths,
# i.e. the string form of int(100 * mse).
Trainer(model=model).save_model(str(int(100 * mse)))

### 5. Validation

In [None]:
%%time
# Reload a saved checkpoint directory. The name "40" is hard-coded; the save
# cell names its directory int(100 * mse), so this only matches a run whose
# value was 40.
model = BertForSequenceClassification.from_pretrained("40", num_labels=1)
# Recompute the dev-set mean squared error for the reloaded model.
mse = validate(model, dev_df)

In [None]:
%%time
# Load the public test split; load_df() fills a rating of 0 when the file has
# no "rating" column.
# NOTE(review): the train/dev files are read from "Data/Training/..." in the
# loading cell — confirm this path.
test_df = load_df("Data/si630w22-hw3-test.public.csv")
# Predict and store the predictions next to the inputs.
pred = predict(model, test_df)
test_df["predicted"] = pred

In [None]:
# load_df() returns the question id as the index (named "question_id"), not as
# a column, so only the prediction column is selected and the index is written
# under the "id" header.
test_df[["predicted"]].to_csv("TestResult.csv", index_label="id")

### 6. Correlation Plot

In [None]:
%%time

def _group_mean_ratings(path):
    """Mean rating per (question, group) from the CSV at ``path``, indexed by group.

    The CSV is inner-joined on index with the module-level ``data`` frame; the
    result has the columns "index" (question id) and "rating".
    """
    annotations = pd.read_csv(path, index_col="id")
    merged = annotations.merge(data, left_index=True, right_index=True).reset_index()
    per_group = merged[["rating", "index", "group"]].groupby(["index", "group"]).mean()
    return per_group.reset_index().set_index(["group"])


eva_df = _group_mean_ratings("Data/si630w22-hw3-train.csv")
eva_dev_df = _group_mean_ratings("Data/si630w22-hw3-dev.csv")
model_name = "MiniLM"

def train_model(eva_df, group):
    """Train with one group's training labels masked and score three dev subsets.

    The training ratings of ``group`` are replaced by the mean rating of all
    other groups, a fresh model is trained on the result, and the correlation
    between predictions and ratings is computed on:
      * A: dev rows from every group except ``group``,
      * B: dev rows from ``group``,
      * C: the questions ``group`` rated in dev, paired with the ratings the
        other groups gave them.

    Args:
        eva_df: training ratings indexed by group, with "index" (question id)
            and "rating" columns. It is not modified.
        group: label of the group whose training ratings are masked.

    Returns:
        Dict with "trainer" (the prediction ``Trainer``), "corr" (the three
        correlations in A, B, C order), "group", and "mse" (mean squared error
        on the module-level ``dev_dataset`` / ``dev_df``).
    """
    # Mean rating of every other group; stands in for the masked group's labels.
    mean = eva_df[eva_df.index != group]["rating"].mean()

    # Work on a copy: masking the caller's frame in place would leave earlier
    # groups masked when this function is called again for the next group.
    train_part = eva_df.copy()
    train_part.loc[group, "rating"] = mean

    dev_others = eva_dev_df[eva_dev_df.index != group]  # A
    dev_group = eva_dev_df[eva_dev_df.index == group]  # B
    dev_overlap = dev_group[["index"]].merge(dev_others, left_on="index", right_on="index")  # C

    df_s = []
    datasets = []
    for part in (train_part, dev_others, dev_group, dev_overlap):
        # rename_axis returns a new frame, so no shared index is renamed in place.
        part = part.rename_axis("question_id").reset_index().merge(
            data[["question_text", "reply_text", "features"]],
            right_index=True,
            left_on="index",
        )
        tokenized = tokenizer(
            list(part["question_text"]),
            list(part["reply_text"]),
            padding=True,
            truncation=True,
            max_length=512,
        )
        datasets.append(Dataset(tokenized, list(part["rating"])))
        df_s.append(part)

    model = BertForSequenceClassification.from_pretrained("MiniLM", num_labels=1)
    model = train(model, datasets[0])
    trainer = Trainer(model=model)

    corr = []
    for part, dataset in zip(df_s[1:], datasets[1:]):
        part["pred"] = trainer.predict(dataset).predictions.flatten()
        # Off-diagonal entry of the 2x2 correlation matrix.
        corr.append(part[["rating", "pred"]].corr().values[0][1])

    dev_pred = trainer.predict(dev_dataset).predictions.flatten()
    mse = np.sum((dev_df["rating"] - dev_pred) ** 2) / len(dev_pred)
    return {"trainer": trainer, "corr": corr, "group": group, "mse": mse}

In [None]:
# Run the masked-group experiment for the selected groups (currently only the
# first one in sorted order) and collect one wide row per group with the
# correlations under the columns "a", "b" and "c".
corr_records = []
for group in sorted(set(eva_df.index))[:1]:
    res = train_model(eva_df, group)
    row = {"group": group}
    row.update({label: res["corr"][pos] for pos, label in enumerate(["a", "b", "c"])})
    corr_records.append(row)

corr_df = pd.DataFrame(corr_records)
corr_df

In [None]:
# The encodings below need long-form data: one row per (group, corr_type)
# with the value in "corr". corr_df is wide (columns "a", "b", "c").
corr_long = corr_df.melt(
    id_vars="group",
    value_vars=["a", "b", "c"],
    var_name="corr_type",
    value_name="corr",
)

# Split the facets over two rows by group position (first 39 groups on top).
# Group labels need not be numeric, so they are not compared with a number.
groups_per_row = 39
first_row_groups = sorted(corr_long["group"].unique())[:groups_per_row]
in_first_row = corr_long["group"].isin(first_row_groups)


def _corr_bars(frame):
    """Bar chart of the correlations, one facet column per group."""
    return alt.Chart(frame).mark_bar().encode(
        x=alt.X('corr_type:N', title=None),
        y='corr:Q',
        color='corr_type:N',
        column='group:N'
    )


chart1 = _corr_bars(corr_long[in_first_row])
chart2 = _corr_bars(corr_long[~in_first_row])
# Only stack the second row when there are groups left to show in it.
chart = chart1 & chart2 if (~in_first_row).any() else chart1
chart

### 7. In-group Analysis

In [None]:
# Tab-separated annotation sheet: one row per question, one column per annotator.
df = pd.read_csv("Group2Annotation.csv", index_col="question_id", sep="\t")

# Anonymise the annotator columns and reshape to long form, one row per
# (question, annotator), ordered by question and then by annotator.
annotator_aliases = {"pandapcd": "annotator1", "xuelw": "annotator2", "liulim": "annotator3"}
df_sub = (
    df[list(annotator_aliases)]
    .rename(columns=annotator_aliases)
    .melt(ignore_index=False, var_name="annotator", value_name="rating")
    .sort_values(["annotator"])
    .sort_index(kind="stable")
)
df_sub.to_csv("Group2Submission.csv", index_label="question_id")

In [None]:
# Pairwise correlation matrix of the three annotators' rating columns
# (DataFrame.corr() with its default method).
df[["pandapcd", "xuelw", "liulim"]].corr()

In [None]:
# Krippendorff's alpha over the three annotator columns, treating the ratings
# 1..5 as ordinal. The frame is transposed so each row holds one annotator.
krippendorff.alpha(df[["pandapcd", "xuelw", "liulim"]].T, value_domain=list(range(1, 6)), level_of_measurement="ordinal")

In [None]:
# Same agreement statistic with the ratings 1..5 treated as nominal
# (unordered) categories; the frame is transposed so each row is one annotator.
krippendorff.alpha(df[["pandapcd", "xuelw", "liulim"]].T, value_domain=list(range(1, 6)), level_of_measurement="nominal")

In [None]:
%%time
# Ratings from all other groups for the questions this group annotated.
df_dev = pd.read_csv("Data/Training/si630w22-hw3-dev.csv", index_col="id")
df_train = pd.read_csv("Data/Training/si630w22-hw3-train.csv", index_col="id")
all_ratings = pd.concat([df_dev, df_train])

# Keep rows that are not from group_02 and whose id appears in the annotation sheet.
keep = ((all_ratings["group"] != "group_02") & all_ratings.index.isin(df.index)).to_numpy()
df_public = all_ratings.loc[keep, "rating"].reset_index().groupby(["id"]).mean()

# Mean rating per question from this group's annotators, aligned on question id,
# and its absolute difference from the other groups' mean rating.
df_public["group2_rating"] = df_sub["rating"].reset_index().groupby(["question_id"])["rating"].mean()
df_public["diff"] = (df_public["rating"] - df_public["group2_rating"]).abs()

# Largest disagreements first.
df_public = df_public.sort_values(["diff"], ascending=False)
df_public

In [None]:
# The ten questions with the largest rating gap, joined with this group's
# per-annotator ratings and written out for inspection.
top_diff = df_public.head(10)
annotated_top = df.loc[df.index.isin(top_diff.index)]
df_diff = annotated_top.merge(top_diff, left_index=True, right_index=True)
df_diff.to_csv("Group2Diff.csv", index_label="question_id")
df_diff