<a href="https://colab.research.google.com/github/RonGGG/secret_mission/blob/main/Baseline_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading

In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate this Colab session and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Drive file IDs of the dataset splits, keyed by the local file name
# each one is saved under.
DRIVE_FILE_IDS = {
    'train.csv': '1NWsWT3ABNrRfPyAMxoSZ8jq2doXxjxWS',
    'val.csv': '1fXe2TmzeN-Mh3QjduSKBEifDX_C4xMtG',
    'test_without_labels.csv': '1HHkO_48qPfJCeDugWsF-47frJAAnNjMq',
}

# Download every split into the working directory. Iterating over the mapping
# avoids repeating the same stanza per file and avoids shadowing the built-in
# `id` with a module-level variable.
for local_name, file_id in DRIVE_FILE_IDS.items():
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(local_name)

Load data

In [None]:
import pandas as pd
# Read the three dataset splits from the working directory into DataFrames.
df_train, df_val, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test_without_labels.csv')
)

Have a look at the training data

In [None]:
df_train.sample(10)

Unnamed: 0,sents,labels
19227,ahaha,O
23462,[SEPA] gg [SEPA] wipe,SEPA S SEPA S
25629,gg,S
19631,haha [SEPA] im dead [SEPA] just slark,O SEPA O T SEPA O C
23343,fuck u,T P
14367,gg,S
7791,report jakiro ty,S C O
11724,gg,S
24978,GG WEST BOY,S O O
16957,time 2 punish,O O O


Convert the DataFrame columns to Python lists

In [None]:
# Raw sentence strings and space-separated label strings of the training split.
train_sents, train_labels = (
    df_train[column].tolist() for column in ('sents', 'labels')
)

In [None]:
# Raw sentence strings and space-separated label strings of the validation split.
val_sents, val_labels = (
    df_val[column].tolist() for column in ('sents', 'labels')
)

In [None]:
# Only the sentences are read from the test split.
test_sents = df_test.loc[:, 'sents'].tolist()

# Data Preprocessing

## Tokenization and cleaning

In [None]:
def data_preprocessing(sents_list):
    """Lower-case every sentence and split it on single space characters.

    Returns one token list per input sentence, in input order. Because the
    split is on a literal ' ', consecutive spaces produce empty-string tokens.
    """
    return [sentence.lower().split(' ') for sentence in sents_list]

In [None]:
# Tokenize all three splits with the same preprocessing function.
train_tkzd, val_tkzd, test_tkzd = map(
    data_preprocessing, (train_sents, val_sents, test_sents)
)

In [None]:
def label_preprocessing(label_list):
    """Split each space-separated label string into a list of tag names.

    Returns one tag list per input string, in input order.
    """
    return [tag_string.split(' ') for tag_string in label_list]

In [None]:
# Split the label strings of the two labelled splits into tag lists.
train_targets, val_targets = map(
    label_preprocessing, (train_labels, val_labels)
)

In [None]:
train_tkzd[11]

['my', 'arrows', '[sepa]', 'always', 'decent', '[sepa]', 'fuck', 'u']

In [None]:
train_targets[11]

['P', 'O', 'SEPA', 'O', 'O', 'SEPA', 'T', 'P']

In [None]:
val_tkzd[12]

['i', 'dont', 'care', 'about', 'that', 'i', 'need', 'fair', 'game']

In [None]:
val_targets[12]

['P', 'O', 'S', 'O', 'P', 'P', 'O', 'O', 'O']

In [None]:
test_tkzd[421]

['huska', 'gave', 'us', 'free', 'farm', 'too']

## Generate word_to_ix and tag_to_ix

In [None]:
# Vocabulary: every distinct token receives the next free integer id, in order
# of first appearance across the train, validation and test splits.
word_to_ix = {}
for tokens in train_tkzd + val_tkzd + test_tkzd:
    for token in tokens:
        word_to_ix.setdefault(token, len(word_to_ix))
word_list = list(word_to_ix)

START_TAG = "<START>"
STOP_TAG = "<STOP>"
# Tag inventory: the two boundary tags take ids 0 and 1; the remaining tags
# are numbered in order of first appearance in the train/validation labels.
tag_to_ix = {START_TAG: 0, STOP_TAG: 1}
for tag_seq in train_targets + val_targets:
    for tag_name in tag_seq:
        tag_to_ix.setdefault(tag_name, len(tag_to_ix))

In [None]:
tag_to_ix

{'<START>': 0,
 '<STOP>': 1,
 'C': 8,
 'D': 7,
 'O': 2,
 'P': 4,
 'S': 6,
 'SEPA': 5,
 'T': 3}

## Convert dataset into indices

In [None]:
def to_index(data, to_ix):
    """Replace every item of every sequence in `data` by its `to_ix` entry.

    Returns a list of index lists with the same nesting as `data`. Raises
    KeyError when an item is not a key of `to_ix`.
    """
    return [[to_ix[item] for item in sequence] for sequence in data]

# Token sequences -> word ids for all three splits.
train_input_index, val_input_index, test_input_index = (
    to_index(tokenized, word_to_ix)
    for tokenized in (train_tkzd, val_tkzd, test_tkzd)
)
# Tag sequences -> tag ids for the two labelled splits.
train_output_index, val_output_index = (
    to_index(tag_lists, tag_to_ix)
    for tag_lists in (train_targets, val_targets)
)

In [None]:
train_input_index[11]

[15, 22, 6, 23, 24, 6, 25, 26]

In [None]:
train_tkzd[11]

['my', 'arrows', '[sepa]', 'always', 'decent', '[sepa]', 'fuck', 'u']

In [None]:
train_targets[11]

['P', 'O', 'SEPA', 'O', 'O', 'SEPA', 'T', 'P']

In [None]:
train_output_index[11]

[4, 2, 5, 2, 2, 5, 3, 4]

# Baseline model construction

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the column index of the maximum of `vec`.

    The maximum is taken along dimension 1, and `.item()` requires that this
    yields exactly one index, i.e. `vec` must have a single row.
    """
    return vec.max(1)[1].item()


# Compute log-sum-exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Return log(sum(exp(vec))) over all entries of `vec` as a 0-dim tensor.

    Delegates to `torch.logsumexp`, which subtracts the maximum internally for
    numerical stability. Unlike a hand-rolled version built on `.item()`, this
    stays entirely on the tensor's device (no host synchronisation) and does
    not depend on any other helper.

    vec: tensor of scores; callers in this notebook pass shape (1, num_tags).
    """
    # Flatten first so the reduction covers every entry and yields a scalar.
    return torch.logsumexp(vec.reshape(-1), dim=0)

class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM tagger with a linear-chain CRF output layer.

    The BiLSTM maps a sentence of word indices to per-token emission scores
    over the tag set; a learned transition matrix scores adjacent tag pairs.
    `neg_log_likelihood` is the training loss; calling the module runs Viterbi
    decoding and returns the best-scoring tag sequence.

    Sentences are processed one at a time (batch size 1). Helper tensors are
    created on the device of the inputs/parameters, so the class does not
    depend on a module-level `device` variable.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the embedding, BiLSTM, projection and transition parameters.

        vocab_size: number of rows of the embedding table.
        tag_to_ix: dict mapping tag name -> index; must contain START_TAG and
            STOP_TAG.
        embedding_dim: size of each word embedding.
        hidden_dim: total BiLSTM output size (hidden_dim // 2 per direction);
            it should be even, otherwise the reshape in `_get_lstm_features`
            does not match the LSTM output.
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Hook for pre-trained vectors: an embedding matrix can be copied in
        # as the initial weights of nn.Embedding.
        # self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        # The lookup table is made untrainable.
        # NOTE(review): with the copy above disabled this freezes the randomly
        # initialised embeddings -- confirm that is intended for the baseline.
        self.word_embeds.weight.requires_grad = False

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h_0, c_0) pair for the BiLSTM.

        Both tensors have shape (2, 1, hidden_dim // 2). The values are drawn
        on the CPU and then moved to the device of the model parameters.
        NOTE(review): the state is random on every call, including during
        evaluation, so decoding is not deterministic unless the RNG is seeded.
        """
        target_device = self.transitions.device
        return (torch.randn(2, 1, self.hidden_dim // 2).to(target_device),
                torch.randn(2, 1, self.hidden_dim // 2).to(target_device))

    def _forward_alg(self, feats):
        """Return the log partition function of `feats` as a 0-dim tensor.

        feats: emission scores of shape (sentence_length, tagset_size).
        The recursion is vectorized over tags: one log-sum-exp per token.
        """
        # START_TAG has all of the score.
        forward_var = torch.full((self.tagset_size,), -10000.,
                                 device=feats.device)
        forward_var[self.tag_to_ix[START_TAG]] = 0.

        # Iterate through the sentence
        for feat in feats:
            # scores[next, prev] = alpha[prev] + transition(prev -> next)
            #                      + emission[next]
            scores = (forward_var.unsqueeze(0)
                      + self.transitions
                      + feat.unsqueeze(1))
            # The forward variable of each tag is the log-sum-exp over all
            # previous tags.
            forward_var = torch.logsumexp(scores, dim=1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        return torch.logsumexp(terminal_var, dim=0)

    def _get_lstm_features(self, sentence):
        """Run the BiLSTM over `sentence` and project to tag space.

        sentence: 1-D LongTensor of word indices.
        Returns emission scores of shape (len(sentence), tagset_size).
        """
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score of the provided tag sequence, shape (1,).

        feats: emission scores, one row per token.
        tags: 1-D LongTensor of tag indices aligned with the rows of `feats`.
        """
        score = torch.zeros(1, device=feats.device)
        start = torch.tensor([self.tag_to_ix[START_TAG]],
                             dtype=torch.long, device=tags.device)
        # Prepend START so that tags[i] is the tag preceding token i.
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path for `feats`.

        Returns (path_score, best_path): a 0-dim tensor and a list of Python
        ints with one tag index per row of `feats`.
        """
        start_ix = self.tag_to_ix[START_TAG]
        stop_ix = self.tag_to_ix[STOP_TAG]
        backpointers = []

        # Initialize the viterbi variables in log space
        forward_var = torch.full((self.tagset_size,), -10000.,
                                 device=feats.device)
        forward_var[start_ix] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        for feat in feats:
            # candidate[next, prev] = viterbi[prev] + transition(prev -> next).
            # Emission scores are left out of the max because they do not
            # depend on the previous tag; they are added afterwards.
            candidate = forward_var.unsqueeze(0) + self.transitions
            best_scores, best_prev = candidate.max(dim=1)
            forward_var = best_scores + feat
            # One transfer per token: best previous tag for every next tag.
            backpointers.append(best_prev.tolist())

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[stop_ix]
        path_score, best_last = terminal_var.max(dim=0)
        best_tag_id = best_last.item()

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == start_ix  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the CRF negative log-likelihood of `tags` given `sentence`.

        Computed as log-partition minus gold-path score; shape (1,).
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Decode `sentence`; returns (path_score, list of tag indices)."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [None]:
from sklearn.metrics import f1_score
def cal_acc(model, input_index, output_index):
    '''
    Decode every sentence with `model` and score the token-level predictions.

    model: BiLSTM_CRF
    input_index: [[0],[1,2,3,4,5,6,7,8,9],...]
                  shape is (num_samples, seq_length)
    output_index: [[2],[3,2,4,2,2,2,4,2,2],...]
                  shape is (num_samples, seq_length)

    Returns (predicted, ground_truth, accuracy, f_1): the flattened predicted
    and gold tag indices, the fraction of matching tokens, and the
    micro-averaged F1 score.
    '''
    # Build inputs on the device the model lives on rather than relying on a
    # module-level `device` variable; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    model_device = (first_param.device if first_param is not None
                    else torch.device("cpu"))

    predicted = []
    with torch.no_grad():
        for sentence_ids in input_index:
            sentence_in = torch.tensor(sentence_ids, dtype=torch.long,
                                       device=model_device)
            _, tag_seq = model(sentence_in)
            # Flatten while collecting: one entry per token.
            predicted.extend(tag_seq)

    ground_truth = [tag for tags in output_index for tag in tags]

    correct = sum(1 for gold, pred in zip(ground_truth, predicted)
                  if gold == pred)
    accuracy = correct / len(ground_truth)

    f_1 = f1_score(ground_truth, predicted, average='micro')

    return predicted, ground_truth, accuracy, f_1

## Initialize Model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model sizes.
HIDDEN_DIM = 50
EMBEDDING_DIM = 25

model = BiLSTM_CRF(
    vocab_size=len(word_to_ix),
    tag_to_ix=tag_to_ix,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
)
model = model.to(device)

# Plain SGD with L2 weight decay over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.01, weight_decay=1e-4)

# Training and validating

In [None]:
"""Each epoch will take about 8-9 minutes"""

import datetime

# Number of passes over the training set.
NUM_EPOCHS = 2

for epoch in range(NUM_EPOCHS):
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for idxs, tags_index in zip(train_input_index, train_output_index):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        # Summed (not averaged) per-sentence loss over the epoch.
        train_loss += loss.item()

    model.eval()
    # Token-level accuracy / micro-F1 on the training and validation splits.
    _, _, train_acc, train_f1_score = cal_acc(model, train_input_index, train_output_index)
    _, _, val_acc, vali_f1_score = cal_acc(model, val_input_index, val_output_index)

    # Validation loss is only reported, never back-propagated, so it is
    # computed without building autograd graphs.
    val_loss = 0
    with torch.no_grad():
        for idxs, tags_index in zip(val_input_index, val_output_index):
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(sentence_in, targets)
            val_loss += loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, train f1: %.4f, val loss: %.2f, val acc: %.4f, val f1: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc,train_f1_score, val_loss, val_acc,vali_f1_score, (time2-time1).total_seconds()))

Epoch:1, Training loss: 48351.38, train acc: 0.9293, train f1: 0.9293, val loss: 8850.62, val acc: 0.9275, val f1: 0.9275, time: 522.61s
Epoch:2, Training loss: 21006.39, train acc: 0.9597, train f1: 0.9597, val loss: 5745.28, val acc: 0.9578, val f1: 0.9578, time: 522.92s


# Prediction

In [None]:
def prediction(model, input_index):
    '''
    Decode every sentence with `model` and return the flattened tag indices.

    model: BiLSTM_CRF
    input_index: [[0],[1,2,3,4,5,6,7,8,9],...]
                  shape is (num_samples, seq_length)

    Returns a flat list with one predicted tag index per input token, in
    sentence order.
    '''
    # Build inputs on the device the model lives on rather than relying on a
    # module-level `device` variable; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    model_device = (first_param.device if first_param is not None
                    else torch.device("cpu"))

    predicted = []
    with torch.no_grad():
        for sentence_ids in input_index:
            sentence_in = torch.tensor(sentence_ids, dtype=torch.long,
                                       device=model_device)
            _, tag_seq = model(sentence_in)
            # Flatten while collecting: one entry per token.
            predicted.extend(tag_seq)

    return predicted

In [None]:
# Flat list of predicted tag indices for every token of the test split.
y_pred = prediction(model=model, input_index=test_input_index)

def decode_output(output_list):
    """Translate tag indices back to tag names via the global `tag_to_ix`.

    Raises KeyError for an index that is not a value of `tag_to_ix`.
    """
    ix_to_tag = dict(zip(tag_to_ix.values(), tag_to_ix.keys()))
    return list(map(ix_to_tag.__getitem__, output_list))

# Tag names corresponding to the predicted tag indices.
y_pred_decode = decode_output(output_list=y_pred)

In [None]:
len(y_pred_decode)

2326

In [None]:
# One [ID, tag] row per predicted token; the ID is the running token position
# (starting at 0) rendered as a string.
submission_label = [
    [str(row_id), tag_name] for row_id, tag_name in enumerate(y_pred_decode)
]

In [None]:
import pandas as pd
# Column headers of the submission file.
name = ['ID', 'Predicted']
results = pd.DataFrame(submission_label, columns=name)
# `index=False` is the documented way to omit the row index; `index=None`
# only had that effect because None is falsy.
results.to_csv('Baseline_model_results.csv', encoding='gbk', index=False)