# Neural-based propaganda detection
Propaganda is the new weapon that influences people's opinions or beliefs about a certain ideology, whether that ideology is right or wrong. This assignment requires you to design a propaganda content identifier. Below presents the sample code for using the provided dataset to train an MLP-based propaganda detector.

In [8]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [5]:
import torch
torch.cuda.current_device()
torch.cuda.device(0)
torch.cuda.device_count()
torch.cuda.get_device_name(0)
torch.cuda.is_available()

True

In [9]:
# load data
import pandas as pd
from sklearn.utils import shuffle
df = pd.read_table('coursework2_train.tsv')
df = shuffle(df) # randomly shuffle data entries 
df

Unnamed: 0,article_id,article_title,label,sentence_text
4309,730865684,Puerto Rico Hurricane Recovery Worsened By Nea...,non-propaganda,"After 2011, the territory adopted a uniform bu..."
3841,730268758,Evidence shows Pope Francis is a ‘principal in...,non-propaganda,Reports say Pope Francis personally received t...
7857,790677230,Kavanaugh's Nomination Saved?,non-propaganda,Attorneys for new Kavanaugh accusers Deborah R...
7031,703821117,The Cunning CIA,non-propaganda,"Shapira points out, “Several news organization..."
7053,703821117,The Cunning CIA,propaganda,"Interesting enough, in the Chile regime-change..."
...,...,...,...,...
5256,728972961,FOR THE FIRST TIME ONLINE: Archbishop Lefebvre...,non-propaganda,"I was in Melbourne, Australia, during the 40th..."
7848,790677230,Kavanaugh's Nomination Saved?,non-propaganda,“Democratic staff was invited to participate a...
6999,703821117,The Cunning CIA,propaganda,There could be only one answer: communists.
3865,730268758,Evidence shows Pope Francis is a ‘principal in...,non-propaganda,"“This is Barros present, in the room when the ..."


In [10]:
raw_labels = df.label.values.tolist()
docs = df.sentence_text.values.tolist()
titles = df.article_title.values.tolist()

label_dic = {'non-propaganda':0, 'propaganda':1}

assert len(docs) == len(raw_labels) == len(titles)
labels = [label_dic[rl] for rl in raw_labels] # transfer raw labels (strings) to integer numbers
print('total data size: {}, label type num: {}'.format(len(docs), len(label_dic)))

total data size: 11464, label type num: 2


In [11]:
# take a look at some sentences in the dataset
print(docs[19])
print(titles[19])
print(labels[19])

Trumpism is not a detour, after which we can all get back on the interstate to the New World Order.
Patrick J. Buchanan: Sorry, Jeff Flake, It's Trump's Party Now!
1


In [7]:
# df.to_csv(r'D:/data/text/data_edited.csv', header=None, index=None, sep='\t', mode='a') 

In [12]:
print('num of non-propoganda entries', len([l for l in labels if l== 0]))
print('num of propoganda entries', len([l for l in labels if l== 1]))

num of non-propoganda entries 8227
num of propoganda entries 3237


In [13]:
# split the data into train, dev and test

train_ratio, dev_ratio, test_ratio = 0.6, 0.2, 0.2
train_docs = docs[:int(len(docs)*train_ratio)]
train_labels = labels[:int(len(docs)*train_ratio)]

dev_docs = docs[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]
dev_labels = labels[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]

test_docs = docs[-int(len(docs)*(test_ratio)):]
test_labels = labels[-int(len(docs)*(test_ratio)):]

print('train size {}, dev size {}, test size {}'.format(len(train_labels), len(dev_labels), len(test_labels)))

train size 6878, dev size 2293, test size 2292


## Machine Learning Model

In [47]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
#for i in range(1,35000,1000):
#max_feature_num = 1000
def logistic(train_data,test_data,train_labels,test_labels):
    train_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                encoding='utf-8',input='content', lowercase=True, max_df=20000,
                max_features=40000, min_df=1, ngram_range=(1, 2), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True)
    train_vecs = train_vectorizer.fit_transform(train_data)
    test_vecs = TfidfVectorizer(vocabulary=train_vectorizer.vocabulary_).fit_transform(test_data)

# train model
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(C=10000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False).fit(train_vecs, train_labels)

# test model
    test_pred = clf.predict(test_vecs)
    from sklearn.metrics import precision_recall_fscore_support,accuracy_score
    acc = accuracy_score(test_labels, test_pred)
    pre, rec, f1, _ = precision_recall_fscore_support(test_labels, test_pred, average='macro')
    # print(max_feature_num)
    print('acc', acc)
    print('precision', pre)
    print('rec', rec)
    print('f1', f1)

In [21]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def word_removal(data):
    for i in range(0,len(data)):
        for word in data[i]:
            if word in string.punctuation: # remove all punctuations
                data[i].remove(word)
            elif word in stop_words:
                data[i].remove(word)
            elif (word.isnumeric() == True):
                data[i].remove(word)
    return data

In [16]:
# Building different sets of data
all_text = pd.DataFrame()
all_text['text'] = df['sentence_text']
all_lables = df['label'].tolist()
# Set 1 : lemmatized data
# Set 2 : Stemmed data
import nltk
import string
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
ps_stemmer = PorterStemmer()
all_text['text_tokenized']=all_text['text'].apply(nltk.word_tokenize)
all_text['text_lemmatized']=all_text['text'].apply(nltk.word_tokenize).apply(lambda row: list(wordnet_lemmatizer.lemmatize(row[row.index(y)].lower()) for y in row))
all_text['text_stemmed']=all_text['text'].apply(nltk.word_tokenize).apply(lambda row: list(ps_stemmer.stem(row[row.index(y)].lower()) for y in row))

In [22]:
all_text['text_tokenized_stopwordspunctnum_removal'] = word_removal(all_text['text_tokenized'])
all_text['text_lemmatized_stopwordspunctnum_removal'] = word_removal(all_text['text_lemmatized'])
all_text['text_stemmed_stopwordspunctnum_removal'] = word_removal(all_text['text_stemmed'])

In [26]:
train_text_tokenized =[" ".join(review) for review in all_text['text_tokenized'][:int(len(docs)*0.8)].values]
train_text_lemmatized =[" ".join(review) for review in all_text['text_lemmatized'][:int(len(docs)*0.8)].values]
train_text_stemmed =[" ".join(review) for review in all_text['text_stemmed'][:int(len(docs)*0.8)].values]
train_text_tokenized_stopwordspunctnum_removal =[" ".join(review) for review in all_text['text_tokenized_stopwordspunctnum_removal'][:int(len(docs)*0.8)].values]
train_text_lemmatized_stopwordspunctnum_removal =[" ".join(review) for review in all_text['text_lemmatized_stopwordspunctnum_removal'][:int(len(docs)*0.8)].values]
train_text_stemmed_stopwordspunctnum_removal =[" ".join(review) for review in all_text['text_stemmed_stopwordspunctnum_removal'][:int(len(docs)*0.8)].values]

In [45]:
len(test_labels)

2292

In [42]:
train_labels = labels[:int(len(docs)*(0.8)):]
test_text = docs[-int(len(docs)*(test_ratio)):]
test_labels = labels[-int(len(docs)*(test_ratio)):]
# train_labels = all_lables[:35000]
# test_text = [" ".join(review) for review in all_text['text'][35000:].values]
# test_labels = all_lables[35000:]

In [48]:
print("train_text_tokenized")
logistic(train_text_tokenized,test_text,train_labels,test_labels)
print("train_text_lemmatized")
logistic(train_text_lemmatized,test_text,train_labels,test_labels)
print("train_text_stemmed")
logistic(train_text_stemmed,test_text,train_labels,test_labels)
print("train_text_tokenized_stopwordspunctnum_removal")
logistic(train_text_tokenized_stopwordspunctnum_removal,test_text,train_labels,test_labels)
print("train_text_lemmatized_stopwordspunctnum_removal")
logistic(train_text_lemmatized_stopwordspunctnum_removal,test_text,train_labels,test_labels)
print("train_text_stemmed_stopwordspunctnum_removal")
logistic(train_text_stemmed_stopwordspunctnum_removal,test_text,train_labels,test_labels)

train_text_tokenized




acc 0.7185863874345549
precision 0.6375897393701273
rec 0.628914098705684
f1 0.6325629451633117
train_text_lemmatized




acc 0.7046247818499127
precision 0.6247899651078926
rec 0.6233864416285773
f1 0.6240659394705277
train_text_stemmed




acc 0.6631762652705061
precision 0.5952572754166929
rec 0.6060908305073476
f1 0.5981486107767567
train_text_tokenized_stopwordspunctnum_removal




acc 0.7203315881326352
precision 0.6407634941011064
rec 0.6331465725345985
f1 0.636439224035854
train_text_lemmatized_stopwordspunctnum_removal




acc 0.6958987783595113
precision 0.6212254690114469
rec 0.6265087583876126
f1 0.6235130757423751
train_text_stemmed_stopwordspunctnum_removal
acc 0.6762652705061082
precision 0.5953776020376224
rec 0.5983717696308439
f1 0.5967066166461937




In [50]:
# load the glove pre-trained embedding
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

path_of_downloaded_files = "D:/Downloads/glove.6B/glove.6B.300d.txt"
glove_file = datapath(path_of_downloaded_files)
word2vec_glove_file = get_tmpfile("glove.6B.300d.txt")
glove2word2vec(glove_file, word2vec_glove_file)
word_vectors = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [51]:
from nltk.tokenize import word_tokenize
import numpy as np

# Out-of-vocabulary (OOV) words: words that are not included in the pre-trained embedding model
# There exist many ways to vectorize OOV words, e.g. use a random vector to represent all OOV words
# Feel free to search and employ other ways to vectorize OOV words
word_vec_dim = 300 # make sure this number matches the embedding you use
oov_vec = np.random.rand(word_vec_dim) 
def vectorize_sent(word_vectors, sent):
    word_vecs = []
    for token in word_tokenize(sent): 
        if token not in word_vectors: 
            word_vecs.append(oov_vec)
        else:
            word_vecs.append(word_vectors[token].astype('float64'))
    return np.mean(word_vecs,axis=0)

vv = vectorize_sent(word_vectors, 'hello world ! this is a test sentence !')

In [52]:
# create vector representations; 
# TODO: consider to apply necessary text cleaning/normalization techniques
# TODO: consider whether to use titles information (the example below does not use titles but only sentences)

train_vecs = np.array([vectorize_sent(word_vectors, ss) for ss in train_docs])
dev_vecs = np.array([vectorize_sent(word_vectors, ss) for ss in dev_docs])
print(train_vecs.shape)

(6878, 300)


In [53]:
# define a simple MLP (multi-layer perceptron) as the classifation model
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_dim, out_dim, dp_rate):
        super(MLP, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, input_dim*2)
        self.output_layer = nn.Linear(input_dim*2, out_dim)
        self.dropout = nn.Dropout(dp_rate)
        self.relu = torch.nn.ReLU()
       
    def forward(self, x_in):
        z1 = self.dropout(x_in) # output of the input layer, after dropout
        z2 = self.relu(self.hidden_layer(z1)) # output of the hidden layer
        logits = self.output_layer(z2)
        return logits

In [54]:
# build model
dropout_rate = 0.5 
model = MLP(word_vec_dim,len(label_dic),dropout_rate) 
loss_fnc = torch.nn.CrossEntropyLoss()

# hyper parameters
n_epochs = 50 # number of epoch (i.e. number of iterations)
batch_size = 32 # mini batch size
lr = 0.001 # initial learning rate

# initialize optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9) # decays the learning rate of each parameter group by gamma every step_size epochs.

In [12]:
best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

for epoch_i in range(n_epochs):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    for idx in range(0,len(train_vecs),batch_size):
        # Step 0: Get the data
        x_data = torch.tensor(train_vecs[idx:idx+batch_size], dtype=torch.float)
        if x_data.shape[0] == 0: continue
        y_target = torch.tensor(train_labels[idx:idx+batch_size], dtype=torch.int64)

        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = model(x_data)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        dev_data = torch.tensor(dev_vecs, dtype=torch.float)
        dev_target = torch.tensor(dev_labels, dtype=torch.int64)
        dev_prediction = model(dev_data)
        pred_labels = [np.argmax(dp.numpy()) for dp in dev_prediction]
        pre, rec, f1, _ = precision_recall_fscore_support(dev_target, pred_labels, average='macro')
        print('\n---> after epoch {} the macro-f1 on dev set is {}'.format(epoch_i, f1))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best f1',f1)
            
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()
    


---> after epoch 0 the macro-f1 on dev set is 0.41890522047643186
learning rate 0.001
best model updated; new best f1 0.41890522047643186

---> after epoch 1 the macro-f1 on dev set is 0.46072138778872235
learning rate 0.001
best model updated; new best f1 0.46072138778872235

---> after epoch 2 the macro-f1 on dev set is 0.48078330190388874
learning rate 0.001
best model updated; new best f1 0.48078330190388874

---> after epoch 3 the macro-f1 on dev set is 0.46893978383942353
learning rate 0.001

---> after epoch 4 the macro-f1 on dev set is 0.4967415175678973
learning rate 0.001
best model updated; new best f1 0.4967415175678973

---> after epoch 5 the macro-f1 on dev set is 0.545107263023039
learning rate 0.001
best model updated; new best f1 0.545107263023039

---> after epoch 6 the macro-f1 on dev set is 0.5077161347612124
learning rate 0.001

---> after epoch 7 the macro-f1 on dev set is 0.5348631251886264
learning rate 0.001

---> after epoch 8 the macro-f1 on dev set is 0.566

In [13]:
# test on the test set

# load the best model weights
model.load_state_dict(best_model) 
test_vecs = np.array([vectorize_sent(word_vectors, ss) for ss in test_docs])

with torch.no_grad(): 
    model.eval()
    test_data = torch.tensor(test_vecs, dtype=torch.float)
    test_target = torch.tensor(test_labels, dtype=torch.int64)
    test_prediction = model(test_data)
    pred_labels = [np.argmax(dp.numpy()) for dp in test_prediction]
    pre, rec, f1, _ = precision_recall_fscore_support(test_target, pred_labels, average='macro')
    print('macro-f1 on test data', f1)

macro-f1 on test data 0.6538237226426551


In [55]:
train_ratio, dev_ratio, test_ratio = 0.6, 0.2, 0.2
train_docs = docs[:int(len(docs)*train_ratio)]
train_labels = labels[:int(len(docs)*train_ratio)]

dev_docs = docs[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]
dev_labels = labels[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]

test_docs = docs[-int(len(docs)*(test_ratio)):]
test_labels = labels[-int(len(docs)*(test_ratio)):]

print('train size {}, dev size {}, test size {}'.format(len(train_labels), len(dev_labels), len(test_labels)))

train size 6878, dev size 2293, test size 2292


## LSTM

In [56]:
labels_list = ['pos','neg']
import torch
import torch.nn as nn

class RNN_Classifier(nn.Module):
    def __init__(self, embd_dim, hidden_dim, model_type, cls_num, pooler_type, dropout, gpu):
        super(RNN_Classifier, self).__init__()
        assert model_type in ['rnn','lstm','bilstm','gru']
        assert pooler_type in ['max','avg']
        # rnn type
        if model_type == 'rnn':
            self.rnn = nn.RNN(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        elif model_type == 'lstm':
            self.rnn = nn.LSTM(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        elif model_type == 'bilstm':
            self.rnn = nn.LSTM(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, bidirectional=True, dropout=dropout)
        else: # model_type == 'gru'
            self.rnn = nn.GRU(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        # map from rnn output to logits
        if model_type == 'bilstm':
            self.fc = nn.Linear(2*hidden_dim, cls_num)
        else:
            self.fc = nn.Linear(hidden_dim, cls_num)
        # pooler type
        self.pooler_type = pooler_type
        # gpu or not
        self.gpu = gpu
        if gpu: self.to('cuda')
            
    def forward(self, input_matrix):
        token_num = input_matrix.shape[1]
        hidden_vecs = self.rnn(input_matrix)[0]
        if self.pooler_type == 'max':
            pooler = nn.MaxPool1d(token_num)
        else: 
            pooler = nn.AvgPool1d(token_num)
        if self.gpu: pooler.to('cuda')
        pooled_hidden = pooler(torch.transpose(hidden_vecs,1,2)).squeeze()
        return self.fc(pooled_hidden)

In [58]:
from nltk.tokenize import word_tokenize
import numpy as np
labels_list= ['non-propaganda', 'propaganda']
embd_dim = 300
hidden_dim = 300
rnn_type = 'bilstm'
pooler_type = 'avg'
dropout = 0.5
gpu = True

oov_vec = oov_vec = np.random.rand(embd_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    return np.array(vecs)

def build_mini_batch(sent_list, word_vectors):
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    largest_len = np.max([len(tokens) for tokens in tokenized_sents])
    text_vecs = []
    for ts in tokenized_sents:
        vv = get_sent_word_vecs(word_vectors, ts, largest_len)
        text_vecs.append(vv)
    # print('mini batch shape',np.array(text_vecs).shape)
    return np.array(text_vecs)

def make_batch_prediction(sent_list, word_vectors, model, use_gpu=True):
    batch = build_mini_batch(sent_list, word_vectors)
    batch_logits = torch.tensor([])
    if use_gpu: batch_logits = batch_logits.to('cuda')
    for i in range(batch.shape[0]):
        input_sents = torch.from_numpy(batch[i]).float()
        if use_gpu: input_sents = input_sents.to('cuda')
        logits = model(input_sents.unsqueeze(0))
        batch_logits = torch.cat( (batch_logits, logits) )
    return batch_logits.view(batch.shape[0],-1)
  
# sanity check 
model = RNN_Classifier(embd_dim, hidden_dim, rnn_type, len(labels_list), pooler_type, dropout, gpu)
batch_pred = make_batch_prediction(
    ['hello world!','hello','another test sentence this is'],
    word_vectors, model, gpu)
print(batch_pred)

  "num_layers={}".format(dropout, num_layers))


tensor([[ 0.0353, -0.0811],
        [ 0.0526, -0.0938],
        [-0.0293, -0.0376]], device='cuda:0', grad_fn=<ViewBackward>)


In [71]:
loss_fnc = torch.nn.CrossEntropyLoss() # cross entropy loss

# hyper parameters
n_epochs = 50 # number of epoch (i.e. number of iterations)
batch_size = 50
lr = 0.001 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.999) # after each epoch, the learning rate is discounted to its 95%

In [72]:
best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    for idx in range(0,len(train_docs),batch_size):
        # Step 0: Get the data
        sents = train_docs[idx:idx+batch_size]
        if len(sents) == 0: break
        y_target = torch.tensor([train_labels[idx:idx+batch_size]], dtype=torch.int64).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = make_batch_prediction(sents, word_vectors, model, gpu)
        pred_labels = [np.argmax(entry) for entry in y_pred.cpu().detach().numpy()]
        # print('pred labels', pred_labels)
        # print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        # print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()
        
        # Step 4+: clip the gradient, to avoid gradient explosion
        nn.utils.clip_grad_value_(model.parameters(), clip_value=3.)

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        predictions = []
        test_docs = dev_docs
        test_labels = dev_labels
        
        for idx in range(0,len(test_docs),batch_size):
            y_pred = make_batch_prediction(
                test_docs[idx:idx+batch_size], word_vectors, model, gpu)
            pred_labels = [np.argmax(entry) for entry in y_pred.cpu().detach().numpy()]
            predictions += pred_labels
        pre, rec, f1, _ = precision_recall_fscore_support(test_labels, predictions,average='macro')
        print('\n---> after epoch {} the macro-F1 on dev set is {}'.format(epoch_i, f1))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best macro-F1',f1)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()

  0%|          | 0/50 [00:00<?, ?it/s]




  2%|▏         | 1/50 [00:43<35:44, 43.76s/it]


---> after epoch 0 the macro-F1 on dev set is 0.6790821538489711
learning rate 0.001
best model updated; new best macro-F1 0.6790821538489711



  4%|▍         | 2/50 [01:27<34:59, 43.75s/it]


---> after epoch 1 the macro-F1 on dev set is 0.6782908274614954
learning rate 0.000999



  6%|▌         | 3/50 [02:13<34:41, 44.28s/it]


---> after epoch 2 the macro-F1 on dev set is 0.6797603562239285
learning rate 0.000998001
best model updated; new best macro-F1 0.6797603562239285



  8%|▊         | 4/50 [02:56<33:51, 44.16s/it]


---> after epoch 3 the macro-F1 on dev set is 0.6411568603863701
learning rate 0.000997002999



 10%|█         | 5/50 [03:41<33:12, 44.29s/it]


---> after epoch 4 the macro-F1 on dev set is 0.6491294849007614
learning rate 0.000996005996001



 12%|█▏        | 6/50 [04:31<33:39, 45.90s/it]


---> after epoch 5 the macro-F1 on dev set is 0.6596624534389918
learning rate 0.000995009990004999



 14%|█▍        | 7/50 [05:16<32:46, 45.73s/it]


---> after epoch 6 the macro-F1 on dev set is 0.6742952156548185
learning rate 0.000994014980014994



 16%|█▌        | 8/50 [06:03<32:12, 46.00s/it]


---> after epoch 7 the macro-F1 on dev set is 0.6618870920698706
learning rate 0.0009930209650349789



 18%|█▊        | 9/50 [06:47<31:10, 45.63s/it]


---> after epoch 8 the macro-F1 on dev set is 0.6331740723728738
learning rate 0.0009920279440699439



 20%|██        | 10/50 [07:33<30:23, 45.59s/it]


---> after epoch 9 the macro-F1 on dev set is 0.6586502597793511
learning rate 0.0009910359161258739



 22%|██▏       | 11/50 [08:18<29:27, 45.31s/it]


---> after epoch 10 the macro-F1 on dev set is 0.6719150583074991
learning rate 0.000990044880209748



 24%|██▍       | 12/50 [09:04<28:52, 45.58s/it]


---> after epoch 11 the macro-F1 on dev set is 0.6790704760724283
learning rate 0.0009890548353295382



 26%|██▌       | 13/50 [09:51<28:20, 45.95s/it]


---> after epoch 12 the macro-F1 on dev set is 0.7078331415168136
learning rate 0.0009880657804942088
best model updated; new best macro-F1 0.7078331415168136



 28%|██▊       | 14/50 [10:38<27:46, 46.29s/it]


---> after epoch 13 the macro-F1 on dev set is 0.6610761899844078
learning rate 0.0009870777147137145



 30%|███       | 15/50 [11:21<26:32, 45.51s/it]


---> after epoch 14 the macro-F1 on dev set is 0.6821604052111312
learning rate 0.0009860906369990009



 32%|███▏      | 16/50 [12:05<25:33, 45.10s/it]


---> after epoch 15 the macro-F1 on dev set is 0.656842931219534
learning rate 0.000985104546362002



 34%|███▍      | 17/50 [12:50<24:45, 45.01s/it]


---> after epoch 16 the macro-F1 on dev set is 0.6624148084430622
learning rate 0.00098411944181564



 36%|███▌      | 18/50 [13:34<23:46, 44.59s/it]


---> after epoch 17 the macro-F1 on dev set is 0.647422453212752
learning rate 0.0009831353223738242



 38%|███▊      | 19/50 [14:19<23:04, 44.65s/it]


---> after epoch 18 the macro-F1 on dev set is 0.6717951626319258
learning rate 0.0009821521870514505



 40%|████      | 20/50 [15:02<22:11, 44.39s/it]


---> after epoch 19 the macro-F1 on dev set is 0.6782805723607472
learning rate 0.000981170034864399



 42%|████▏     | 21/50 [15:47<21:29, 44.47s/it]


---> after epoch 20 the macro-F1 on dev set is 0.6601195869830296
learning rate 0.0009801888648295347



 44%|████▍     | 22/50 [16:30<20:28, 43.86s/it]


---> after epoch 21 the macro-F1 on dev set is 0.681331112732898
learning rate 0.000979208675964705



 46%|████▌     | 23/50 [17:14<19:49, 44.07s/it]


---> after epoch 22 the macro-F1 on dev set is 0.6805712008617039
learning rate 0.0009782294672887404



 48%|████▊     | 24/50 [17:57<18:55, 43.68s/it]


---> after epoch 23 the macro-F1 on dev set is 0.6201383464863389
learning rate 0.0009772512378214517



 50%|█████     | 25/50 [18:43<18:28, 44.34s/it]


---> after epoch 24 the macro-F1 on dev set is 0.6794505524211143
learning rate 0.0009762739865836303



 52%|█████▏    | 26/50 [19:31<18:14, 45.60s/it]


---> after epoch 25 the macro-F1 on dev set is 0.6577581444254349
learning rate 0.0009752977125970467



 54%|█████▍    | 27/50 [20:18<17:37, 45.97s/it]


---> after epoch 26 the macro-F1 on dev set is 0.6625399421748094
learning rate 0.0009743224148844496



 56%|█████▌    | 28/50 [21:08<17:18, 47.18s/it]


---> after epoch 27 the macro-F1 on dev set is 0.6685689025059294
learning rate 0.0009733480924695652



 58%|█████▊    | 29/50 [21:54<16:25, 46.91s/it]


---> after epoch 28 the macro-F1 on dev set is 0.6340830831958405
learning rate 0.0009723747443770956



 60%|██████    | 30/50 [22:40<15:31, 46.58s/it]


---> after epoch 29 the macro-F1 on dev set is 0.6542411100274601
learning rate 0.0009714023696327184



 62%|██████▏   | 31/50 [23:29<14:58, 47.31s/it]


---> after epoch 30 the macro-F1 on dev set is 0.653039011198007
learning rate 0.0009704309672630857



 64%|██████▍   | 32/50 [24:15<14:05, 46.99s/it]


---> after epoch 31 the macro-F1 on dev set is 0.6498245165876251
learning rate 0.0009694605362958226



 66%|██████▌   | 33/50 [25:02<13:17, 46.90s/it]


---> after epoch 32 the macro-F1 on dev set is 0.6361280084633695
learning rate 0.0009684910757595268



 68%|██████▊   | 34/50 [25:50<12:35, 47.19s/it]


---> after epoch 33 the macro-F1 on dev set is 0.6451139999527634
learning rate 0.0009675225846837673



 70%|███████   | 35/50 [26:35<11:36, 46.45s/it]


---> after epoch 34 the macro-F1 on dev set is 0.6817968754818455
learning rate 0.0009665550620990835



 72%|███████▏  | 36/50 [27:21<10:48, 46.35s/it]


---> after epoch 35 the macro-F1 on dev set is 0.6583771899801487
learning rate 0.0009655885070369844



KeyboardInterrupt: 

## CNN

In [61]:
from nltk.tokenize import word_tokenize
import numpy as np

word_vec_dim = 300 # make sure this number matches the embedding you use

# Out-of-vocabulary (OOV) words: words that are not included in the pre-trained embedding model
# There exist many ways to vectorize OOV words, e.g. use a random vector to represent all OOV words
# Feel free to search and employ other ways to vectorize OOV words
oov_vec = oov_vec = np.random.rand(word_vec_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    for i in range(largest_len-len(sent_words)):
        vecs.append([0.]*word_vec_dim)
    return np.array(np.transpose(vecs))

def build_mini_batch(sent_list, word_vectors):
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    largest_len = np.max([len(tokens) for tokens in tokenized_sents])
    text_vecs = []
    for ts in tokenized_sents:
        vv = get_sent_word_vecs(word_vectors, ts, largest_len)
        text_vecs.append(vv)
    # print('mini batch shape',np.array(text_vecs).shape)
    return np.array(text_vecs)
  
# sanity check 
build_mini_batch(['hello world!','HELLO','this is a long sentence!'], word_vectors)

array([[[-0.33712   , -0.25830999,  0.23726   ,  0.        ,
          0.        ,  0.        ],
        [-0.21691   ,  0.43643999, -0.46050999,  0.        ,
          0.        ,  0.        ],
        [-0.0066365 , -0.1138    ,  0.07555   ,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.40558001,  0.081697  ,  0.086256  ,  0.        ,
          0.        ,  0.        ],
        [ 0.18073   , -0.0044191 ,  0.16498999,  0.        ,
          0.        ,  0.        ],
        [ 0.64249998, -0.14102   ,  0.60500002,  0.        ,
          0.        ,  0.        ]],

       [[-0.33712   ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [-0.21691   ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [-0.0066365 ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.40558001,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
  

In [62]:
import numpy as np
import torch
import torch.nn as nn

class CNN_Clf(nn.Module):
    def __init__(self, embd_dim, filter_size_list, filter_num_list, class_num, dp_rate=0.5, gpu=True):
        super(CNN_Clf, self).__init__()
        self.embd_dim = embd_dim
        assert len(filter_size_list) == len(filter_num_list)
        self.output_dim = class_num
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dp_rate)
        self.fc = nn.Linear(np.sum(filter_num_list), class_num)
        self.gpu = gpu
        self.convs = self.build_convs(filter_size_list, filter_num_list, gpu)
        if self.gpu:
            self.to('cuda')
            
    def build_convs(self, f_sizes, f_nums, gpu):
        convs = nn.ModuleList()
        for fs, fn in zip(f_sizes, f_nums):
            padding_size = fs-1
            m = nn.Conv1d(self.embd_dim, fn, fs, padding=padding_size)
            if gpu: m.to('cuda')
            convs.append(m)
        return convs
        
    def get_conv_output(self, input_matrix, conv, gpu):
        # step 1: compute convolution 
        assert input_matrix.shape[1] == self.embd_dim
        conv_output = conv(input_matrix)
        # step 2: pass through an activation function 
        conv_relu = self.tanh(conv_output)
        # step 3: max-over-time pooling
        maxp = nn.MaxPool1d(conv_relu.shape[2])
        maxp_output = maxp(conv_relu)
        return maxp_output
       
    def forward(self, all_text_vectors):
        cnn_repr = torch.tensor([])
        if self.gpu: cnn_repr = cnn_repr.to('cuda')
        for cv in self.convs:
            cv_output = self.get_conv_output(all_text_vectors, cv, self.gpu)
            cnn_repr = torch.cat((cnn_repr, cv_output), dim=1)
        # print(cnn_repr.shape)
        after_dp = self.dropout(cnn_repr.squeeze())
        logit = self.fc(after_dp)
        # the CrossEntropyLoss provided by pytorch includes softmax; so you do not need to include a softmax layer in your net
        return logit

In [68]:
dropout_rate = 0.5 # dropout rate
filter_sizes = [2,3,4]
filter_nums = [100]*len(filter_sizes)

gpu = True # whether use gpu to accelerate the training
model = CNN_Clf(word_vec_dim, filter_sizes, filter_nums, len(labels_list), dropout_rate, gpu)
loss_fnc = torch.nn.CrossEntropyLoss() # cross entropy loss

# hyper parameters
n_epochs = 50 # number of epoch (i.e. number of iterations) # tried 10
batch_size = 100 # earlier 50
lr = 0.001 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95) # after each epoch, the learning rate is discounted to its 95%

In [69]:
from sklearn.metrics import precision_recall_fscore_support
with torch.no_grad(): # let pytorch know that no gradient should be computed
    model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
    dev_predictions = []
    for idx in range(0,len(dev_docs),batch_size):
        x_data = build_mini_batch(dev_docs[idx:idx+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        # print(x_data.shape)
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        if gpu:
            x_tensor = x_tensor.to('cuda')
        y_pred = model(x_tensor).cpu().detach().numpy()
        # print(y_pred)
        pred_labels = [np.argmax(entry) for entry in y_pred]
        # print(pred_labels)
        dev_predictions += pred_labels
    pre, rec, f1, _ = precision_recall_fscore_support(dev_labels, dev_predictions,average='macro')
    print('\n---> macro-F1 on dev set is {}'.format(f1))


---> macro-F1 on dev set is 0.22392575159822461


In [75]:
best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    for idx in range(0,len(train_docs),batch_size):
        # Step 0: Get the data
        x_data = build_mini_batch(train_docs[idx:idx+batch_size], word_vectors)
        if x_data.shape[0] == 0: continue # to avoid empty batch
        y_target = torch.tensor([train_labels[idx:idx+batch_size]], dtype=torch.int64).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        x_tensor = torch.tensor(x_data, dtype=torch.float)
        if gpu:
            x_tensor = x_tensor.to('cuda')
        y_pred = model(x_tensor)
        pred_labels = [np.argmax(entry) for entry in y_pred.cpu().detach().numpy()]
        # print('pred labels', pred_labels)
        # print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        # print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        dev_predictions = []
        for idx in range(0,len(dev_docs),batch_size):
            x_data = build_mini_batch(dev_docs[idx:idx+batch_size], word_vectors)
            if x_data.shape[0] == 0: continue # to avoid empty batch
            x_tensor = torch.tensor(x_data, dtype=torch.float)
            if gpu:
                x_tensor = x_tensor.to('cuda')
            y_pred = model(x_tensor).cpu().detach().numpy()
            pred_labels = [np.argmax(entry) for entry in y_pred]
            dev_predictions += pred_labels
            # print(pred_labels)
        pre, rec, f1, _ = precision_recall_fscore_support(dev_labels, dev_predictions,average='macro')
        print('\n---> after epoch {} the macro-F1 on dev set is {}'.format(epoch_i, f1))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best macro-F1',f1)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()


  0%|          | 0/50 [00:00<?, ?it/s][A





  2%|▏         | 1/50 [00:12<10:11, 12.48s/it][A


---> after epoch 0 the macro-F1 on dev set is 0.6891445274803374
learning rate 0.0009175547491935324
best model updated; new best macro-F1 0.6891445274803374




  4%|▍         | 2/50 [00:24<09:47, 12.25s/it][A


---> after epoch 1 the macro-F1 on dev set is 0.660178282801715
learning rate 0.0009166371944443389




  6%|▌         | 3/50 [00:36<09:40, 12.35s/it][A


---> after epoch 2 the macro-F1 on dev set is 0.6793825270463361
learning rate 0.0009157205572498945




  8%|▊         | 4/50 [00:49<09:32, 12.44s/it][A


---> after epoch 3 the macro-F1 on dev set is 0.6711881640280652
learning rate 0.0009148048366926446




 10%|█         | 5/50 [01:02<09:25, 12.56s/it][A


---> after epoch 4 the macro-F1 on dev set is 0.6906125192577546
learning rate 0.000913890031855952
best model updated; new best macro-F1 0.6906125192577546




 12%|█▏        | 6/50 [01:16<09:29, 12.94s/it][A


---> after epoch 5 the macro-F1 on dev set is 0.6739367621143553
learning rate 0.000912976141824096




 14%|█▍        | 7/50 [01:29<09:25, 13.15s/it][A


---> after epoch 6 the macro-F1 on dev set is 0.6617372044016039
learning rate 0.0009120631656822719




 16%|█▌        | 8/50 [01:43<09:17, 13.28s/it][A


---> after epoch 7 the macro-F1 on dev set is 0.6836387980979728
learning rate 0.0009111511025165896




 18%|█▊        | 9/50 [01:58<09:25, 13.80s/it][A


---> after epoch 8 the macro-F1 on dev set is 0.6786795001199646
learning rate 0.000910239951414073




 20%|██        | 10/50 [02:12<09:15, 13.89s/it][A


---> after epoch 9 the macro-F1 on dev set is 0.6766986985423395
learning rate 0.0009093297114626589




 22%|██▏       | 11/50 [02:25<08:48, 13.56s/it][A


---> after epoch 10 the macro-F1 on dev set is 0.6647767747971888
learning rate 0.0009084203817511963




 24%|██▍       | 12/50 [02:38<08:36, 13.60s/it][A


---> after epoch 11 the macro-F1 on dev set is 0.6798763462305544
learning rate 0.000907511961369445




 26%|██▌       | 13/50 [02:53<08:38, 14.02s/it][A


---> after epoch 12 the macro-F1 on dev set is 0.6700476344618269
learning rate 0.0009066044494080756




 28%|██▊       | 14/50 [03:07<08:15, 13.77s/it][A


---> after epoch 13 the macro-F1 on dev set is 0.6747162810999563
learning rate 0.0009056978449586675




 30%|███       | 15/50 [03:19<07:48, 13.40s/it][A


---> after epoch 14 the macro-F1 on dev set is 0.6707611524067221
learning rate 0.0009047921471137089




 32%|███▏      | 16/50 [03:31<07:21, 12.98s/it][A


---> after epoch 15 the macro-F1 on dev set is 0.6662470545194195
learning rate 0.0009038873549665952




 34%|███▍      | 17/50 [03:44<07:07, 12.95s/it][A


---> after epoch 16 the macro-F1 on dev set is 0.6743320170792
learning rate 0.0009029834676116286




 36%|███▌      | 18/50 [03:57<06:53, 12.93s/it][A


---> after epoch 17 the macro-F1 on dev set is 0.6721977101427257
learning rate 0.000902080484144017




 38%|███▊      | 19/50 [04:10<06:38, 12.86s/it][A


---> after epoch 18 the macro-F1 on dev set is 0.6730507298266677
learning rate 0.000901178403659873




 40%|████      | 20/50 [04:22<06:20, 12.67s/it][A


---> after epoch 19 the macro-F1 on dev set is 0.6633014827371916
learning rate 0.0009002772252562131




 42%|████▏     | 21/50 [04:34<06:05, 12.60s/it][A


---> after epoch 20 the macro-F1 on dev set is 0.6558879150298125
learning rate 0.0008993769480309569




 44%|████▍     | 22/50 [04:47<05:53, 12.63s/it][A


---> after epoch 21 the macro-F1 on dev set is 0.6767019925010072
learning rate 0.0008984775710829259




 46%|████▌     | 23/50 [05:00<05:40, 12.62s/it][A


---> after epoch 22 the macro-F1 on dev set is 0.6810669644988718
learning rate 0.000897579093511843




 48%|████▊     | 24/50 [05:12<05:28, 12.62s/it][A


---> after epoch 23 the macro-F1 on dev set is 0.694313418732037
learning rate 0.0008966815144183311
best model updated; new best macro-F1 0.694313418732037




 50%|█████     | 25/50 [05:24<05:13, 12.53s/it][A


---> after epoch 24 the macro-F1 on dev set is 0.6892315241947349
learning rate 0.0008957848329039128




 52%|█████▏    | 26/50 [05:37<04:58, 12.45s/it][A


---> after epoch 25 the macro-F1 on dev set is 0.6933689932846885
learning rate 0.0008948890480710088




 54%|█████▍    | 27/50 [05:49<04:43, 12.31s/it][A


---> after epoch 26 the macro-F1 on dev set is 0.6680765706722999
learning rate 0.0008939941590229378




 56%|█████▌    | 28/50 [06:01<04:29, 12.27s/it][A


---> after epoch 27 the macro-F1 on dev set is 0.6786795001199646
learning rate 0.0008931001648639148




 58%|█████▊    | 29/50 [06:14<04:21, 12.43s/it][A


---> after epoch 28 the macro-F1 on dev set is 0.6606237824561685
learning rate 0.0008922070646990509




 60%|██████    | 30/50 [06:26<04:08, 12.43s/it][A


---> after epoch 29 the macro-F1 on dev set is 0.6601199713181862
learning rate 0.0008913148576343518




 62%|██████▏   | 31/50 [06:39<03:56, 12.43s/it][A


---> after epoch 30 the macro-F1 on dev set is 0.6703689073943366
learning rate 0.0008904235427767174




 64%|██████▍   | 32/50 [06:52<03:47, 12.65s/it][A


---> after epoch 31 the macro-F1 on dev set is 0.6797784463929883
learning rate 0.0008895331192339407




 66%|██████▌   | 33/50 [07:06<03:41, 13.06s/it][A


---> after epoch 32 the macro-F1 on dev set is 0.6818009209904052
learning rate 0.0008886435861147067




 68%|██████▊   | 34/50 [07:19<03:27, 12.99s/it][A


---> after epoch 33 the macro-F1 on dev set is 0.6650113089832845
learning rate 0.000887754942528592




 70%|███████   | 35/50 [07:31<03:13, 12.87s/it][A


---> after epoch 34 the macro-F1 on dev set is 0.6699879720890454
learning rate 0.0008868671875860634




 72%|███████▏  | 36/50 [07:46<03:08, 13.44s/it][A


---> after epoch 35 the macro-F1 on dev set is 0.6800930435488454
learning rate 0.0008859803203984774




 74%|███████▍  | 37/50 [08:00<02:58, 13.76s/it][A


---> after epoch 36 the macro-F1 on dev set is 0.6743269160990681
learning rate 0.0008850943400780789




 76%|███████▌  | 38/50 [08:14<02:42, 13.55s/it][A


---> after epoch 37 the macro-F1 on dev set is 0.6869531159660857
learning rate 0.0008842092457380008




 78%|███████▊  | 39/50 [08:26<02:25, 13.19s/it][A


---> after epoch 38 the macro-F1 on dev set is 0.6676027054004464
learning rate 0.0008833250364922628




 80%|████████  | 40/50 [08:39<02:11, 13.15s/it][A


---> after epoch 39 the macro-F1 on dev set is 0.6886769608170553
learning rate 0.0008824417114557706




 82%|████████▏ | 41/50 [08:52<01:56, 13.00s/it][A


---> after epoch 40 the macro-F1 on dev set is 0.6879662460251472
learning rate 0.0008815592697443149




 84%|████████▍ | 42/50 [09:04<01:43, 12.92s/it][A


---> after epoch 41 the macro-F1 on dev set is 0.6793646572849161
learning rate 0.0008806777104745705




 86%|████████▌ | 43/50 [09:17<01:29, 12.80s/it][A


---> after epoch 42 the macro-F1 on dev set is 0.6931743061366736
learning rate 0.000879797032764096




 88%|████████▊ | 44/50 [09:31<01:18, 13.08s/it][A


---> after epoch 43 the macro-F1 on dev set is 0.6929331514056518
learning rate 0.0008789172357313319




 90%|█████████ | 45/50 [09:44<01:05, 13.18s/it][A


---> after epoch 44 the macro-F1 on dev set is 0.6766799363383911
learning rate 0.0008780383184956006




 92%|█████████▏| 46/50 [09:59<00:55, 13.86s/it][A


---> after epoch 45 the macro-F1 on dev set is 0.6815400595812876
learning rate 0.000877160280177105




 94%|█████████▍| 47/50 [10:14<00:42, 14.02s/it][A


---> after epoch 46 the macro-F1 on dev set is 0.671113023522662
learning rate 0.0008762831198969279




 96%|█████████▌| 48/50 [10:26<00:27, 13.58s/it][A


---> after epoch 47 the macro-F1 on dev set is 0.6770977464785093
learning rate 0.000875406836777031




 98%|█████████▊| 49/50 [10:39<00:13, 13.37s/it][A


---> after epoch 48 the macro-F1 on dev set is 0.6834390312032406
learning rate 0.000874531429940254




100%|██████████| 50/50 [10:52<00:00, 13.05s/it][A


---> after epoch 49 the macro-F1 on dev set is 0.6877554935184891
learning rate 0.0008736568985103137





## SAVE YOUR TRAINED MODEL
After you have obtained the best model, save your trained model and other necessary components to a file. The markers will load your model from the saved file and test your trained model on some held-out test data. Make sure that you have included all necessary files to re-run your model. **The markers will NOT re-run your code to train your model; instead, they will directly use your trained model to run the test**. 

Below is the sample code for saving the model and other necessary components, using the *pickle* package in Python. *You should adjust the code to save all necessary components for re-running your model.*

In [2]:
import pickle

# save model and other necessary components of your model
# DO NOT include the embedding files in your submission

all_info_want_to_save = {
    'input_dim': word_vec_dim,
    'dropout_rate': dropout_rate,
    'neural_weights': best_model,
    'oov_vector': oov_vec
    'class_num' : class_num
}
save_path = open("cw2_sample_saved_file.pickle","wb")
pickle.dump(all_info_want_to_save, save_path)
save_path.close()

SyntaxError: invalid syntax (<ipython-input-2-7e810a48bae3>, line 11)

### REPORT :

#### Please note that some analysis/experiments were performed in tensorflow and the tf code used is commented below. My final model and results are saved in pytorch.To provide the answers below I have used many resources online from pytorch website,tensorflow website, stackoverflow, and excerpts from medium articles on the relevant subject matter.

#### 1.How you use the data to develop your model, e.g. how to split the data into train/dev/test sets, and how you clean/normalize the data;

##### This data is class imbalanced. To split this data we have few options :
##### 1. Under sampling : Reducing the number of samples in consideration from the class with greater number of samples. An aspect to be careful about while undersampling is not losing critical information.
######   This can be randomized and aggregated as well(similar to ensemble models) for consistency. From my observation in tensoflow and an attempt at using the imblearn package, undersampling does marginally better many times.

###### 2. Over sampling : Oversampling can involve using multiple techniques, one way to do it is sampling the minority class more than one time or using synthetic oversampling. Synthetic oversampling is create an artificial data point by taking the vector between one of those k neighbors, and the current data point. Multiply this vector by a random number x which lies between 0 and 1.This vector results in the new data point being created.

###### I have settled for the 60/20/20 split of the dataset in the interest of time.Also, I noticed that oversampling does not increase performance greatly and undersampling could result in loss of important data. There is a lot of conditions to permutate and I decided to try to make the model better with existing conditions for this assignment.

###### With respect to cleaning and normalization, I have tried both stemming,lemmatization with stop words, punctuation, numbers(if any ) removal for the machine learning algorithm.
###### For the neural network I have not cleaned the data much since the aim of the architecture is sequence modelling.


#### 2.Which techniques or embeddings you have used to represent the texts and why you choose the use them ?

##### I have used the Glove embedding. Glove embeddings are word based models, they take in words as inputs and provide word embedded vectors as ouput.Glove embedding is very good at capturing the semantics of analogy, it can capture a more general flavour of the context.
##### When compared with word2vec, word2vec does not have any explicit global information embedded in it by default. GloVe creates a global co-occurrence matrix by estimating the probability a given word will co-occur with other words.I have retained the default embedding(Glove) for the very same reason.

##### Each row of the matrix represents a word, while each column represents the contexts that words can appear in. The matrix values represent the frequency a word appears in a given context. Then, dimensionality reduction/mapping is applied to this matrix to create the resulting embedding matrix (each row will be a word’s embedding vector).



#### 3.Which neural architecture(s) you have used to develop the classifier and why you choose to use them? 4.Which techniques (e.g. optimizers, regularization mechanisms, hyper-parameter tuning tricks) you have used to train the neural model and why you choose to use them.

##### Answering q3 and q4 :

##### I have tried 3 neural architectures :
##### 1. Simple layered architecture with two linear layers with dropout and relu activation layer.
##### 2. LSTM
##### 3. CNN 

##### In the first neural architecture used : 
        #self.hidden_layer = nn.Linear(input_dim, input_dim*2)
        #self.output_layer = nn.Linear(input_dim*2, out_dim)
        #self.dropout = nn.Dropout(dp_rate)
        #self.relu = torch.nn.ReLU()

##### nn :A fully-connected ReLU network with one hidden layer, trained to predict y from x by minimizing squared Euclidean distance.This implementation uses the nn package from PyTorch to build the network. PyTorch autograd makes it easy to define computational graphs and take gradients, but raw autograd can be a bit too low-level for defining complex neural networks; this is where the nn package can help. The nn package defines a set of Modules, which you can think of as a neural network layer that has produces output from input and may have some trainable weights.

##### Linear layer: A linear operation in which every input is connected to every output by a weight (so there are n_inputs * n_outputs weights. Generally followed by a non-linear activation function. ie: Applies a linear transformation to the incoming data, i.e. //y= Ax+b//. The input tensor given in forward(input) must be either a vector (1D tensor) or matrix (2D tensor).

##### Hidden layer role: If the input is a matrix, then each row is assumed to be an input sample of given batch.The module automatically creates the weight and bias tensors which we'll use in the forward method. You can access the weight and bias tensors once the network once it's create at net.hidden.weight and net.hidden.bias.

##### Dropout layer: In passing 0.5, every hidden unit (neuron) is set to 0 with a probability of 0.5.There’s a 50% change that the output of a given neuron will be forced to 0. Dropout can help a model generalize by randomly setting the output for a given neuron to 0. In setting the output to 0, the cost function becomes more sensitive to neighbouring neurons changing the way the weights will be updated during the process of backpropagation.

##### Relu layer: The ReLU is a function with the form of y=max(0,x). The layer applies the function to all  


##### In the second neural architecture used : LSTM

##### embd_dim = 300
##### hidden_dim = 300
##### rnn_type = 'bilstm'
##### pooler_type = 'avg'
##### dropout = 0.5

##### embed_dim – total dimension of the model.
##### num_heads – parallel attention heads.
##### dropout – a Dropout layer on attn_output_weights. Default: 0.0.
##### bias – add bias as module parameter. Default: True.

##### Optimizer used -  Adam optimizer :                                                                                                Adaptive Moment Estimation (Adam) [14] is another method that computes adaptive learning rates for each parameter. In addition to storing an exponentially decaying average of past squared gradients like Adadelta and RMSprop, Adam also keeps an exponentially decaying average of past gradients, similar to momentum. Whereas momentum can be seen as a ball running down a slope, Adam behaves like a heavy ball with friction, which thus prefers flat minima in the error surface.

#### Very little hyperparameter tuning tried while working with this algorithm.

##### In the second neural architecture used : CNN 

n_epochs = 50 # number of epoch (i.e. number of iterations) # tried 10
batch_size = 100 # earlier 50
lr = 0.001 # initial learning rate
Adam optimizer used

Hyperparameters played with is number of epochs, batch size and learning rate. After playing around with these parameters, I have settled on using 50 epochs. Initially when I tried using 20 epochs with 50 sample batch size, There was very slow increase in f1 score. I have also looked at the risk of overfitting by increasing the number of epochs and batch_size. After observing the learning rate and judging the nature of the dataset I felt more number of epochs might learn the data a lot more. I also looked at the stability of the algorithm with multiple iterations. 


#### I have finally used the model that has the best F1 score, displayed stability in multiple trials does not show indicators of overfitting or underfitting. I have picked CNN as the algorithm of choice





#### 5.How you compare and analyze the performance of your developed models

As mentioned above. The performance criteria were as follows :

1. Look at macro F1 score since the classes are imbalanced.
2. Since the weights are initialized randomly - repeat for consistency. 
3. Observe effect of fine tuning hyper parameters
4. Observe effect of data cleaning

Clearly the neural network performs better than the machine learning model for this type of data where the data is more imbalanced and the differences between the classes are harder to detect and requires sequences or convolutional networks to identify multiple patterns






In [None]:

# train_ratio, dev_ratio, test_ratio = 0.6, 0.2, 0.2
# train_docs = docs[:int(len(docs)*train_ratio)]
# train_labels = labels[:int(len(docs)*train_ratio)]
# train_labels_text = raw_labels[:int(len(docs)*train_ratio)]

# dev_docs = docs[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]
# dev_labels = labels[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]

# test_docs = docs[-int(len(docs)*(test_ratio)):]
# test_labels = labels[-int(len(docs)*(test_ratio)):]

# print(vocab_size = 40000
# oov_tok = '<OOV>'
# tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# tokenizer.fit_on_texts(train_docs)
# word_index = tokenizer.word_index
# dict(list(word_index.items())[0:10])'train size {}, dev size {}, test size {}'.format(len(train_labels), len(dev_labels), len(test_labels)))
# train_sequences = tokenizer.texts_to_sequences(train_docs)
# train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# train_padded_new = train_padded.reshape(1, train_padded.shape[0], train_padded.shape[1])
# validation_sequences = tokenizer.texts_to_sequences(dev_docs)
# validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# val_padded_new = validation_padded.reshape(1, validation_padded.shape[0], validation_padded.shape[1])
# label_tokenizer = Tokenizer()
# label_tokenizer.fit_on_texts(train_labels_text)

# training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels_text))
# validation_label_seq = np.array(label_tokenizer.texts_to_sequences(dev_labels_text))
# embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# hub_layer = hub.KerasLayer(embedding, input_shape=[], 
#                            dtype=tf.string, trainable=True)
# hub_layer(train_docs)
# model = tf.keras.Sequential()
# model.add(hub_layer)
# # model.add(tf.keras.layers.Flatten())
# model.add(tf.keras.layers.Dense(16, activation='relu'))
# model.add(tf.keras.layers.Dense(1))

# model.summary()
# from tensorflow.keras.backend import backend as K

# def recall_m(y_true, y_pred):
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#     recall = true_positives / (possible_positives + K.epsilon())
#     return recall

# def precision_m(y_true, y_pred):
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     precision = true_positives / (predicted_positives + K.epsilon())
#     return precision

# def f1_score(y_true, y_pred):
#     precision = precision_m(y_true, y_pred)
#     recall = recall_m(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall+K.epsilon()))
# from sklearn.model_selection import train_test_split
# import tensorflow as tf
# from tensorflow import keras
# import numpy as np

# def create_f1():
#     def f1_function(y_true, y_pred):
#         y_pred_binary = tf.where(y_pred>=0.5, 1., 0.)
#         tp = tf.reduce_sum(y_true * y_pred_binary)
#         predicted_positives = tf.reduce_sum(y_pred_binary)
#         possible_positives = tf.reduce_sum(y_true)
#         return tp, predicted_positives, possible_positives
#     return f1_function


# class F1_score(keras.metrics.Metric):
#     def __init__(self, **kwargs):
#         super().__init__(**kwargs) # handles base args (e.g., dtype)
#         self.f1_function = create_f1()
#         self.tp_count = self.add_weight("tp_count", initializer="zeros")
#         self.all_predicted_positives = self.add_weight('all_predicted_positives', initializer='zeros')
#         self.all_possible_positives = self.add_weight('all_possible_positives', initializer='zeros')

#     def update_state(self, y_true, y_pred,sample_weight=None):
#         tp, predicted_positives, possible_positives = self.f1_function(y_true, y_pred)
#         self.tp_count.assign_add(tp)
#         self.all_predicted_positives.assign_add(predicted_positives)
#         self.all_possible_positives.assign_add(possible_positives)

#     def result(self):
#         precision = self.tp_count / self.all_predicted_positives
#         recall = self.tp_count / self.all_possible_positives
#         f1 = 2*(precision*recall)/(precision+recall)
#         return f1

# X = np.random.random(size=(1000, 10))     
# Y = np.random.randint(0, 2, size=(1000,))
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# # model = tf.keras.Sequential()
# # model.add(hub_layer)
# # model.add(tf.keras.layers.Dense(16, activation='relu'))
# # model.add(tf.keras.layers.Dense(1))

# model.compile(optimizer='adam',
#               loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               metrics=[F1_score()])
# history = model.fit(train_padded.reshape(-1),
#                     epochs=20,
# #                     validation_data=validation_padded.reshape(-1),
#                     verbose=1)
