# **NLP- Homework 2**
*Ofir Haim ID 213496110*

*Maor Dikter ID 214169377*

### **Imports**

In [115]:
!pip uninstall gensim
!pip install --upgrade gensim==4.2.0
#we had a problem because the version wasn't the new one so we uninstalled it and downloaded the new one

Found existing installation: gensim 4.2.0
Uninstalling gensim-4.2.0:
  Would remove:
    /usr/local/lib/python3.8/dist-packages/gensim-4.2.0.dist-info/*
    /usr/local/lib/python3.8/dist-packages/gensim/*
Proceed (y/n)? y
  Successfully uninstalled gensim-4.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.2.0
  Using cached gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
Successfully installed gensim-4.2.0


In [116]:
# Data processing and visualization
import pandas as pd
import numpy as np
import csv, re, tqdm

# sklearn
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score

# PyTorch
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

# Gensim
import gensim
from gensim.models import Word2Vec
from gensim import downloader

# Imbalanced learning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Pickle
import pickle

In [117]:
torch.manual_seed(0)
np.random.seed(0)


### **Load our data**

In [118]:
# for convenience reasons we save the paths for every file here
TRAIN_PATH = './data/train.tagged'
DEV_PATH = './data/dev.tagged'
TEST_PATH = './data/test.untagged'

In [119]:
model = downloader.load('glove-twitter-200') # download glove

### **Extracting relevant data from a file**

In [120]:
def get_X_Y(data_path):
  """
  the function gets the processed corpus and extract from it all the words and gets their
  word2vec representation. It also gets the matching tag values for each word and
  return the words embedding and their representation.
  """
  vecs = []
  relevant_idx = []
  for idx, word in enumerate(data_path):
    if model.has_index_for(word):
      vecs.append(model.get_vector(word))
      relevant_idx.append(idx)
    elif model.has_index_for(word.lower()):
      vecs.append(model.get_vector(word.lower()))
      relevant_idx.append(idx)

  x = np.array(vecs)
  # Note that y = labels[relevant_idx] when exists
  return x, relevant_idx

### **Writing results into file**

In [121]:
def write_results_to_file(original_path, new_path, words, y_preds):
    """
    This function writes the predictions of the different models on a new file,
    following the required guidelines of the file.
    """
    src_f = open(original_path, 'r')
    lines = np.array(src_f.readlines())
    # Check for spaces in the original file
    source_file_spaces_idx = np.where(lines == '\n')[0]
    
    f = open(new_path, 'w', encoding="utf-8")
    y_preds_file = np.where(y_preds == 0, 'O', y_preds)
    
    idx = 0
    for word, pred in zip(words, y_preds_file):
        if idx in source_file_spaces_idx:
            f.write('\n') # Add spaces to our file
            idx += 1
        f.write('{}\t{}\n'.format(str(word), pred))
        idx += 1
    if idx in source_file_spaces_idx:
        f.write('\n')
        idx += 1

## **Process the files**

### **Process the Train**

In [122]:
# Load training data
train_data_raw = pd.read_csv(TRAIN_PATH, sep='\t', header=None, quoting=csv.QUOTE_NONE)
train_data_raw.columns = ['words', 'tags']
# Pereprocess corpus
train_corpus = train_data_raw['words'].to_numpy()
# Remove non alphanumerical chars
processed_train_corpus = np.array([re.sub(r'[^a-zA-Z0-9]', '', str(word)) for word in train_corpus])
train_labels = train_data_raw['tags'].to_numpy()
train_labels = np.where(train_labels == 'O', 0, 1)


In [123]:
X_train, relevant_idx_train = get_X_Y(processed_train_corpus)
y_train = train_labels[relevant_idx_train]

### **Process the Dev**

In [124]:
# read the dev file and save the words, tags in numpy arrays for future use
dev = pd.read_csv(DEV_PATH, sep='\t', header=None, quoting=csv.QUOTE_NONE) 
dev.columns = ['words', 'tags']
dev_corpus = dev['words'].to_numpy(dtype=str)
# Remove non alphanumerical chars
processed_dev_corpus = np.array([re.sub(r'[^a-zA-Z0-9]', '', word) for word in dev_corpus]) 
dev_tags = dev['tags'].to_numpy()
dev_tags = np.where(dev_tags == 'O', 0, 1) # we do binary classification between 'o' and the other tags

In [125]:
x_dev, relevant_idx_dev = get_X_Y(processed_dev_corpus)
y_dev = dev_tags[relevant_idx_dev]

### **Process the Test**

In [126]:
# Prepare the test-set
test_data_raw = pd.read_csv(TEST_PATH, sep='\t', header=None, quoting=csv.QUOTE_NONE)
test_data_raw.columns = ['words']
test_corpus = test_data_raw['words'].to_numpy(dtype=str)
processed_test_corpus = np.array([re.sub(r'[^a-zA-Z0-9]', '', word) for word in test_corpus]) # remove non alphanumerical chars

In [127]:
x_test, relevant_idx_test = get_X_Y(processed_test_corpus)

## **Model1 - SVM**

In [128]:
linear_model = SVC()
#train the SVM model on the train file
linear_model.fit(X_train, y_train) 

SVC()

### **Model1 - SVM test on train**

In [129]:
# Model prediction on train
pred = linear_model.predict(X_train) 
y_train_pred_svm = np.zeros(train_labels.size, dtype=int)
# Prediction on all tags- not just the relevant ones
y_train_pred_svm[relevant_idx_train] = pred
# F1 score calculation
f1 = f1_score(train_labels, y_train_pred_svm)
print("Train: the f1 score is: ", f1)

Train: the f1 score is:  0.8358870967741935


### **Model1 - SVM test on dev**

In [130]:
# Model prediction on dev
pred = linear_model.predict(x_dev) 
y_dev_pred_svm = np.zeros(dev_tags.size, dtype=int)
# Prediction on all tags- not just the relevant ones
y_dev_pred_svm[relevant_idx_dev] = pred
# F1 score calculation
f1 = f1_score(dev_tags, y_dev_pred_svm)
print("Dev: the f1 score is: ", f1) 

Dev: the f1 score is:  0.5515320334261838


In [131]:
con_mat = confusion_matrix(dev_tags, y_dev_pred_svm)
con_mat

array([[14434,    49],
       [  756,   495]])

## **Model2 - NN**


In [132]:
class FFNetwork(nn.Module):
    def __init__(self):
        super(FFNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(200, 80),
            nn.ReLU(),
            nn.Linear(80, 20),
            nn.ReLU(),
            nn.Linear(20, 1))
        self.prediction = nn.Sigmoid()

    def forward(self, x):
        x = self.fc(x)
        return self.prediction(x)

### **Train the FFN**


In [133]:
# Batch size and number of epochs
batch_size = 64
epochs = 10
# Loading the dataset
dataset = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))
trainloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Choose gpu if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
# Connect the nn to the cpu
ff = FFNetwork().to(device)
# Defining out loss- Binary Cross Entropy 
loss_function = nn.BCELoss()
#optimizer = optim.Adam(ff.parameters())
# We chose stochastic gradiant decent as the optimization method
optimizer = torch.optim.SGD(ff.parameters(), lr = 0.03) 


In [134]:
for e in range(epochs):
    train_loss = 0.0
    for data, labels in tqdm.tqdm(trainloader):
        # Transfer Data to GPU if available
        #if torch.cuda.is_available():
            # data, labels = data.cuda(), labels.cuda()
        data = data.to(device)
        labels = labels.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        target = ff.forward(data)
        # Find the Loss
        loss = loss_function(target,labels.view(-1,1))
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()
    print(f'Epoch {e+1} \t\t Training Loss: {train_loss / len(trainloader)}')

100%|██████████| 801/801 [00:00<00:00, 969.42it/s]


Epoch 1 		 Training Loss: 0.20436863150080195


100%|██████████| 801/801 [00:00<00:00, 948.57it/s]


Epoch 2 		 Training Loss: 0.13451913989075784


100%|██████████| 801/801 [00:00<00:00, 991.04it/s] 


Epoch 3 		 Training Loss: 0.12312882672413979


100%|██████████| 801/801 [00:00<00:00, 968.53it/s]


Epoch 4 		 Training Loss: 0.11856653993616761


100%|██████████| 801/801 [00:00<00:00, 1007.18it/s]


Epoch 5 		 Training Loss: 0.11545220279389441


100%|██████████| 801/801 [00:00<00:00, 1005.78it/s]


Epoch 6 		 Training Loss: 0.11299492968025428


100%|██████████| 801/801 [00:00<00:00, 988.68it/s] 


Epoch 7 		 Training Loss: 0.11070808804199825


100%|██████████| 801/801 [00:00<00:00, 1030.34it/s]


Epoch 8 		 Training Loss: 0.10841988230568267


100%|██████████| 801/801 [00:00<00:00, 1004.46it/s]


Epoch 9 		 Training Loss: 0.10655491331594527


100%|██████████| 801/801 [00:00<00:00, 999.57it/s] 

Epoch 10 		 Training Loss: 0.1045380730133099





### **Model2 - NN test on train**

In [135]:
# Test model on the train set
X_train_tensor = torch.from_numpy(X_train).type(torch.float)
X_train_tensor = X_train_tensor.to(device)
with torch.no_grad():
  predictions = ff.forward(X_train_tensor)
  predictions = predictions.to(device)
  predictions = np.where(predictions > 0.5, 1, 0).reshape(-1)

y_train_pred_nn = np.zeros(train_labels.size, dtype=int)
y_train_pred_nn[relevant_idx_train] = predictions
# F1 score calculation
f1 = f1_score(train_labels, y_train_pred_nn)
print("Train: the f1 score is: ", f1)

Train: the f1 score is:  0.8077427159995967


### **Model2 - NN test  on dev**


In [136]:
X_dev_tensor = torch.from_numpy(x_dev).type(torch.float)
X_dev_tensor = X_dev_tensor.to(device)
with torch.no_grad():
  predictions = ff.forward(X_dev_tensor)
  predictions = predictions.to(device)
  predictions = np.where(predictions > 0.5, 1, 0).reshape(-1)

y_dev_pred_nn = np.zeros(dev_tags.size, dtype=int)
y_dev_pred_nn[relevant_idx_dev] = predictions
# F1 score calculation
f1 = f1_score(dev_tags, y_dev_pred_nn)
print("Dev: the f1 score is: ", f1)

Dev: the f1 score is:  0.5467314964883847


In [137]:
con_mat = confusion_matrix(dev_tags, y_dev_pred_nn)
con_mat

array([[14389,    94],
       [  745,   506]])

## **Model 3 - LSTM**

### **Pickle functions**

In [138]:
def save_linear_model(path, model):
    pickle.dump(model, open(path, 'wb'))

### **Train preprocessing**

In [139]:
def is_begin_tweet(word):
  # checking if a word is the start of the tweet 
  word = str(word)
  return len(word) > 0 and word[0] == '@'
tweet_start_mask = np.vectorize(is_begin_tweet)
is_tweet_start = tweet_start_mask(train_corpus)
tweet_start_idx = np.where(is_tweet_start)[0]

In [140]:
def get_X_Y_with_features(processed_data_path, data_path):
  vecs = []
  for idx, word in enumerate(processed_data_path):
    original_word = data_path[idx]
    if idx in tweet_start_idx:
        vecs.append(tweet_start_token)
    elif original_word == '.':
        vecs.append(eos_token)
    elif original_word == ',':
        vecs.append(comma_token)
    elif original_word == '"':
        vecs.append(quotes_token)
    elif model.has_index_for(word):
        vecs.append(model.get_vector(word))
    elif model.has_index_for(word.lower()):
        vecs.append(model.get_vector(word.lower()))
    else:
        vecs.append(np.zeros(200))
  x = np.array(vecs)
  return x


In [141]:
# for the competetive part we will use a new train  data
tweet_start_token = np.random.randn(200)
eos_token = np.random.randn(200)
comma_token = np.random.randn(200)
quotes_token = np.random.randn(200)

X_train = get_X_Y_with_features(processed_train_corpus, train_corpus)
y_train = train_labels

# saving the tokens for future use
token_list = [tweet_start_token, eos_token, comma_token, quotes_token]
np.save('./tokens.npy', token_list)

**Creating a feature for words with capital letters - Excluding the ones in the beginning of a sentence**

In [142]:
def is_capitalized(previous_word, word):
  # Check if  words that appear in the middle of the sentence are captilized
  previous_word = str(previous_word)
  word = str(word)
  return previous_word != '.' and len(word) > 0 and (word[0] != word[0].lower())
capitalization_mask = np.vectorize(is_capitalized)
is_capitalized_vec = capitalization_mask(train_corpus[:-1], train_corpus[1:])
is_capitalized_vec = np.concatenate(([np.False_], is_capitalized_vec))

In [143]:
# Add the begin tweet & capitalization feature
capitalized_feature = is_capitalized_vec.astype(float).reshape(-1,1)

X_train_new = np.append(X_train, capitalized_feature, axis = 1)
X_train_new.shape

(65124, 201)

**In this model we would like to consider the type of tag, instead of the previous ones in which we had only 0 and 1.**

**We will sort them by their category**

In [144]:
# Visualize the distribution of all the positive tags in the train set
def group_positive_tags(tag):
  tag = str(tag)
  if tag != 'O':
    return tag[2:]
  return tag
fixed_tags = train_data_raw['tags'].apply(lambda x: group_positive_tags(x))

# Find all the descriptive labels of the new training data
descriptive_labels = fixed_tags.to_numpy()

# Convert the binary labels to integer labels according to the various catrgories
categories = np.unique(np.copy(descriptive_labels))
for idx, category in enumerate(categories):
    category_idx = np.where(descriptive_labels == category)[0]
    descriptive_labels[category_idx] = idx

y_train_new = descriptive_labels.astype(int)

**Creating sentences**

In [145]:
# Generate a list of sentences to feed the sequential model
sentences = []
current_sentence = []
sentences_labels = []
current_labels = []

for idx, word_embedding in enumerate(X_train_new):
    if all(word_embedding[:-1] == tweet_start_token):
        if len(current_sentence) > 0:
            current_sentence = np.array(current_sentence)
            current_sentence = torch.tensor(current_sentence)
            # New shape is: (1,L,E) == batch_first
            current_sentence = current_sentence.reshape((1, current_sentence.shape[0], current_sentence.shape[1]))
            sentences.append(current_sentence)
            current_labels = torch.tensor(np.array(current_labels))
            sentences_labels.append(current_labels)
        current_sentence = [word_embedding]
        current_labels = [y_train_new[idx]]
    else:
        current_sentence.append(word_embedding)
        current_labels.append(y_train_new[idx])

current_sentence = np.array(current_sentence)
current_sentence = torch.tensor(current_sentence)
# New shape is: (1,L,E) == batch_first
current_sentence = current_sentence.reshape((1, current_sentence.shape[0], current_sentence.shape[1]))
sentences.append(current_sentence)
current_labels = torch.tensor(np.array(current_labels))
sentences_labels.append(current_labels)

sentences_labels_binary = []
for sentence_labels in sentences_labels:
    sentences_labels_binary.append(torch.tensor(np.where(sentence_labels == 0, 0, 1)).double())

### **Define the model**

In [146]:
class LSTMModel(nn.Module):

    def __init__(self, input_size=201, hidden_size=100, num_of_categories=11, mlp_mid_size=50):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, num_layers=2, hidden_size=hidden_size//2, bidirectional=True, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, mlp_mid_size),
            nn.ReLU(),
            nn.Linear(mlp_mid_size, num_of_categories)
        )
        self.binary_fc = nn.Sequential(
            nn.Linear(num_of_categories, 1),
            nn.Sigmoid()
        )

    def forward(self, input_sequence):
        lstm_hidden, _ = self.lstm(input_sequence)
        out_1 = self.fc(lstm_hidden)
        out_2 = self.binary_fc(out_1)
        return out_1, out_2

### **Train the network**

In [147]:
# Define CPU or GPU device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
network = LSTMModel().to(device)
network = network.double()

# Loss function and Optimizer
loss_fn_1 = nn.CrossEntropyLoss()
loss_fn_2 = nn.BCELoss()
optimizer = optim.Adam(network.parameters())

# Train the network
epochs = 14
lamda = 2e-1

for epoch in tqdm.tqdm(range(epochs)):
  for sequence, full_targets, binary_targets in zip(sentences, sentences_labels, sentences_labels_binary):
    sequence = sequence.double()
    sequence = sequence.to(device)
    full_targets = full_targets.to(device)
    binary_targets = binary_targets.to(device)
    sequence_len = sequence.shape[1]

    optimizer.zero_grad()
    full_preds, binary_preds = network.forward(sequence)
    loss_1 = loss_fn_1(full_preds.view((sequence_len,-1)), full_targets.type(torch.long))
    loss_2 = loss_fn_2(binary_preds.view((sequence_len,-1)), binary_targets.view((sequence_len, -1)))
    loss = lamda * loss_1 + loss_2
    loss.backward()
    optimizer.step()

100%|██████████| 14/14 [11:30<00:00, 49.31s/it]


In [148]:
# Save the trained model to file
torch.save(network.state_dict(), './TrainedLSTM.pkl')

### **Dev preprocessing**

In [149]:
# Get tweet start indices in dev
is_tweet_start_dev = tweet_start_mask(dev_corpus)
tweet_start_idx_dev = np.where(is_tweet_start_dev)[0]

X_dev = get_X_Y_with_features(processed_dev_corpus, dev_corpus)
y_dev = dev_tags

# Get the indecies of capitlized words that appear in the middle of the sentence
is_capitalized_vec_dev = capitalization_mask(dev_corpus[:-1], dev_corpus[1:])
is_capitalized_vec_dev = np.concatenate(([np.False_], is_capitalized_vec_dev))

# Add the begin tweet & capitalization feature
capitalized_feature_dev = is_capitalized_vec_dev.astype(float).reshape(-1,1)
X_dev_new = np.append(X_dev, capitalized_feature_dev, axis=1)

# Find all the descriptive labels of the new training data
dev_descriptive_labels = dev['tags'].to_numpy()
group_positive_tags_vec = np.vectorize(group_positive_tags)
dev_descriptive_labels = group_positive_tags_vec(dev_descriptive_labels)

# Convert the binary labels to integer labels according to the various catrgories
for idx, category in enumerate(categories):
    dev_category_idx = np.where(dev_descriptive_labels == category)[0]
    dev_descriptive_labels[dev_category_idx] = idx
y_dev_new = dev_descriptive_labels.astype(int)

In [150]:
# Generate a list of sentences to feed the sequential model
dev_sentences = []
current_sentence = []
dev_sentences_labels = []
current_labels = []

for idx, word_embedding in enumerate(X_dev_new):
    if all(word_embedding[:-1] == tweet_start_token):
        if len(current_sentence) > 0:
            current_sentence = np.array(current_sentence)
            current_sentence = torch.tensor(current_sentence)
             # New shape is: (1,L,E) == batch_first
            current_sentence = current_sentence.reshape((1, current_sentence.shape[0], current_sentence.shape[1]))
            dev_sentences.append(current_sentence)
            current_labels = torch.tensor(np.array(current_labels))
            dev_sentences_labels.append(current_labels)
        current_sentence = [word_embedding]
        current_labels = [y_dev_new[idx]]
    else:
        current_sentence.append(word_embedding)
        current_labels.append(y_dev_new[idx])

current_sentence = np.array(current_sentence)
current_sentence = torch.tensor(current_sentence)
 # New shape is: (1,L,E) == batch_first
current_sentence = current_sentence.reshape((1, current_sentence.shape[0], current_sentence.shape[1]))
dev_sentences.append(current_sentence)
current_labels = torch.tensor(np.array(current_labels))
dev_sentences_labels.append(current_labels)

dev_sentences_labels_binary = []
for sentence_labels in dev_sentences_labels:
    dev_sentences_labels_binary.append(torch.tensor(np.where(sentence_labels == 0, 0, 1)).double())

### **Model 3- test on Dev**

In [151]:
# Get the predictions on test set and print the results
tot_preds = []
with torch.no_grad():
  for sequence in tqdm.tqdm(dev_sentences):
    sequence = sequence.double()
    sequence = sequence.to(device)
    sequence_len = sequence.shape[1]
    _, binary_preds = network.forward(sequence)
    predictions = list(np.where(binary_preds > 0.5, 1, 0).reshape(-1))
    tot_preds += predictions
comp_dev_all_preds = tot_preds
f1 = f1_score(dev_tags, comp_dev_all_preds)
print("Dev: the f1 score is: ", f1)

100%|██████████| 414/414 [00:01<00:00, 250.47it/s]

Dev: the f1 score is:  0.6164102564102564





In [152]:
con_mat = confusion_matrix(dev_tags, comp_dev_all_preds)
con_mat

array([[14385,    98],
       [  650,   601]])

### **Test preprocessing**

In [153]:
# Get tweet start indices
is_tweet_start_test = tweet_start_mask(test_corpus)
tweet_start_idx_test = np.where(is_tweet_start_test)[0]
X_test = get_X_Y_with_features(processed_test_corpus, test_corpus)

# Get the indecies of capitlized words that appear in the middle of the sentence
is_capitalized_vec_test = capitalization_mask(test_corpus[:-1], test_corpus[1:])
is_capitalized_vec_test = np.concatenate(([np.False_], is_capitalized_vec_test))

# Add the begin tweet & capitalization feature
capitalized_feature_test = is_capitalized_vec_test.astype(float).reshape(-1,1)
X_test_new = np.append(X_test, capitalized_feature_test, axis=1)

In [154]:
# Generate a list of sentences to feed the sequential model
test_sentences = []
current_sentence = []
for idx, word_embedding in enumerate(X_test_new):
    if all(word_embedding[:-1] == tweet_start_token):
        if len(current_sentence) > 0:
            current_sentence = np.array(current_sentence)
            current_sentence = torch.tensor(current_sentence)
            current_sentence = current_sentence.reshape((1, current_sentence.shape[0], current_sentence.shape[1])) # New shape is: (1,L,E) == batch_first
            test_sentences.append(current_sentence)
        current_sentence = [word_embedding]
    else:
        current_sentence.append(word_embedding)

current_sentence = np.array(current_sentence)
current_sentence = torch.tensor(current_sentence)
current_sentence = current_sentence.reshape((1, current_sentence.shape[0], current_sentence.shape[1])) # New shape is: (1,L,E) == batch_first
test_sentences.append(current_sentence)

### **Model3 - prediction on test set**

In [155]:
# Get the predictions on test set and print the results
tot_preds = []
with torch.no_grad():
  for sequence in tqdm.tqdm(test_sentences):
    sequence = sequence.double()
    sequence = sequence.to(device)
    sequence_len = sequence.shape[1]

    _, binary_preds = network.forward(sequence)
    predictions = list(np.where(binary_preds > 0.5, 1, 0).reshape(-1))
    tot_preds += predictions

comp_test_all_preds = tot_preds

100%|██████████| 613/613 [00:02<00:00, 249.86it/s]


In [156]:
write_results_to_file(TEST_PATH, './comp_213496110_214169377.tagged', test_corpus, comp_test_all_preds)