In [1]:
pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [2]:
#import libraries

import csv
import jsonlines
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD, Adam
from typing import *
import string
from torch.utils.data import Dataset, DataLoader

In [3]:
# def of paths

pth_train = "/content/drive/MyDrive/data/train.jsonl"
pth_dev = "/content/drive/MyDrive/data/dev.jsonl"
pth_test = "/content/drive/MyDrive/data/test.jsonl"
pathGlove = "/content/drive/MyDrive/Copia di glove.6B.300d.txt"
path_out_dev = "/content/predictions_dev.tsv"
path_out_test = "/content/predictions_test.tsv"

In [4]:
#organization of dataset

def organize_data(path: str, labels_on=True):

  data = jsonlines.open(path)
  texts = []
  new_texts = []
  labels = []
  ids = []

  if(labels_on):

    for line in data:

      #read texts of dataset
      text = line['text']   
      #remove punctuation from words
      text = ''.join( c for c in text if c not in string.punctuation) 
      #save texts, labels and ids
      texts.append(line['text'])
      labels.append(line['label'])
      ids.append(line['id'])

    #create a list of words from texts
    for t in texts:
      new_texts.append(t.split(" "))

    #checks
    assert len(new_texts) == len(labels) == len(ids) 
  
    return new_texts, labels, ids

  else:

    for line in data:

      #read texts of dataset
      text = line['text']
      #remove punctuation from words
      text = ''.join( c for c in text if c not in string.punctuation) 
      #save texts and ids
      texts.append(line['text'])
      ids.append(line['id'])

    #create a list of words from texts
    for t in texts:
      new_texts.append(t.split(" ")) 

    #checks
    assert len(new_texts) == len(ids) 

    return new_texts, ids

In [5]:
#transformation of words in tensors using previos vocabulary

def VecFromWord(voc: dict, text: list):

  new_vecs = []

  for t in text:

    #associate each word in every text to a tensor, using the vocabulary(pre-trained word-embedding)
    vecs = transform(voc, t)

    # form: [[tensor1_1,....,tensor1_N],...,[tensorM_1,.....,tensorM_N]]
    new_vecs.append(vecs)

  print("Found "+str(len(new_vecs))+" samples.")

  return new_vecs

#function for effective transormation
def transform(v: dict, l: list):

  vec = []

  for word in l:
    word = word.lower()
    if word in v.keys():
      vec.append(v[word].squeeze(0))
    #Oss: ingoring missing words from vocabulary

  return vec

#function to do the mean between the words of a text
def aggregation(data: list):

  new_data = []

  for d in data:

    sum_vec = []

    summ = 0
    for el in d:
      summ+=el
    summ = summ/len(d)

    new_data.append(summ)

  #print("Found "+str(len(new_data))+" samples.")

  return torch.stack(new_data)

In [6]:
#Find the class belonging to data and encode in one-hot encoding

def define_classes(labels: list):

  #find number of different classes
  classes = []
  for i in np.unique(labels):
    classes.append(i)
  num_class = len(classes)

  #associate an integer to each class
  classes_dict = {}
  counter = 0
  for elem in classes:
    if elem not in classes_dict.keys():
        classes_dict[elem]=counter
        counter+=1

  print("Found "+str(num_class)+" classes")

  #create a dictionary where each int correspond to a classe encoded with one-hot encoding
  ref = list(classes_dict.values())
  app = torch.tensor(ref)
  new_labels = list(F.one_hot(app, num_classes=num_class))
  one_hot_dict = {}
  for i,j in zip(ref,new_labels):
    one_hot_dict.update({i:j})

  #create a set inf form: [[tenso1_1,...tensor1_N],...,[tensorM_1,...tensorM_N]]
  #where each list of tensors is a class represented in one-hot encoding
  y = []
  for i in labels:
    y.append(one_hot_dict.get(classes_dict.get(i)))
  y = torch.stack(y).squeeze()

  return y, classes_dict

In [7]:
#create a dataset linking each text to its class, useful to pass data at the classification model
#Oss: this function has a double scope, it can link text to its ids to create the dataset for test (where there are not labels)

def create_dataset(x,y,batch_size):
  dataset = []
  dataset = [(data,label) for data,label in zip(x,y)]
  return DataLoader(dataset, batch_size)

In [8]:
#create a vocabulary that link each word to a set of tensor, using a pre-trained word embedding

def vocabulary(pth):

  #read pre-trained glove model
  with open(pth) as f:
    glove = f.readlines()
  glove = [i.split(maxsplit = 1) for i in glove]

  #create a vocabulary from previous pre-trained model
  vocabulary = {}
  tensor = torch.tensor
  for w, vec in glove:
    vec = vec[:-2] #cut '\n'
    vec2 = vec.split()
    #i = 0
    #vec3 = [len(vec2)]
    vec3 = [np.float64(vec2[i]) for i in range(len(vec2))]
    tensor = torch.tensor(vec3)
    vocabulary.update({w:tensor})

  return vocabulary

In [9]:
#fuction to invert keys and values of a dictionary
def invert_dictionary(classes: dict):

  items_list = classes.items()
  new_dict = {}
  for i in list(items_list):
    new_dict.update({i[1]:i[0]})

  return new_dict

In [10]:
#classification model

class SentenceClassifier(torch.nn.Module):

    def __init__(self, n_features: int, n_hidden: int, n_classes: int, loss):
        super().__init__()
        self.layerIn = torch.nn.Linear(n_features, n_hidden)
        self.hidden1 = torch.nn.Linear(n_hidden, 100)
        self.hidden2 = torch.nn.Linear(100,100)
        self.hidden3 = torch.nn.Linear(100,80)
        self.hidden4 = torch.nn.Linear(80,70)
        self.hidden5 = torch.nn.Linear(70,50)
        self.hidden6 = torch.nn.Linear(50,30)
        self.hidden7 = torch.nn.Linear(30,20)
        #self.hidden8 = torch.nn.Linear(50,50)
        #self.hidden9 = torch.nn.Linear(50,50)
        #self.hidden10 = torch.nn.Linear(50,30)
        self.layerOut = torch.nn.Linear(20,n_classes)
        self.loss_fn = loss

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        
        out = self.layerIn(x)
        out = torch.relu(out)
        out = self.hidden1(out)
        out = torch.relu(out)
        out = self.hidden2(out)
        out = torch.relu(out)
        out = self.hidden3(out)
        out = torch.relu(out)
        out = self.hidden4(out)
        out = torch.relu(out)
        out = self.hidden5(out)
        out = torch.relu(out)
        out = self.hidden6(out)
        out = torch.relu(out)
        out = self.hidden7(out)
        out = torch.relu(out)
        '''out = self.hidden8(out)
        out = torch.relu(out)
        out = self.hidden9(out)
        out = torch.relu(out)
        out = self.hidden10(out)
        out = torch.relu(out)'''
        out = self.layerOut(out)
        out = nn.Softmax(dim=1)(out)

        result = {'pred': out}

        if y is not None:
            loss = self.loss_fn(out, y)
            result['loss'] = loss

        return result

In [11]:
def train(model: nn.Module, optimizer: torch.optim.Optimizer, data: DataLoader, epochs: int = 20):

    for epoch in range(epochs):

        print("\nepoch: "+str(epoch+1)+"/"+str(epochs))
        tot_loss = []

        #batches of the training set
        for x, y in data:

            optimizer.zero_grad()
            data_out = model(x.type(torch.float), y.type(torch.float))
            loss = data_out['loss']

            tot_loss.append(loss.item())

            loss.backward()
            optimizer.step()

        print("loss: "+str(sum(tot_loss)/len(tot_loss))+"\n")

In [12]:
def test(model: nn.Module, dataloader: DataLoader):

  num_true = 0
  dim_set = 0

  for x, y in dataloader:
        
    with torch.no_grad():
        data_out = model(x.type(torch.float))
        pred = data_out['pred']

    for i,j in zip(pred,y):
        p = torch.argmax(i)
        truth = torch.argmax(j)
        dim_set += 1
        num_true += (truth == p).int()

  acc = ((num_true / dim_set).item())*100
  err = (1 - (num_true / dim_set).item())*100

  print(f'# accuracy: {acc:.2f}')
  print(f'# error-rate: {err:.2f}')

In [13]:
def predictions(model: nn.Module, dataloader: DataLoader, classes: dict, pth_out: str):

  test_pred = []
  
  for x, y in dataloader:
            
    with torch.no_grad():
        data_out = model(x.type(torch.float))
        pred = data_out['pred']

    for i,j in zip(pred,y):
        p = torch.argmax(i).item()
        c = str(j.item())+'\t'+classes.get(p)
        test_pred.append(c)

  #open a .tsv file and save data on it
  with open(pth_out, 'w') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\n',quotechar =' ')
    tsv_output.writerow(test_pred)

  print("File saved.")

  return test_pred

In [22]:
#HYPERPARAMETERS

EPOCHS = 20
opt = Adam
lr=0.001
batches = 200
loss = torch.nn.MSELoss()

In [15]:
#read the pre-trained word embedding
vocab = vocabulary(pathGlove)

In [16]:
#read training data

texts_t, labels_t, ids_t = organize_data(pth_train)
list_of_tensors_t = VecFromWord(vocab, texts_t)
x_train = aggregation(list_of_tensors_t)
y_train, _ = define_classes(labels_t)
train_data = create_dataset(x_train,y_train,batches)

Found 186282 samples.
Found 15 classes


In [17]:
#read dev data

texts_d, labels_d, ids_d = organize_data(pth_dev)
list_of_tensors_d = VecFromWord(vocab, texts_d)
x_dev = aggregation(list_of_tensors_d)
y_dev, classes_dict = define_classes(labels_d)
dev_data = create_dataset(x_dev,y_dev,batches) #set for test the model
dev_data_for_test = create_dataset(x_dev, ids_d, batches) #set for generate tsv file

Found 6844 samples.
Found 15 classes


In [18]:
#read dev data

texts_test, ids_test = organize_data(pth_test, labels_on=False)
list_of_tensors_test = VecFromWord(vocab, texts_test)
x_test = aggregation(list_of_tensors_test)
test_data = create_dataset(x_test, ids_test,batches)

Found 6849 samples.


In [23]:
#define model and opitmizer
n_feature = x_train.shape[1]
n_class = y_train.shape[1]
model = SentenceClassifier(n_feature, n_feature, n_class, loss)
optimizer = opt(model.parameters(), lr=lr)

In [24]:
#train the model
train(model,optimizer,train_data,EPOCHS)


epoch: 1/20
loss: 0.03458220688746083


epoch: 2/20
loss: 0.0249467492313424


epoch: 3/20
loss: 0.022667509973320968


epoch: 4/20
loss: 0.02166410177734245


epoch: 5/20
loss: 0.020976291498760874


epoch: 6/20
loss: 0.020435712552057828


epoch: 7/20
loss: 0.01999594488924633


epoch: 8/20
loss: 0.019581406411231204


epoch: 9/20
loss: 0.019175931418618355


epoch: 10/20
loss: 0.018796366590470907


epoch: 11/20
loss: 0.01840076168055762


epoch: 12/20
loss: 0.018141225538947998


epoch: 13/20
loss: 0.017877186053980126


epoch: 14/20
loss: 0.017571812613503487


epoch: 15/20
loss: 0.017394943033892645


epoch: 16/20
loss: 0.017183728108398393


epoch: 17/20
loss: 0.01700854342822545


epoch: 18/20
loss: 0.01679049124301695


epoch: 19/20
loss: 0.0166245567894145


epoch: 20/20
loss: 0.016461902766692346



In [None]:
#test the model
test(model, dev_data)

# accuracy: 75.28
# error-rate: 24.72


In [None]:
#predict and save predictions into a .tsv file
new_class_dict = invert_dictionary(classes_dict)
pred_dev = predictions(model, dev_data_for_test, new_class_dict, path_out_dev)
pred_test = predictions(model, test_data, new_class_dict, path_out_test)

File saved.
File saved.
