# Imports

In [112]:
!pip install emoji
!pip install nltk
!pip install tqdm
!pip install simplejson



In [113]:
import sys
import shutil
import json
import urllib
import tarfile
from pathlib import Path
import simplejson as sj

import pandas as pd
import numpy as np

from typing import Dict, OrderedDict, List

import re
import emoji


import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from tqdm import tqdm


import gensim
import gensim.downloader as gloader



from typing import Iterable
from tqdm import tqdm

# Task 1: Corpus

## 1. Downloading the dataset

First of all we need to **download** the `A1/data` folder.

In [114]:
class DownloadProgressBar(tqdm):
    """tqdm progress bar whose ``update_to`` can be passed as a ``urlretrieve`` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        Move the bar to the absolute position ``b * bsize``.

        :param b: number of blocks transferred so far
        :param bsize: size of one block
        :param tsize: total size; when not None it becomes the bar's total
        """
        if tsize is not None:
            self.total = tsize
        # tqdm.update expects an increment, so subtract what is already shown
        transferred = b * bsize
        self.update(transferred - self.n)

def download_url(download_path: Path, url: str):
    """
    Download ``url`` to ``download_path`` while showing a progress bar.

    :param download_path: destination file path
    :param url: address of the resource; its last path segment labels the bar
    """
    # A bare `import urllib` does not load the `request` submodule; without this
    # the call below only works if some other library imported it first.
    import urllib.request

    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=download_path, reporthook=t.update_to)

In [115]:
def download_dataset(download_path: Path, url: str):
    """
    Download one dataset file, printing a message before and after.

    :param download_path: destination file path
    :param url: address of the file to fetch
    """
    print("Downloading dataset...")
    download_url(download_path, url)
    print("Download complete!")

In [116]:
# Split name -> URL of the corresponding JSON file.
# All three files live in the same remote folder and differ only by file name.
urls = {
    split: (
        "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/"
        "main/2024-2025/Assignment%201/data/"
        f"{split}.json"
    )
    for split in ("training", "test", "validation")
}

In [117]:
# Downloaded files are stored in a "Datasets" folder under the working directory
print(f"Current work directory: {Path.cwd()}")
dataset_folder = Path.cwd() / "Datasets"

Current work directory: /content


In [118]:
# Create the folder (and any missing parents); exist_ok makes the cell safe to
# re-run and avoids the gap between a separate exists() check and mkdir().
dataset_folder.mkdir(parents=True, exist_ok=True)

In [119]:
# Fetch every split into <dataset_folder>/<split name>.json
for name, url in urls.items():
    download_path = dataset_folder / f"{name}.json"
    download_dataset(download_path=download_path, url=url)

Downloading dataset...


training.json: 6.23MB [00:00, 61.5MB/s]


Download complete!
Downloading dataset...


test.json: 500kB [00:00, 7.71MB/s]


Download complete!
Downloading dataset...


validation.json: 1.16MB [00:00, 5.82MB/s]                            

Download complete!





## 2. Load the three JSON files and encode them as pandas dataframes.

In [120]:
def load_json_file(file_path: Path):
    """
    Read a UTF-8 encoded JSON file and return the decoded Python object.

    :param file_path: location of the JSON file
    :return: the parsed JSON content
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [121]:
# One record per JSON entry, collected across all splits
dataframe_rows = []

for name in urls:
    # Each split was downloaded to <dataset_folder>/<split name>.json
    file_path = dataset_folder / f"{name}.json"
    json_data = load_json_file(file_path)

    # Tag every record with the split it comes from, then collect it
    for df_row in json_data.values():
        df_row["split"] = name
        dataframe_rows.append(df_row)



In [122]:
# Folder that holds the pickled dataframes; exist_ok keeps the cell re-runnable
folder = Path.cwd().joinpath("Datasets", "Dataframes")
folder.mkdir(parents=True, exist_ok=True)


# transform the list of rows in a proper dataframe
df = pd.DataFrame(dataframe_rows)

# Persist one pickle per split INSIDE `folder`.
# The previous `folder.with_name(...)` replaced the last path component, so the
# files landed next to "Dataframes" instead of in it, and every file contained
# the full, unsplit dataframe.
for name in urls:
    df_path = folder.joinpath(f"{name}.pkl")
    df.loc[df["split"] == name].to_pickle(df_path)

## 3. Generate hard labels

Generate hard labels for Task 1 using majority voting and store them in a new dataframe column called `hard_label_task1`. Items without a clear majority will be removed from the dataset.

In [123]:
def compute_majority_voting(labels: list):
    """
    Return the most frequent label of ``labels``.

    :param labels: the individual votes
    :return: a one-element list holding the winning label, or None when
             several labels share the highest count (tie)
    """
    values, freqs = np.unique(labels, return_counts=True)
    # keep every label that reaches the top count
    winners = values[freqs == freqs.max()].tolist()
    return winners if len(winners) == 1 else None

In [124]:
def generate_hard_labels(df):
    """
    Add a ``hard_label_task1`` column obtained by majority voting over
    ``labels_task1`` and drop the rows without a clear majority.

    Note: the new column is written into the frame passed in; the returned
    frame is the filtered selection of it.

    :param df: dataframe with a ``labels_task1`` column of label lists
    :return: the rows of ``df`` whose ``hard_label_task1`` is not null
    """
    def _vote(labels):
        # Only a non-empty list can be voted on. Anything else gets None so the
        # result stays aligned with the rows of df (previously such rows were
        # skipped, which made the column assignment fail on a length mismatch).
        if isinstance(labels, list) and len(labels) > 0:
            return compute_majority_voting(labels)
        return None

    # One entry per row, in row order
    df['hard_label_task1'] = [_vote(labels) for labels in df['labels_task1']]

    # Ties and invalid label cells are None, so they are removed here
    return df[df['hard_label_task1'].notnull()]

In [125]:
# Add the majority-vote column and keep only the rows with a clear majority;
# `df` is rebound to the filtered result.
df = generate_hard_labels(df)
print(df.head())

  id_EXIST lang                                              tweet  \
0   100001   es  @TheChiflis Ignora al otro, es un capullo.El p...   
1   100002   es  @ultimonomada_ Si comicsgate se parece en algo...   
2   100003   es  @Steven2897 Lee sobre Gamergate, y como eso ha...   
4   100005   es  @novadragon21 @icep4ck @TvDannyZ Entonces como...   
5   100006   es  @yonkykong Aaah sí. Andrew Dobson. El que se d...   

   number_annotators                                         annotators  \
0                  6  [Annotator_1, Annotator_2, Annotator_3, Annota...   
1                  6  [Annotator_7, Annotator_8, Annotator_9, Annota...   
2                  6  [Annotator_7, Annotator_8, Annotator_9, Annota...   
4                  6  [Annotator_19, Annotator_20, Annotator_21, Ann...   
5                  6  [Annotator_25, Annotator_26, Annotator_27, Ann...   

    gender_annotators                          age_annotators  \
0  [F, F, F, M, M, M]  [18-22, 23-45, 46+, 46+, 23-45, 18-22]  

## 4. Filter the DataFrame

Filter the DataFrame to keep only rows where the `lang` column is `'en'`.

In [126]:
# Keep only the rows whose language code is 'en'; `df` is rebound to the selection
df = df[df['lang'] == 'en']
print(df.shape)

(3314, 12)


## 5. Remove unwanted columns

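Keep only `id_EXIST`, `lang`, `tweet`, and `hard_label_task1`, plus the `split` column we added while loading, which is needed later to separate the training, test and validation data.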

In [127]:
def remove_unwanted_columns(df):
    """
    Return ``df`` restricted to the columns used in the rest of the notebook.

    :param df: dataframe containing at least the five columns listed below
    :return: dataframe with only ``id_EXIST``, ``lang``, ``tweet``,
             ``hard_label_task1`` and ``split``, in that order
    """
    wanted = ('id_EXIST', 'lang', 'tweet', 'hard_label_task1', 'split')
    return df[list(wanted)]

In [128]:
# Drop the annotator metadata; `df` is rebound to the reduced frame
df = remove_unwanted_columns(df)

## 6. Encode the hard_label_task1 column

Use 1 to represent "YES" and 0 to represent "NO" in the `hard_label_task1` column.

In [129]:
# Each cell holds a one-element list with the winning label: 'YES' -> 1, anything else -> 0.
# Not re-runnable: after this cell the column contains integers, not lists.
df['hard_label_task1'] = df['hard_label_task1'].map(lambda votes: int(votes[0] == 'YES'))
print(df.head())

     id_EXIST lang                                              tweet  \
3661   200002   en  Writing a uni essay in my local pub with a cof...   
3662   200003   en  @UniversalORL it is 2021 not 1921. I dont appr...   
3665   200006   en  According to a customer I have plenty of time ...   
3666   200007   en  So only 'blokes' drink beer? Sorry, but if you...   
3667   200008   en  New to the shelves this week - looking forward...   

      hard_label_task1     split  
3661                 1  training  
3662                 1  training  
3665                 1  training  
3666                 1  training  
3667                 0  training  


# Task 2: Data Cleaning

In [130]:
# Fetch the NLTK resources used by the cleaning step below
# (WordNet data for the lemmatizer, the POS tagger models and the tokenizer models).
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Check this link: [all existing emojis](https://www.unicode.org/Public/emoji/1.0//emoji-data.txt). And also this: [emojis unicode consortium](https://unicode.org/emoji/charts/full-emoji-list.html).

In [131]:
def lower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

In [132]:
def remove_emoji(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, '')
    return without_emoji

In [133]:
def remove_hastag(text):
    """Remove every hashtag: a '#' together with the non-whitespace run that follows it."""
    return re.sub(r'#\S+', r'', text)

In [134]:
def remove_mention(text):
    """Remove every mention: an '@' together with the non-whitespace run that follows it."""
    return re.sub(r'@\S+', r'', text)

In [135]:
def remove_URL(text):
    """Remove links that start with http://, https:// or www. (up to the next whitespace)."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [136]:
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace."""
    not_allowed = re.compile(r'[^a-zA-Z0-9\s]')
    return not_allowed.sub('', text)

In [137]:
def remove_quotes(text):
    """
    Remove a leading and a trailing straight double quote, plus every curly
    single quote (U+2018 / U+2019) anywhere in ``text``.

    :param text: input string
    :return: ``text`` without those quote characters
    """
    # The old pattern r'^"|"$‘’' put the curly quotes AFTER the end-of-string
    # anchor, so its second alternative could never match: trailing quotes and
    # curly quotes were silently kept. Three separate alternatives fix that.
    pattern = r'^"|"$|[\u2018\u2019]'
    return re.sub(pattern, '', text)

In [138]:
def remove_extra_spaces(text):
    """
    Collapse every run of whitespace into a single space and trim both ends.

    :param text: input string
    :return: normalised string without leading/trailing whitespace
    """
    # str.strip() returns a new string; the old code discarded that result, so
    # leading/trailing blanks survived as a single space. Strip the final value.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [139]:
def lemmatize(text):
    """
    Tokenize ``text``, POS-tag the tokens and lemmatize each one with WordNet.

    :param text: input string
    :return: the lemmas joined by single spaces
    """
    def to_wordnet_pos(treebank_tag):
        # Map the tag's first letter to a WordNet POS constant; noun ('n') is the fallback
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('N'):
            return wordnet.NOUN
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        return 'n'

    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for word, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(word, to_wordnet_pos(tag)))
    return " ".join(lemmas)

In [140]:
def clean_tweet(tweet: str):
    """
    Run the full cleaning pipeline on one tweet.

    The steps are applied in the order listed below; each receives the output
    of the previous one.

    :param tweet: raw tweet text
    :return: cleaned and lemmatized text
    """
    steps = (
        lower,
        remove_emoji,
        remove_hastag,
        remove_mention,
        remove_URL,
        remove_special_characters,
        remove_quotes,
        remove_extra_spaces,
        lemmatize,
    )
    for step in steps:
        tweet = step(tweet)
    return tweet

In [141]:
# Replace every raw tweet with its cleaned version (overwrites the `tweet` column)
df['tweet'] = df['tweet'].apply(clean_tweet)

In [148]:
# Quick look at the first cleaned tweets
print(df['tweet'].head())

3661    write a uni essay in my local pub with a coffe...
3662    it be 2021 not 1921 i dont appreciate that on ...
3665    accord to a customer i have plenty of time to ...
3666    so only blokes drink beer sorry but if you are...
3667    new to the shelf this week look forward to rea...
Name: tweet, dtype: object


# Task 3: Text Encoding

## Splitting the dataset

Here we split the dataset into training, test and validation sets.

In [149]:
# One frame per split; the helper `split` column is dropped once it has been used
df_train = df[df['split'] == "training"].drop(columns='split')
df_test = df[df['split'] == "test"].drop(columns='split')
df_val = df[df['split'] == "validation"].drop(columns='split')

print(f"Train size: {df_train.shape}")
print(f"Test size: {df_test.shape}")
print(f"Validation size: {df_val.shape}")

Train size: (2870, 4)
Test size: (286, 4)
Validation size: (158, 4)


### Try to use Tokenizer to build the vocabulary

## Vocabulary Creation

We create the vocabulary only using the training dataset.

In [150]:
def build_vocabulary(df: pd.DataFrame) -> (Dict[int, str], Dict[str, int], List[str]):
    """
    Build the word vocabulary of a dataset from its ``tweet`` column.

    Tweets are split on whitespace. Index 0 is reserved for the special
    "[UNK]" token; every other word gets the next free index the first time
    it is seen.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - word vocabulary: vocabulary index to word
      - inverse word vocabulary: word to vocabulary index
      - word listing: list of the unique terms, ordered by index
    """
    word_to_idx = OrderedDict()
    # The special token always takes the first slot
    word_to_idx["[UNK]"] = 0

    for sentence in tqdm(df.tweet.values):
        for token in sentence.split():
            if token not in word_to_idx:
                # indices are contiguous, so the next free one equals the current size
                word_to_idx[token] = len(word_to_idx)

    # Invert the mapping; iteration order follows insertion order, i.e. the index
    idx_to_word = OrderedDict((idx, word) for word, idx in word_to_idx.items())
    word_listing = list(idx_to_word.values())
    return idx_to_word, word_to_idx, word_listing

In [151]:
# The vocabulary is built from the training split only
idx_to_word_train, word_to_idx_train, word_listing_train = build_vocabulary(df_train)

print(f'[Debug] Index -> Word vocabulary size: {len(idx_to_word_train)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word_to_idx_train)}')
# Shows indices 1..10; index 0 is the "[UNK]" token
print(f'[Debug] Some words: {[(idx_to_word_train[idx], idx) for idx in np.arange(10) + 1]}')

100%|██████████| 2870/2870 [00:00<00:00, 95169.84it/s]

[Debug] Index -> Word vocabulary size: 9819
[Debug] Word -> Index vocabulary size: 9819
[Debug] Some words: [('write', 1), ('a', 2), ('uni', 3), ('essay', 4), ('in', 5), ('my', 6), ('local', 7), ('pub', 8), ('with', 9), ('coffee', 10)]





### Vocabulary Evaluation

In [152]:
def evaluate_vocabulary(idx_to_word: Dict[int, str], word_to_idx: Dict[str, int],
                        word_listing: List[str], df: pd.DataFrame, check_default_size: bool = False):
    """
    Sanity-check a vocabulary with assertions (raises AssertionError on failure).

    :param idx_to_word: vocabulary index -> word
    :param word_to_idx: word -> vocabulary index
    :param word_listing: list of vocabulary terms
    :param df: dataset used to rebuild the vocabulary for the consistency check
    :param check_default_size: currently unused
    """
    print("[Vocabulary Evaluation] Size checking...")
    vocab_size = len(idx_to_word)
    assert vocab_size == len(word_to_idx)
    assert vocab_size == len(word_listing)

    print("[Vocabulary Evaluation] Content checking...")
    # The two mappings must be inverses of each other on indices 0..size-1
    for position in tqdm(range(vocab_size)):
        word = idx_to_word[position]
        assert word in word_to_idx
        assert word_to_idx[word] == position

    print("[Vocabulary Evaluation] Consistency checking...")
    # Building twice from the same data must give the same listing
    first_word_listing = build_vocabulary(df)[2]
    second_word_listing = build_vocabulary(df)[2]
    assert first_word_listing == second_word_listing

    print("[Vocabulary Evaluation] Toy example checking...")
    toy_df = pd.DataFrame({
        'tweet': ["all that glitters is not gold", "all in all i like this assignment"]
    })
    toy_word_listing = build_vocabulary(toy_df)[2]
    # The expected vocabulary also includes the special [UNK] token
    toy_valid_vocabulary = {"[UNK]"} | set(' '.join(toy_df.tweet.values).split())
    assert set(toy_word_listing) == toy_valid_vocabulary

In [153]:
# Run the assertion-based checks on the training vocabulary
print("Vocabulary evaluation...")
evaluate_vocabulary(idx_to_word_train, word_to_idx_train, word_listing_train, df_train)
print("Evaluation completed!")

Vocabulary evaluation...
[Vocabulary Evaluation] Size checking...
[Vocabulary Evaluation] Content checking...


100%|██████████| 9819/9819 [00:00<00:00, 1678165.97it/s]


[Vocabulary Evaluation] Consistency checking...


100%|██████████| 2870/2870 [00:00<00:00, 173039.31it/s]
100%|██████████| 2870/2870 [00:00<00:00, 153312.69it/s]


[Vocabulary Evaluation] Toy example checking...


100%|██████████| 2/2 [00:00<00:00, 3509.88it/s]

Evaluation completed!





## Saving Vocabulary

In [154]:
# The word -> index mapping is stored as JSON next to the downloaded datasets
vocab_path = Path.cwd() / 'Datasets' / 'vocab.json'

print(f"Saving vocabulary to {vocab_path}")
with open(vocab_path, 'w') as f:
    sj.dump(word_to_idx_train, f, indent=4)
print("Saving completed!")

Saving vocabulary to /content/Datasets/vocab.json
Saving completed!


## GloVe embedding

In [155]:
def load_embedding_model(model_type: str,
                         embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    Only GloVe is supported: the model name passed to the gensim downloader is
    "glove-wiki-gigaword-<embedding_dimension>".

    :param model_type: name of the word embedding model to load
                       (case-insensitive, surrounding whitespace ignored).
    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)

    :raises AttributeError: if ``model_type`` is not "glove"
    :raises ValueError: re-raised from the downloader when the model name is invalid
    """
    if model_type.strip().lower() != 'glove':
        # The exception type is kept for callers that already catch it; the
        # message now lists only what this function really supports.
        raise AttributeError("Unsupported embedding model type! Available ones: glove")

    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)

    try:
        emb_model = gloader.load(download_path)
    except ValueError:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        # bare raise keeps the original traceback
        raise

    return emb_model

In [156]:
# Load the 50-dimensional GloVe vectors
embedding_model = load_embedding_model(model_type="glove",
                                       embedding_dimension=50)



In [157]:
def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                    word_listing: List[str]):
    """
    Find the out-of-vocabulary (OOV) terms: words of the dataset vocabulary
    that are not keys of the pre-trained embedding model.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms (no duplicates, no guaranteed order)
    """
    known_words = set(embedding_model.key_to_index)
    dataset_words = set(word_listing)
    return list(dataset_words - known_words)

In [158]:
# Share of the training vocabulary that has no pre-trained vector
oov_terms_train = check_OOV_terms(embedding_model, word_listing_train)
oov_percentage_train = 100 * len(oov_terms_train) / len(word_listing_train)
print(f"Total OOV terms in training set: {len(oov_terms_train)} ({oov_percentage_train:.2f}%)")

Total OOV terms in training set: 1841 (18.75%)


**TODO:** try TextBlob's `correct()` to fix spelling errors in the tweets; this could reduce the number of OOV terms.

### Handling train dataset OOV terms

Training tokens that are missing from the GloVe vocabulary (the OOV terms) stay in our vocabulary: each of them is assigned a small random embedding vector.

In [159]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int],
                           vocab_size: int,
                           oov_terms: List[str],
                           rng=None) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    Words found in ``embedding_model`` get their pre-trained vector. Every other
    word (the "[UNK]" token included) gets a random vector drawn uniformly from
    [-0.05, 0.05).

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: length of each embedding vector
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param vocab_size: size of the vocabulary (number of rows of the matrix)
    :param oov_terms: list of OOV terms (list); currently unused, since missing words
                      are detected by the failed lookup itself
    :param rng: optional ``numpy.random.Generator`` used for the random vectors so
                results can be reproduced; when None the global ``np.random``
                state is used, as before

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    # Both np.random and a Generator expose uniform(low, high, size)
    sampler = np.random if rng is None else rng

    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            # No pre-trained vector: "[UNK]" and OOV words are treated alike
            # (the former if/else had two identical branches).
            embedding_vector = sampler.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [160]:
# Testing
# Build the matrix for the training vocabulary; the dimension must match the loaded GloVe model
embedding_dimension = 50
embedding_matrix = build_embedding_matrix(
    embedding_model=embedding_model,
    embedding_dimension=embedding_dimension,
    word_to_idx=word_to_idx_train,
    vocab_size=len(word_to_idx_train),
    oov_terms=oov_terms_train,
)
print(f"\nEmbedding matrix shape: {embedding_matrix.shape}")

100%|██████████| 9819/9819 [00:00<00:00, 284475.97it/s]

Embedding matrix shape: (9819, 50)



