# Imports

In [7]:
!pip install emoji
!pip install nltk
!pip install tqdm

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m337.9/586.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [8]:
import sys
import shutil
import json
import urllib
import tarfile
from pathlib import Path

import pandas as pd
import numpy as np

import re
import emoji


import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from tqdm import tqdm



from typing import Iterable
from tqdm import tqdm

# Task 1: Corpus

## 1. Downloading the dataset

First of all we need to **download** the `A1/data` folder.

In [9]:
class DownloadProgressBar(tqdm):
    """tqdm progress bar exposing ``update_to``, used as a download ``reporthook``."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar forward so that its position equals ``b * bsize``.

        Args:
            b: multiplier of ``bsize`` (presumably the number of blocks
                transferred so far - confirm against the reporthook contract).
            bsize: size of one block, in the bar's unit.
            tsize: total size; when not None it overwrites ``self.total``.
        """
        if tsize is not None:
            self.total = tsize
        # tqdm.update expects an increment, so subtract what is already shown
        transferred = b * bsize
        self.update(transferred - self.n)

def download_url(download_path: Path, url: str):
    """Download ``url`` to ``download_path`` while showing a progress bar.

    Args:
        download_path: destination file for the downloaded content.
        url: address to fetch; its last ``/``-separated segment is used as
            the progress bar description.
    """
    # A bare `import urllib` does not load the `urllib.request` submodule; the
    # call below would then only work if some other module had already imported
    # it. Importing it here makes the function self-sufficient.
    import urllib.request

    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=download_path, reporthook=t.update_to)

In [10]:
def download_dataset(download_path: Path, url: str):
    """Fetch ``url`` into ``download_path``, printing a message before and after."""
    print("Downloading dataset...")
    download_url(download_path, url)
    print("Download complete!")

In [11]:
# Download URL of every dataset split, keyed by split name.
# All three files live in the same remote data folder as "<split>.json".
_DATA_BASE_URL = "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/main/2024-2025/Assignment%201/data"
urls = {
    split: f"{_DATA_BASE_URL}/{split}.json"
    for split in ("training", "test", "validation")
}

In [12]:
# Report the working directory; the datasets go into its "Datasets" subfolder
print(f"Current work directory: {Path.cwd()}")
dataset_folder = Path.cwd() / "Datasets"

Current work directory: /content


In [13]:
# exist_ok=True keeps the cell safe to re-run and avoids the gap between
# checking for the folder and creating it
dataset_folder.mkdir(parents=True, exist_ok=True)

In [14]:
# Fetch every split into "<dataset_folder>/<split>.json"
for name, url in urls.items():
    download_path = dataset_folder / f"{name}.json"
    download_dataset(download_path=download_path, url=url)

Downloading dataset...


training.json: 6.23MB [00:00, 14.3MB/s]                            


Download complete!
Downloading dataset...


test.json: 500kB [00:00, 2.11MB/s]                            


Download complete!
Downloading dataset...


validation.json: 1.16MB [00:00, 4.22MB/s]                            

Download complete!





## 2. Load the three JSON files and encode them as pandas dataframes.

In [15]:
def load_json_file(file_path: Path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [16]:
dataframe_rows = []

for name in urls:
    # Each split was saved as "<split>.json" inside the dataset folder
    file_path = dataset_folder.joinpath(f"{name}.json")
    json_data = load_json_file(file_path)

    # Every value of the JSON object becomes one row, tagged with its split
    for df_row in json_data.values():
        df_row["split"] = name
        dataframe_rows.append(df_row)


In [17]:
# Folder that receives one pickled dataframe per split
folder = Path.cwd().joinpath("Datasets", "Dataframes")
folder.mkdir(parents=True, exist_ok=True)


# transform the list of rows in a proper dataframe
df = pd.DataFrame(dataframe_rows)

for name in urls:
    # joinpath keeps the file inside `folder`; `with_name` would replace the
    # final "Dataframes" component and write next to the folder instead.
    df_path = folder.joinpath(f"{name}.pkl")
    # Store only the rows of this split, so each file matches its name
    df[df["split"] == name].to_pickle(df_path)

In [18]:
# Preview the combined dataframe (rows of all the loaded splits)
print(df)

     id_EXIST lang                                              tweet  \
0      100001   es  @TheChiflis Ignora al otro, es un capullo.El p...   
1      100002   es  @ultimonomada_ Si comicsgate se parece en algo...   
2      100003   es  @Steven2897 Lee sobre Gamergate, y como eso ha...   
3      100004   es  @Lunariita7 Un retraso social bastante lamenta...   
4      100005   es  @novadragon21 @icep4ck @TvDannyZ Entonces como...   
...       ...  ...                                                ...   
7953   400173   en  Amazing that the GOP is trying to take away ou...   
7954   400174   en  It is is impossible for a man to become a woma...   
7955   400175   en  If Gaga decided to sing 18 versions of Free Wo...   
7956   400176   en  This is your reminder that you can be child-fr...   
7957   400177   en  just completed my last final, i’m officially a...   

      number_annotators                                         annotators  \
0                     6  [Annotator_1, Annota

## 3. Generate hard labels

Generate hard labels for Task 1 using majority voting and store them in a new dataframe column called `hard_label_task1`. Items without a clear majority will be removed from the dataset.

In [19]:
def compute_majority_voting(labels: list):
    """Return the majority label of ``labels`` wrapped in a one-element list.

    Args:
        labels: the labels assigned to a single item.

    Returns:
        A list holding the single most frequent label, or None when
        ``labels`` is empty or several labels tie for the highest count.
    """
    # np.max raises on an empty array, so an empty vote has no majority
    if len(labels) == 0:
        return None

    unique_labels, counts = np.unique(labels, return_counts=True)

    # Keep every label that reaches the highest count
    majority_label = unique_labels[counts == counts.max()].tolist()

    # More than one winner means a tie, i.e. no clear majority
    if len(majority_label) > 1:
        return None

    return majority_label

In [20]:
def generate_hard_labels(df):
    """Add a ``hard_label_task1`` column and drop rows without a clear majority.

    The hard label of a row is the result of ``compute_majority_voting`` on
    its ``labels_task1`` value. Rows whose ``labels_task1`` is not a non-empty
    list get no hard label and are dropped as well.

    Args:
        df: dataframe with a ``labels_task1`` column. The new column is added
            to this dataframe in place.

    Returns:
        The rows of ``df`` whose ``hard_label_task1`` is not null.
    """
    hard_labels = []

    for labels in df['labels_task1']:
        if isinstance(labels, list) and len(labels) > 0:
            # Majority vote over the labels of this row
            hard_labels.append(compute_majority_voting(labels))
        else:
            # Keep exactly one entry per row: skipping the row here would make
            # `hard_labels` shorter than `df` and break the assignment below.
            hard_labels.append(None)

    # Add the hard labels as a new column
    df['hard_label_task1'] = hard_labels

    # Drop the rows without a clear majority
    df = df[df['hard_label_task1'].notnull()]

    return df

In [21]:
# Add `hard_label_task1` and keep only the rows that received a hard label
df = generate_hard_labels(df)
print(df)

     id_EXIST lang                                              tweet  \
0      100001   es  @TheChiflis Ignora al otro, es un capullo.El p...   
1      100002   es  @ultimonomada_ Si comicsgate se parece en algo...   
2      100003   es  @Steven2897 Lee sobre Gamergate, y como eso ha...   
4      100005   es  @novadragon21 @icep4ck @TvDannyZ Entonces como...   
5      100006   es  @yonkykong Aaah sí. Andrew Dobson. El que se d...   
...       ...  ...                                                ...   
7952   400172   en  @leesu44 @elishabroadway @markbann57 @SeaeyesT...   
7954   400174   en  It is is impossible for a man to become a woma...   
7955   400175   en  If Gaga decided to sing 18 versions of Free Wo...   
7956   400176   en  This is your reminder that you can be child-fr...   
7957   400177   en  just completed my last final, i’m officially a...   

      number_annotators                                         annotators  \
0                     6  [Annotator_1, Annota

## 4. Filter the DataFrame

Filter the DataFrame to keep only rows where the `lang` column is `'en'`.

In [22]:
# Keep only the rows whose language is English, then report the new shape
df = df[df['lang'].eq('en')]
print(df.shape)

(3314, 12)


## 5. Remove unwanted columns

Keep only `id_EXIST`, `lang`, `tweet`, and `hard_label_task1`.

In [23]:
def remove_unwanted_columns(df):
    """Return ``df`` restricted to id_EXIST, lang, tweet and hard_label_task1."""
    return df[['id_EXIST', 'lang', 'tweet', 'hard_label_task1']]

In [24]:
# Keep only the id_EXIST, lang, tweet and hard_label_task1 columns
df = remove_unwanted_columns(df)
print(df)

     id_EXIST lang                                              tweet  \
3661   200002   en  Writing a uni essay in my local pub with a cof...   
3662   200003   en  @UniversalORL it is 2021 not 1921. I dont appr...   
3665   200006   en  According to a customer I have plenty of time ...   
3666   200007   en  So only 'blokes' drink beer? Sorry, but if you...   
3667   200008   en  New to the shelves this week - looking forward...   
...       ...  ...                                                ...   
7952   400172   en  @leesu44 @elishabroadway @markbann57 @SeaeyesT...   
7954   400174   en  It is is impossible for a man to become a woma...   
7955   400175   en  If Gaga decided to sing 18 versions of Free Wo...   
7956   400176   en  This is your reminder that you can be child-fr...   
7957   400177   en  just completed my last final, i’m officially a...   

     hard_label_task1  
3661            [YES]  
3662            [YES]  
3665            [YES]  
3666            [YES]  
366

## 6. Encode the hard_label_task1 column

Use 1 to represent "YES" and 0 to represent "NO" in the `hard_label_task1` column.

In [25]:
# `df` was obtained by slicing another frame, so assigning a column on it
# triggers pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
# Each hard label is a one-element list: 'YES' becomes 1, anything else 0.
df = df.assign(
    hard_label_task1=df['hard_label_task1'].apply(lambda x: 1 if x[0] == 'YES' else 0)
)
print(df)

     id_EXIST lang                                              tweet  \
3661   200002   en  Writing a uni essay in my local pub with a cof...   
3662   200003   en  @UniversalORL it is 2021 not 1921. I dont appr...   
3665   200006   en  According to a customer I have plenty of time ...   
3666   200007   en  So only 'blokes' drink beer? Sorry, but if you...   
3667   200008   en  New to the shelves this week - looking forward...   
...       ...  ...                                                ...   
7952   400172   en  @leesu44 @elishabroadway @markbann57 @SeaeyesT...   
7954   400174   en  It is is impossible for a man to become a woma...   
7955   400175   en  If Gaga decided to sing 18 versions of Free Wo...   
7956   400176   en  This is your reminder that you can be child-fr...   
7957   400177   en  just completed my last final, i’m officially a...   

      hard_label_task1  
3661                 1  
3662                 1  
3665                 1  
3666                 1 

# Task 2: Data Cleaning

In [26]:
# Download the NLTK resources needed by the cleaning functions below.
# NOTE(review): presumably omw-1.4/wordnet back WordNetLemmatizer, the
# averaged_perceptron_tagger* packages back pos_tag and punkt* back
# word_tokenize - confirm against the installed NLTK version.
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

For reference, see the list of [all existing emojis](https://www.unicode.org/Public/emoji/1.0/emoji-data.txt) and the [Unicode Consortium emoji list](https://unicode.org/emoji/charts/full-emoji-list.html).

In [28]:
def remove_emoji(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

In [29]:
def remove_hastag(text):
    """Delete every ``#`` that is followed by non-whitespace, together with that run."""
    return re.sub(r'#\S+', r'', text)

In [30]:
def remove_mention(text):
    """Delete every ``@`` that is followed by non-whitespace, together with that run."""
    return re.sub(r'@\S+', r'', text)

In [31]:
def remove_URL(text):
    """Delete http(s):// links and ``www.`` addresses up to the next whitespace."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [32]:
def remove_special_characters(text):
    """Keep only ASCII letters, digits and whitespace; drop every other character."""
    not_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
    return not_alphanumeric.sub('', text)

In [33]:
def remove_quotes(text):
    """Strip a leading/trailing double quote and every curly single quote.

    Removes a ``"`` at the very start of ``text``, a ``"`` at the very end,
    and all occurrences of ``‘`` and ``’``. Double quotes inside the text are
    left untouched.
    """
    # The three cases must be separate alternatives: written as '"$‘’' the
    # curly quotes would be required *after* the end of the string, which can
    # never match, so only the leading quote would ever be removed.
    pattern = r'^"|"$|[‘’]'
    return re.sub(pattern, '', text)

In [34]:
def remove_extra_spaces(text):
    """Collapse every run of whitespace characters into a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [35]:
def lemmatize(text):
    """Tokenize ``text``, POS-tag it and return the lemmas joined by spaces.

    Each token is lemmatized with the WordNet part of speech derived from
    the first letter of its tag.
    """
    def get_wordnet_key(treebank_tag):
        # Map the tag's leading letter to a WordNet part of speech
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('N'):
            return wordnet.NOUN
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        # Any other tag falls back to 'n'
        return 'n'

    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_key(tag)))
    return " ".join(lemmas)

In [37]:
def clean_tweet(tweet: str):
    """Run ``tweet`` through the cleaning steps, in order, and return the result.

    Steps: emoji, hashtag, mention and URL removal, special-character
    removal, quote removal, whitespace collapsing and lemmatization.
    """
    # Order matters: each step receives the output of the previous one
    cleaning_steps = (
        remove_emoji,
        remove_hastag,
        remove_mention,
        remove_URL,
        remove_special_characters,
        remove_quotes,
        remove_extra_spaces,
        lemmatize,
    )
    for step in cleaning_steps:
        tweet = step(tweet)
    return tweet

In [38]:
# `df` was obtained by slicing another frame, so assigning a column on it
# triggers pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
df = df.assign(tweet=df['tweet'].apply(clean_tweet))

In [39]:
# Preview the cleaned tweets
print(df['tweet'])

3661    Writing a uni essay in my local pub with a cof...
3662    it be 2021 not 1921 I dont appreciate that on ...
3665    According to a customer I have plenty of time ...
3666    So only blokes drink beer Sorry but if you are...
3667    New to the shelf this week look forward to rea...
                              ...                        
7952    There be even more way for a woman to prevent ...
7954    It be be impossible for a man to become a woma...
7955    If Gaga decide to sing 18 version of Free Woma...
7956    This be your reminder that you can be childfre...
7957    just complete my last final im officially a fr...
Name: tweet, Length: 3314, dtype: object


# Task 3: Text Encoding