# EMit Emotion Detection – Fine‑tuning setup

In [None]:

import sys, os, platform
import pandas as pd
import numpy as np
import torch
import transformers
import datasets
import emoji
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Report the interpreter and key library versions (plus CUDA availability)
# so that a run of this notebook can be reproduced later.
for version_line in (
    f'Python: {platform.python_version()}',
    f'PyTorch: {torch.__version__} (CUDA: {torch.cuda.is_available()})',
    f'Transformers: {transformers.__version__}',
    f'Datasets: {datasets.__version__}',
):
    print(version_line)


## Caricamento dei file .csv

In [None]:

DATA_DIR = 'data'  # change this if the .csv files live elsewhere

# Read the two training files and the test file, in that order, into
# separate DataFrames.
train_a, train_b, test_in = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'{DATA_DIR}/emit_train_A.csv',
        f'{DATA_DIR}/emit_train_B.csv',
        f'{DATA_DIR}/emit_test.csv',
    )
)

# Preview the first rows of training set A in the notebook output.
display(train_a.head())


## Statistiche per etichetta

In [None]:

# Names of the label columns used throughout the notebook.
labels = [
    'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy',
    'Love', 'Neutral', 'Sadness', 'Surprise', 'Trust',
]

# Column sums per label, ordered from the largest total to the smallest.
label_totals = train_a[labels].sum()
stats = label_totals.sort_values(ascending=False)

# Show the distribution both as a bar chart and as plain numbers.
stats.plot(kind='bar', title="Distribuzione etichette")
print(stats)


## Pre-processing: pulizia testi, token speciali, emoji

In [None]:

import re

# Placeholder tokens substituted for URLs, user mentions and hashtag markers.
URL_TOKEN = '<URL>'
USER_TOKEN = '<USER>'
HASHTAG_TOKEN = '<HASHTAG>'

# Compiled once at cell execution; clean() is invoked once per row.
_URL_RE = re.compile(r'https?://\S+')
_USER_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#(\w+)')


def clean(text):
    """Normalize one raw text before it is handed to the tokenizer.

    Steps, applied in order:
      1. every ``http://`` / ``https://`` URL becomes ``URL_TOKEN``;
      2. every ``@mention`` becomes ``USER_TOKEN``;
      3. every ``#tag`` becomes ``HASHTAG_TOKEN`` followed by a space and
         the tag word itself, so the word is kept;
      4. emoji are replaced by their Italian textual names via
         ``emoji.demojize``.

    Args:
        text: The raw text. Missing values (``None``, NaN, ``pd.NA``) are
            accepted and mapped to the empty string; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned text as a ``str``.
    """
    if not isinstance(text, str):
        # A missing cell (e.g. NaN) is not a string, and re.sub raises
        # TypeError on it; treat it as empty text instead of aborting
        # the whole column-wise apply.
        if pd.isna(text):
            return ''
        text = str(text)

    # URLs go first so that an '@' or '#' inside a link is not rewritten.
    text = _URL_RE.sub(URL_TOKEN, text)
    text = _USER_RE.sub(USER_TOKEN, text)
    text = _HASHTAG_RE.sub(HASHTAG_TOKEN + r' \1', text)
    return emoji.demojize(text, language='it')

# Store the cleaned version of each text in a new column, then show the raw
# and cleaned columns side by side for a quick visual check.
train_a['text_clean'] = train_a['text'].map(clean)
train_a[['text', 'text_clean']].head()


## Split 90/10 stratificato (MultilabelStratifiedShuffleSplit)

In [None]:

# Inputs for the splitter: the cleaned texts and the label columns as arrays.
X = train_a['text_clean'].to_numpy()
Y = train_a[labels].to_numpy()

# A single shuffle split holding out 10% of the rows, stratified on the
# label matrix; the fixed seed keeps the split reproducible.
msss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.10,
    random_state=42,
)
train_idx, val_idx = next(msss.split(X, Y))

# Materialize both partitions with a fresh 0..n-1 index.
train_df, val_df = (
    train_a.iloc[row_positions].reset_index(drop=True)
    for row_positions in (train_idx, val_idx)
)

print(f"Train size: {train_df.shape[0]}, Validation size: {val_df.shape[0]}")
