## Removing Stopwords

In [37]:
import os
import json
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Make sure the NLTK stop-word corpus is available locally before loading it
nltk.download('stopwords')
# Italian stop words, kept in a set for the membership tests in preprocess_text
stop_words = set(stopwords.words('italian'))

def preprocess_text(text):
    """Strip punctuation, lowercase the text and drop Italian stop words.

    Punctuation is replaced by a space instead of being deleted, so elided
    forms are split into two tokens ("l'hotel" -> "l hotel") and the leading
    article can be matched against ``stop_words``. Deleting the apostrophe
    would fuse the two words into "lhotel", which no stop word matches.

    Args:
        text: The raw sentence.

    Returns:
        The surviving lowercase tokens joined by single spaces.
    """
    # Map every ASCII punctuation character to a space
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)
    # The stop-word membership test is case-sensitive, so lowercase first
    text = text.lower()
    # split() without arguments also collapses the runs of spaces created above
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)

def remove_stopwords(file_path, output_dir):
    """Pre-process the ``sentence`` field of an NDJSON file and save a copy.

    Every non-blank line of ``file_path`` is parsed as a JSON object, its
    ``'sentence'`` value is replaced by ``preprocess_text(sentence)``, and the
    records are written back, one JSON object per line, to a file with the
    same base name inside ``output_dir``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'sentence'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read the records, skipping blank lines (e.g. an empty line at end of file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Pre-process the sentence of every record in place
    for item in data:
        item['sentence'] = preprocess_text(item['sentence'])
    # Save the pre-processed records under the same file name
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    # Explicit encoding keeps the output independent of the platform default
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in data)

# Remove stop words from the training, test, gold and dev files of the dataset
for dataset_file in (
    'Dataset/ate_absita_training.ndjson',
    'Dataset/ate_absita_test.ndjson',
    'Dataset/ate_absita_gold.ndjson',
    'Dataset/ate_absita_dev.ndjson',
):
    remove_stopwords(dataset_file, 'RemovedStopWordData')

def extract_columns(file_path, output_dir):
    """Extract the ``sentence`` and ``score`` fields of an NDJSON file.

    The result is a single JSON object of the form
    ``{"sentences": [...], "scores": [...]}`` written to
    ``<output_dir>/<input base name without extension>_extracted.json``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'sentence'`` or ``'score'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read with an explicit encoding (like the other readers in this file)
    # and skip blank lines such as an empty line at end of file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Collect the two columns of interest, preserving record order
    extracted_data = {
        'sentences': [item['sentence'] for item in data],
        'scores': [item['score'] for item in data],
    }
    # Save the dictionary to a new JSON file named after the input file
    base_name, _ = os.path.splitext(os.path.basename(file_path))
    output_path = os.path.join(output_dir, f'{base_name}_extracted.json')
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f)


# Extract the relevant columns from the files whose stop words were removed
for cleaned_file in (
    'RemovedStopWordData/ate_absita_training.ndjson',
    'RemovedStopWordData/ate_absita_gold.ndjson',
    'RemovedStopWordData/ate_absita_dev.ndjson',
):
    extract_columns(cleaned_file, 'RemovedStopWordData')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ferdo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Lemmatizzazione

In [38]:
import os
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Make sure the NLTK resources used below (stop-word lists, WordNet) are available
nltk.download('stopwords')
nltk.download('wordnet')
# Italian stop words, kept in a set for the membership tests in preprocess_text
stop_words = set(stopwords.words('italian'))
# NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
# presumably leaves most Italian words unchanged - confirm this step has the
# intended effect on the Italian dataset.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Strip punctuation, lowercase, drop Italian stop words and lemmatize.

    Punctuation is replaced by a space instead of being deleted, so elided
    forms are split into two tokens ("l'hotel" -> "l hotel") and the leading
    article can be matched against ``stop_words``. Deleting the apostrophe
    would fuse the two words into "lhotel", which no stop word matches.

    Args:
        text: The raw sentence.

    Returns:
        The lemmatized, lowercase tokens that are not stop words, joined by
        single spaces.
    """
    # Map every ASCII punctuation character to a space
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)
    # The stop-word membership test is case-sensitive, so lowercase first
    text = text.lower()
    # Filter the stop words, then lemmatize only the surviving tokens;
    # split() without arguments also collapses the runs of spaces created above
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)

def preprocess_file(file_path, output_dir):
    """Pre-process the ``sentence`` field of an NDJSON file and save a copy.

    Every non-blank line of ``file_path`` is parsed as a JSON object, its
    ``'sentence'`` value is replaced by ``preprocess_text(sentence)``, and the
    records are written back, one JSON object per line, to a file with the
    same base name inside ``output_dir``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'sentence'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read the records, skipping blank lines (e.g. an empty line at end of file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Pre-process the sentence of every record in place
    for item in data:
        item['sentence'] = preprocess_text(item['sentence'])
    # Save the pre-processed records under the same file name
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    # Explicit encoding keeps the output independent of the platform default
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in data)

# Apply the lemmatizing pre-processing to the training, test, gold and dev files
for dataset_file in (
    'Dataset/ate_absita_training.ndjson',
    'Dataset/ate_absita_test.ndjson',
    'Dataset/ate_absita_gold.ndjson',
    'Dataset/ate_absita_dev.ndjson',
):
    preprocess_file(dataset_file, 'LemmatizedData')

def extract_columns(file_path, output_dir):
    """Extract the ``sentence`` and ``score`` fields of an NDJSON file.

    The result is a single JSON object of the form
    ``{"sentences": [...], "scores": [...]}`` written to
    ``<output_dir>/<input base name without extension>_extracted.json``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'sentence'`` or ``'score'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read the records, skipping blank lines (e.g. an empty line at end of file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Collect the two columns of interest, preserving record order
    extracted_data = {
        'sentences': [item['sentence'] for item in data],
        'scores': [item['score'] for item in data],
    }
    # Save the dictionary to a new JSON file named after the input file
    base_name, _ = os.path.splitext(os.path.basename(file_path))
    output_path = os.path.join(output_dir, f'{base_name}_extracted.json')
    # Explicit encoding keeps the output independent of the platform default
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f)

# Extract the relevant columns from the lemmatized training, gold and dev files
for lemmatized_file in (
    'LemmatizedData/ate_absita_training.ndjson',
    'LemmatizedData/ate_absita_gold.ndjson',
    'LemmatizedData/ate_absita_dev.ndjson',
):
    extract_columns(lemmatized_file, 'LemmatizedData')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ferdo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ferdo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Stemmatizzazione

In [None]:
import os
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Make sure the NLTK stop-word corpus is available locally before loading it
nltk.download('stopwords')
# Italian stop words, kept in a set for the membership tests in preprocess_text
stop_words = set(stopwords.words('italian'))
# The Porter algorithm implements English suffix rules only; the dataset is
# filtered with Italian stop words, so use the Snowball stemmer for Italian.
# It exposes the same stem(word) method used by preprocess_text.
stemmer = nltk.stem.SnowballStemmer('italian')

def preprocess_text(text):
    """Strip punctuation, lowercase, drop Italian stop words and stem.

    Punctuation is replaced by a space instead of being deleted, so elided
    forms are split into two tokens ("l'hotel" -> "l hotel") and the leading
    article can be matched against ``stop_words``. Deleting the apostrophe
    would fuse the two words into "lhotel", which no stop word matches.

    Args:
        text: The raw sentence.

    Returns:
        The stemmed, lowercase tokens that are not stop words, joined by
        single spaces.
    """
    # Map every ASCII punctuation character to a space
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)
    # The stop-word membership test is case-sensitive, so lowercase first
    text = text.lower()
    # Filter the stop words, then stem only the surviving tokens;
    # split() without arguments also collapses the runs of spaces created above
    words = [
        stemmer.stem(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)

def preprocess_file(file_path, output_dir):
    """Pre-process the ``sentence`` field of an NDJSON file and save a copy.

    Every non-blank line of ``file_path`` is parsed as a JSON object, its
    ``'sentence'`` value is replaced by ``preprocess_text(sentence)``, and the
    records are written back, one JSON object per line, to a file with the
    same base name inside ``output_dir``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'sentence'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read the records, skipping blank lines (e.g. an empty line at end of file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Pre-process the sentence of every record in place
    for item in data:
        item['sentence'] = preprocess_text(item['sentence'])
    # Save the pre-processed records under the same file name
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    # Explicit encoding keeps the output independent of the platform default
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in data)

# Apply the stemming pre-processing to the training, test, gold and dev files
for dataset_file in (
    'Dataset/ate_absita_training.ndjson',
    'Dataset/ate_absita_test.ndjson',
    'Dataset/ate_absita_gold.ndjson',
    'Dataset/ate_absita_dev.ndjson',
):
    preprocess_file(dataset_file, 'StemmedData')

def extract_columns(file_path, output_dir):
    """Extract the ``sentence`` and ``score`` fields of an NDJSON file.

    The result is a single JSON object of the form
    ``{"sentences": [...], "scores": [...]}`` written to
    ``<output_dir>/<input base name without extension>_extracted.json``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'sentence'`` or ``'score'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read the records, skipping blank lines (e.g. an empty line at end of file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Collect the two columns of interest, preserving record order
    extracted_data = {
        'sentences': [item['sentence'] for item in data],
        'scores': [item['score'] for item in data],
    }
    # Save the dictionary to a new JSON file named after the input file
    base_name, _ = os.path.splitext(os.path.basename(file_path))
    output_path = os.path.join(output_dir, f'{base_name}_extracted.json')
    # Explicit encoding keeps the output independent of the platform default
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f)

# Extract the relevant columns from the stemmed training, gold and dev files.
# The third call previously repeated the gold file, so the dev file was never
# extracted for the stemmed variant.
extract_columns('StemmedData/ate_absita_training.ndjson', 'StemmedData')
extract_columns('StemmedData/ate_absita_gold.ndjson', 'StemmedData')
extract_columns('StemmedData/ate_absita_dev.ndjson', 'StemmedData')

## Default

In [39]:
def extract_columns(file_path, output_dir):
    """Extract the ``sentence`` and ``score`` fields of an NDJSON file.

    The result is a single JSON object of the form
    ``{"sentences": [...], "scores": [...]}`` written to
    ``<output_dir>/<input base name without extension>_extracted.json``.

    Args:
        file_path: Path of the input NDJSON file (read as UTF-8).
        output_dir: Directory that receives the output file; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'sentence'`` or ``'score'`` key.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
    # Read the records, skipping blank lines (e.g. an empty line at end of file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Collect the two columns of interest, preserving record order
    extracted_data = {
        'sentences': [item['sentence'] for item in data],
        'scores': [item['score'] for item in data],
    }
    # Save the dictionary to a new JSON file named after the input file
    base_name, _ = os.path.splitext(os.path.basename(file_path))
    output_path = os.path.join(output_dir, f'{base_name}_extracted.json')
    # Explicit encoding keeps the output independent of the platform default
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f)

# Extract the relevant columns from the original, unprocessed dataset files
for original_file in (
    'Dataset/ate_absita_training.ndjson',
    'Dataset/ate_absita_gold.ndjson',
    'Dataset/ate_absita_dev.ndjson',
):
    extract_columns(original_file, 'DefaultData')