In [None]:
from sklearn.datasets import fetch_20newsgroups         # Dataset che contiene testo di articoli di giornale appartenenti a 20 categorie differenti
import nltk                                             # NLP toolkit
import re                                               # Libreria per operazioni con le espressioni regolari

# Download the Punkt tokenizer data required by nltk.word_tokenize.
# Recent NLTK releases (3.8.2 and later) load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so both are fetched to keep the notebook
# runnable regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Scarichiamo il nostro dataset

In [None]:
# Load only the training split of the "sci.space" newsgroup. Headers, footers
# and quoted replies are stripped so that just the message body is kept.
# return_X_y=True makes the call return a (texts, targets) pair.
train_news_texts, category = fetch_20newsgroups(
    subset="train",
    categories=["sci.space"],
    remove=["headers", "footers", "quotes"],
    return_X_y=True,
)

In [None]:
# Use the first article of the corpus as the running example.
prima_news = train_news_texts[0]
print(prima_news)


Any lunar satellite needs fuel to do regular orbit corrections, and when
its fuel runs out it will crash within months.  The orbits of the Apollo
motherships changed noticeably during lunar missions lasting only a few
days.  It is *possible* that there are stable orbits here and there --
the Moon's gravitational field is poorly mapped -- but we know of none.

Perturbations from Sun and Earth are relatively minor issues at low
altitudes.  The big problem is that the Moon's own gravitational field
is quite lumpy due to the irregular distribution of mass within the Moon.


# Operazioni di preprocessing per lavorare con il testo

## Convertire in minuscolo

In [None]:
# Normalize case: convert the whole text to lowercase.
prima_news = prima_news.lower()
print(prima_news)


any lunar satellite needs fuel to do regular orbit corrections, and when
its fuel runs out it will crash within months.  the orbits of the apollo
motherships changed noticeably during lunar missions lasting only a few
days.  it is *possible* that there are stable orbits here and there --
the moon's gravitational field is poorly mapped -- but we know of none.

perturbations from sun and earth are relatively minor issues at low
altitudes.  the big problem is that the moon's own gravitational field
is quite lumpy due to the irregular distribution of mass within the moon.


## Rimozione di caratteri speciali

In [None]:
# Turn every run of whitespace (newlines, tabs, carriage returns, ...) into a
# single space first. Replacing only '\n' would let the filter below delete
# the other whitespace characters and glue the neighbouring words together.
prima_news = re.sub(r'\s+', ' ', prima_news)
# Keep only ASCII letters, digits, '.', '?', '!' and spaces.
prima_news = re.sub(r'[^a-zA-Z0-9.?! ]+', '', prima_news)
# Removing characters can leave several spaces in a row: collapse them.
prima_news = re.sub(' +', ' ', prima_news)
prima_news = prima_news.strip()

print(prima_news)


any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within months. the orbits of the apollo motherships changed noticeably during lunar missions lasting only a few days. it is possible that there are stable orbits here and there the moons gravitational field is poorly mapped but we know of none. perturbations from sun and earth are relatively minor issues at low altitudes. the big problem is that the moons own gravitational field is quite lumpy due to the irregular distribution of mass within the moon.


## Separazione dei token

In [None]:
# Everything before the first '.' is treated as the first sentence.
prima_news_sentence = prima_news.partition('.')[0]

# Naive tokenization: cut the sentence at every single space.
prima_news_sentence_splitted = prima_news_sentence.split(' ')
print(f'{prima_news_sentence} -> {prima_news_sentence_splitted}')


any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within months -> ['any', 'lunar', 'satellite', 'needs', 'fuel', 'to', 'do', 'regular', 'orbit', 'corrections', 'and', 'when', 'its', 'fuel', 'runs', 'out', 'it', 'will', 'crash', 'within', 'months']


In [None]:
# Tokenize the same sentence with NLTK's word tokenizer, to compare it with the
# plain split on spaces.
prima_news_tokenized = nltk.word_tokenize(prima_news_sentence)
print(f'{prima_news_sentence} -> {prima_news_tokenized}')


any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within months -> ['any', 'lunar', 'satellite', 'needs', 'fuel', 'to', 'do', 'regular', 'orbit', 'corrections', 'and', 'when', 'its', 'fuel', 'runs', 'out', 'it', 'will', 'crash', 'within', 'months']


In [None]:
# Pair every token with its number of characters.
word_lengths = list(zip(prima_news_tokenized, map(len, prima_news_tokenized)))
print(f' Lengths of the words: \n{word_lengths}')

 Lengths of the words: 
[('any', 3), ('lunar', 5), ('satellite', 9), ('needs', 5), ('fuel', 4), ('to', 2), ('do', 2), ('regular', 7), ('orbit', 5), ('corrections', 11), ('and', 3), ('when', 4), ('its', 3), ('fuel', 4), ('runs', 4), ('out', 3), ('it', 2), ('will', 4), ('crash', 5), ('within', 6), ('months', 6)]


# Mettiamo tutto insieme in un'unica funzione

In [None]:
def preprocessing(text):
    """
    Normalize a raw text and split it into tokens.

    The text is lowercased, every run of whitespace is turned into a single
    space, every character other than ASCII letters, digits, '.', '?', '!'
    and the space is removed, and the cleaned text is tokenized with
    ``nltk.word_tokenize``.

    Args:
        text: string containing the text to preprocess and tokenize

    Returns:
        the tokens produced by ``nltk.word_tokenize`` for the cleaned text
    """
    text = text.lower()
    # Map all whitespace (newlines, tabs, carriage returns, ...) to a space
    # before filtering. Replacing only '\n' would let the filter below delete
    # the other whitespace characters and fuse the words they separated.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9.?! ]+', '', text)
    # The filter can leave several spaces in a row (e.g. around "--").
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return nltk.word_tokenize(text)


In [None]:
# Run the whole pipeline on another article: show the raw text, then its tokens.
second_news =  train_news_texts[1]
print(second_news)
tokenized_second_news = preprocessing(second_news)
print(tokenized_second_news)


Glad to see Griffin is spending his time on engineering rather than on
ritual purification of the language.  Pity he got stuck with the turkey
rather than one of the sensible options.
['glad', 'to', 'see', 'griffin', 'is', 'spending', 'his', 'time', 'on', 'engineering', 'rather', 'than', 'on', 'ritual', 'purification', 'of', 'the', 'language', '.', 'pity', 'he', 'got', 'stuck', 'with', 'the', 'turkey', 'rather', 'than', 'one', 'of', 'the', 'sensible', 'options', '.']

Glad to see Griffin is spending his time on engineering rather than on
ritual purification of the language.  Pity he got stuck with the turkey
rather than one of the sensible options.
['glad', 'to', 'see', 'griffin', 'is', 'spending', 'his', 'time', 'on', 'engineering', 'rather', 'than', 'on', 'ritual', 'purification', 'of', 'the', 'language', '.', 'pity', 'he', 'got', 'stuck', 'with', 'the', 'turkey', 'rather', 'than', 'one', 'of', 'the', 'sensible', 'options', '.']


## Creazione degli N-Grammi

In [None]:
def sentence_to_ngram(tokenized_sentence, n=3):
    """
    Return every n-gram contained in a tokenized sentence.

    A window of ``n`` consecutive tokens is slid over the sentence one
    position at a time; each window position yields one n-gram.

    Args:
        tokenized_sentence: list of words/tokens that make up the sentence
        n: number of tokens in each n-gram (must be at least 1)

    Returns:
        list of all the n-grams of the sentence, in order of appearance. Each
        n-gram is a slice of ``tokenized_sentence``. The list is empty when
        the sentence has fewer than ``n`` tokens.

    Raises:
        ValueError: if ``n`` is smaller than 1
    """
    if n < 1:
        # A zero or negative size would not fail on its own: it would silently
        # produce meaningless windows, so reject it explicitly.
        raise ValueError(f'n must be at least 1, got {n}')

    # Index of the last position at which a full window of n tokens still fits.
    last_start = len(tokenized_sentence) - n
    return [tokenized_sentence[start : start + n] for start in range(last_start + 1)]



In [None]:
# Size of the n-grams (3 -> trigrams); the same value is also used further down
# to size the start-of-sentence padding.
n = 3
print(f'Lista di tutti i trigrammi della frase: {prima_news_tokenized}\n')
# Pass the shared `n` instead of repeating the literal 3, so the n-gram size is
# defined in exactly one place.
trigram_list = sentence_to_ngram(prima_news_tokenized, n=n)
print(trigram_list)

Lista di tutti i trigrammi della frase: ['any', 'lunar', 'satellite', 'needs', 'fuel', 'to', 'do', 'regular', 'orbit', 'corrections', 'and', 'when', 'its', 'fuel', 'runs', 'out', 'it', 'will', 'crash', 'within', 'months']

[['any', 'lunar', 'satellite'], ['lunar', 'satellite', 'needs'], ['satellite', 'needs', 'fuel'], ['needs', 'fuel', 'to'], ['fuel', 'to', 'do'], ['to', 'do', 'regular'], ['do', 'regular', 'orbit'], ['regular', 'orbit', 'corrections'], ['orbit', 'corrections', 'and'], ['corrections', 'and', 'when'], ['and', 'when', 'its'], ['when', 'its', 'fuel'], ['its', 'fuel', 'runs'], ['fuel', 'runs', 'out'], ['runs', 'out', 'it'], ['out', 'it', 'will'], ['it', 'will', 'crash'], ['will', 'crash', 'within'], ['crash', 'within', 'months']]
Lista di tutti i trigrammi della frase: ['any', 'lunar', 'satellite', 'needs', 'fuel', 'to', 'do', 'regular', 'orbit', 'corrections', 'and', 'when', 'its', 'fuel', 'runs', 'out', 'it', 'will', 'crash', 'within', 'months']

[['any', 'lunar', 'satell

## Aggiunta dei token di inizio (`<s>`) e fine (`</s>`)

In [None]:
# Surround the sentence with n - 1 start markers and a single end marker.
prima_news_tokenized_complete = [*(['<s>'] * (n - 1)), *prima_news_tokenized, '</s>']
print(prima_news_tokenized_complete)

['<s>', '<s>', 'any', 'lunar', 'satellite', 'needs', 'fuel', 'to', 'do', 'regular', 'orbit', 'corrections', 'and', 'when', 'its', 'fuel', 'runs', 'out', 'it', 'will', 'crash', 'within', 'months', '</s>']
['<s>', '<s>', 'any', 'lunar', 'satellite', 'needs', 'fuel', 'to', 'do', 'regular', 'orbit', 'corrections', 'and', 'when', 'its', 'fuel', 'runs', 'out', 'it', 'will', 'crash', 'within', 'months', '</s>']


In [None]:
# The padded sentence was built with n - 1 start markers, so the n-gram size
# has to be that same `n`; a hard-coded 3 would silently go out of sync with
# the padding if `n` were changed.
full_trigram_list = sentence_to_ngram(prima_news_tokenized_complete, n=n)
print(full_trigram_list)

[['<s>', '<s>', 'any'], ['<s>', 'any', 'lunar'], ['any', 'lunar', 'satellite'], ['lunar', 'satellite', 'needs'], ['satellite', 'needs', 'fuel'], ['needs', 'fuel', 'to'], ['fuel', 'to', 'do'], ['to', 'do', 'regular'], ['do', 'regular', 'orbit'], ['regular', 'orbit', 'corrections'], ['orbit', 'corrections', 'and'], ['corrections', 'and', 'when'], ['and', 'when', 'its'], ['when', 'its', 'fuel'], ['its', 'fuel', 'runs'], ['fuel', 'runs', 'out'], ['runs', 'out', 'it'], ['out', 'it', 'will'], ['it', 'will', 'crash'], ['will', 'crash', 'within'], ['crash', 'within', 'months'], ['within', 'months', '</s>']]
[['<s>', '<s>', 'any'], ['<s>', 'any', 'lunar'], ['any', 'lunar', 'satellite'], ['lunar', 'satellite', 'needs'], ['satellite', 'needs', 'fuel'], ['needs', 'fuel', 'to'], ['fuel', 'to', 'do'], ['to', 'do', 'regular'], ['do', 'regular', 'orbit'], ['regular', 'orbit', 'corrections'], ['orbit', 'corrections', 'and'], ['corrections', 'and', 'when'], ['and', 'when', 'its'], ['when', 'its', 'fuel'