In [None]:
from sklearn.datasets import fetch_20newsgroups         # Loader for the "20 newsgroups" text dataset (posts grouped into 20 categories)
import nltk                                             # NLP toolkit
import re                                               # Regular-expression operations
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import random

# NOTE(review): recent NLTK releases reportedly look for the 'punkt_tab' resource instead — confirm against the installed version.
nltk.download('punkt')                                  # Downloads the 'Punkt' tokenizer data used by the NLTK tokenizers below

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Scarichiamo il nostro dataset

In [None]:
# Fetch only the training split of the "sci.space" category, with headers,
# footers and quoted replies removed; the call result is unpacked into the
# post texts and their category labels.
train_news_texts, category = fetch_20newsgroups(
    subset="train",
    categories=["sci.space"],
    remove=['headers', 'footers', 'quotes'],
    return_X_y=True,
)

## Definiamo le funzioni create in precedenza per preprocessare il testo e creare gli n-grammi

In [None]:
def preprocessing(text):
    """
    Normalize a raw text and split it into tokens.

    The text is lower-cased, every run of whitespace (newlines, tabs,
    carriage returns, ...) is collapsed into a single space, and every
    character other than ASCII letters, digits, '.', '?', '!' and space is
    dropped. The cleaned string is then tokenized with ``nltk.word_tokenize``.

    Args:
        text: string containing the text to preprocess and tokenize

    Returns:
        list of tokens obtained from the cleaned text
    """
    text = text.lower()
    # Convert *all* whitespace to spaces before filtering characters: the
    # filter below deletes anything that is not a plain space, so a tab or a
    # carriage return between two words would otherwise glue them together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9.?! ]+', '', text)
    # Removing punctuation such as " , " can leave double spaces behind.
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return nltk.word_tokenize(text)


In [None]:
def sentence_to_ngram(tokenized_sentence, n=3):
    """
    Return every n-gram contained in a tokenized sentence.

    The sentence is padded with ``n - 1`` start markers ``'<s>'`` in front and
    a single end marker ``'</s>'`` at the back, then a window of ``n`` tokens
    is slid over it one position at a time. The input is not modified.

    Args:
        tokenized_sentence: iterable of words/tokens that make up the sentence
                            (list, tuple, generator, ...)
        n: size of the n-grams to extract; must be at least 1

    Returns:
        list of all n-grams of the padded sentence, in order of appearance;
        each n-gram is a list of ``n`` tokens

    Raises:
        ValueError: if ``n`` is smaller than 1
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    # list(...) copies the input and lets tuples/generators be concatenated.
    padded = ['<s>'] * (n - 1) + list(tokenized_sentence) + ['</s>']
    # The window starts at position i and always contains exactly n tokens.
    return [padded[i : i + n] for i in range(len(padded) - n + 1)]



## Creiamo una matrice dei conteggi

In [None]:
# Split the first post into sentences once, then assemble a three-sentence toy
# corpus in which the first sentence appears twice (so its n-grams count 2).
first_post_sentences = nltk.sent_tokenize(train_news_texts[0].strip())
train_news_texts_sample = [first_post_sentences[idx] for idx in (0, 1, 0)]

for text in train_news_texts_sample:
  print(text, "-" * 80, sep="\n")


Any lunar satellite needs fuel to do regular orbit corrections, and when
its fuel runs out it will crash within months.
--------------------------------------------------------------------------------
The orbits of the Apollo
motherships changed noticeably during lunar missions lasting only a few
days.
--------------------------------------------------------------------------------
Any lunar satellite needs fuel to do regular orbit corrections, and when
its fuel runs out it will crash within months.
--------------------------------------------------------------------------------


In [None]:
n = 3  # order of the n-gram model used in the rest of the notebook

# Tokenize every sample sentence.
train_news_texts_sample_processed = list(map(preprocessing, train_news_texts_sample))

# Extract the n-grams of each sentence and flatten them into a single list of
# tuples (tuples are hashable, so they can be used as Counter keys).
trigrams_list = [
    tuple(ngram)
    for tokens in train_news_texts_sample_processed
    for ngram in sentence_to_ngram(tokens, n)
]
count_trigrams = Counter(trigrams_list)
count_trigrams

Counter({('<s>', '<s>', 'any'): 2,
         ('<s>', 'any', 'lunar'): 2,
         ('any', 'lunar', 'satellite'): 2,
         ('lunar', 'satellite', 'needs'): 2,
         ('satellite', 'needs', 'fuel'): 2,
         ('needs', 'fuel', 'to'): 2,
         ('fuel', 'to', 'do'): 2,
         ('to', 'do', 'regular'): 2,
         ('do', 'regular', 'orbit'): 2,
         ('regular', 'orbit', 'corrections'): 2,
         ('orbit', 'corrections', 'and'): 2,
         ('corrections', 'and', 'when'): 2,
         ('and', 'when', 'its'): 2,
         ('when', 'its', 'fuel'): 2,
         ('its', 'fuel', 'runs'): 2,
         ('fuel', 'runs', 'out'): 2,
         ('runs', 'out', 'it'): 2,
         ('out', 'it', 'will'): 2,
         ('it', 'will', 'crash'): 2,
         ('will', 'crash', 'within'): 2,
         ('crash', 'within', 'months'): 2,
         ('within', 'months', '.'): 2,
         ('months', '.', '</s>'): 2,
         ('<s>', '<s>', 'the'): 1,
         ('<s>', 'the', 'orbits'): 1,
         ('the', 'orbit

In [None]:
def single_pass_ngram_count_matrix(count_ngrams):
    """
    Build the n-gram count matrix from a mapping of n-gram counts.

    Each n-gram is split into its prefix (all tokens but the last) and its
    last word. Prefixes become the rows and last words the columns; both are
    kept in order of first appearance while iterating over ``count_ngrams``.

    Args:
        count_ngrams: mapping (e.g. a Counter) from n-gram to its number of
                      occurrences in the corpus

    Returns:
        n_minus_one_grams: list of all distinct (n-1)-gram prefixes, used as
                           the row index of the matrix
        vocabulary: list of all distinct last words, used as the column index
        count_matrix: pandas DataFrame with the prefixes as rows, the
                      vocabulary words as columns and the count of each
                      prefix/word combination as value (0 where the
                      combination was never observed)
    """
    # Dicts give O(1) "have I seen this?" checks and position lookups while
    # still remembering insertion order.
    row_of_prefix = {}
    col_of_word = {}
    cells = []

    # Go through the n-gram counts once.
    for ngram, count in count_ngrams.items():
        prefix = ngram[:-1]
        last_word = ngram[-1]
        row = row_of_prefix.setdefault(prefix, len(row_of_prefix))
        col = col_of_word.setdefault(last_word, len(col_of_word))
        cells.append((row, col, count))

    n_minus_one_grams = list(row_of_prefix)
    vocabulary = list(col_of_word)

    # Start from an all-zero matrix so unseen combinations are filled with 0.
    count_matrix = np.zeros((len(n_minus_one_grams), len(vocabulary)))
    for row, col, count in cells:
        count_matrix[row, col] = count

    # np.array to pandas DataFrame conversion
    count_matrix = pd.DataFrame(count_matrix, index=n_minus_one_grams, columns=vocabulary)
    return n_minus_one_grams, vocabulary, count_matrix


In [None]:
# Build the count matrix (prefix rows x last-word columns) from the n-gram counts.
n_minus_one_grams, vocabulary, count_matrix = single_pass_ngram_count_matrix(
    count_trigrams
)

# Preview only the top-left 5x5 corner of the matrix.
print(count_matrix.iloc[0:5, 0:5])


                    any  lunar  satellite  needs  fuel
(<s>, <s>)          2.0    0.0        0.0    0.0   0.0
(<s>, any)          0.0    2.0        0.0    0.0   0.0
(any, lunar)        0.0    0.0        2.0    0.0   0.0
(lunar, satellite)  0.0    0.0        0.0    2.0   0.0
(satellite, needs)  0.0    0.0        0.0    0.0   2.0


## Costruiamo ora la matrice di probabilità

In [None]:
# create the probability matrix from the count matrix
row_sums = count_matrix.sum(axis=1)
# delete each row by its sum
prob_matrix = count_matrix.div(row_sums, axis=0)

print(prob_matrix.iloc[:5, :5])


                         any  lunar  satellite  needs  fuel
(<s>, <s>)          0.666667    0.0        0.0    0.0   0.0
(<s>, any)          0.000000    1.0        0.0    0.0   0.0
(any, lunar)        0.000000    0.0        1.0    0.0   0.0
(lunar, satellite)  0.000000    0.0        0.0    1.0   0.0
(satellite, needs)  0.000000    0.0        0.0    0.0   1.0


## Generiamo del testo utilizzando il nostro language model

In [None]:
def generate_text(n: int, prob_matrix: pd.DataFrame, token_count: int, threshold_prob = 0.005, random_sampling: bool = False):
    """
    Generate text from an n-gram probability matrix.

    Generation starts from the context made of ``n - 1`` start markers
    ``'<s>'``. At every step the row of ``prob_matrix`` matching the current
    context is looked up and the next token is chosen from it; the context
    then slides forward by one token. After a ``'.'`` the context is reset to
    the start markers so that a new sentence begins.

    Generation stops early (returning what has been produced so far) when the
    current context is not a row of ``prob_matrix`` or, in random mode, when
    no token of the row exceeds ``threshold_prob``.

    Args:
        n: order of the n-gram model to use
        prob_matrix: probability matrix with the (n-1)-gram contexts (tuples)
                     as row labels and the candidate next tokens as columns
        token_count: maximum number of tokens to generate
        threshold_prob: probability a token must exceed to be a sampling
                        candidate; only used when ``random_sampling`` is True
        random_sampling: if True, pick uniformly at random among the tokens
                         whose probability exceeds ``threshold_prob``; if
                         False, always pick the most probable token

    Returns:
        the generated tokens joined by single spaces (may contain fewer than
        ``token_count`` tokens if generation stopped early)
    """
    start_context = (n - 1) * ['<s>']
    # Map every context to its row position once, instead of rebuilding the
    # list of row labels at each generation step.
    row_position = {context: pos for pos, context in enumerate(prob_matrix.index)}

    context_queue = list(start_context)
    result = []
    for _ in range(token_count):
        position = row_position.get(tuple(context_queue))
        if position is None:
            # Unseen context (e.g. right after '</s>'): nothing to continue with.
            break
        next_token_probs = prob_matrix.iloc[position]

        if random_sampling:
            candidates = next_token_probs.index[next_token_probs > threshold_prob].tolist()
            if not candidates:
                # Threshold filtered out every token of this row.
                break
            obj = random.sample(candidates, 1)[0]
        else:
            obj = next_token_probs.idxmax()

        result.append(obj)
        if n > 1:
            if obj == '.':
                # End of sentence: restart from the beginning-of-sentence context.
                context_queue = list(start_context)
            else:
                context_queue.pop(0)
                context_queue.append(obj)
    return ' '.join(result)


In [None]:
# Greedy generation (random_sampling left at its default of False): the most
# probable token is taken at every step.
for i in range(10):
  print(generate_text(n, prob_matrix, token_count=20))

any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
a

In [None]:
# Random generation: the next token is sampled among the candidates above the
# default probability threshold.
# NOTE(review): no random seed is set here, so the printed samples are expected
# to differ between runs — confirm whether a fixed seed is wanted.
for i in range(10):
  print(generate_text(n, prob_matrix, token_count=20, random_sampling=True))

any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
the orbits of the apollo motherships changed noticeably during lunar missions lasting only a few days . the orbits of
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
the orbits of the apollo motherships changed noticeably during lunar missions lasting only a few days . any lunar satellite
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it will crash within
any lunar satellite needs fuel to do regular orbit corrections and when its fuel runs out it