In [1]:
import pandas as pd
from italian_ats_evaluator import TextAnalyzer, SimplificationAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
LABELS = ['D1 ORIGINAL', 'D1 SEMPL', 'D2 ORIGINAL', 'D2 SEMPL']

In [3]:
docs = [
  open(f'./docs/D1_ORIGINAL.txt', 'r', encoding='utf-8').read().replace('\n[...]\n', ' '),
  open(f'./docs/D1_SEMPL.txt', 'r', encoding='utf-8').read().replace('\n[...]\n', ' '),
  open(f'./docs/D2_ORIGINAL.txt', 'r', encoding='utf-8').read().replace('\n[...]\n', ' '),
  open(f'./docs/D2_SEMPL.txt', 'r', encoding='utf-8').read().replace('\n[...]\n', ' '),
]

In [4]:
print(docs[0])

Le presenti Linee programmatiche definiscono gli indirizzi per la programmazione regionale, stabilendo le linee di azione e organizzative prioritarie per il Sistema sanitario regionale attuazione della riforma territoriale ai sensi del DM 23 maggio 2022, n. 77. Tali linee programmatiche, che devono essere declinate nella programmazione attuativa aziendale non si intendono esaustive delle attività e dei servizi che devono essere garantiti in adempimento della normativa statale e regionale, ma indicano le priorità individuate dalla Regione. Il presente documento indirizza le funzionalità, i modelli organizzativi e di servizio, i mix di professionalità necessari per il concreto avvio delle Case di Comunità (CdC), Ospedali di Comunità (OdC), Centrali Operative Territoriali (COT), Infermiere di Famiglia e Comunità (IFeC), Assistenza Domiciliare Integrata (ADI) e tutte le altre componenti rilevanti per lo sviluppo dei servizi territoriali per promuovere la prevenzione primaria, migliorare l'

## Basic

In [5]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC': LABELS[i],
    'Tokens': processed.basic.n_tokens,
    'Tokens (con punteg.)': processed.basic.n_tokens_all,
    'Caratteri': processed.basic.n_chars,
    'Caratteri (con punt)': processed.basic.n_chars_all,
    'Sillabe': processed.basic.n_syllables,
    'Frasi': processed.basic.n_sentences,
    'Types': processed.basic.n_words,
    'Lemmi': processed.basic.n_unique_lemmas,
  })

pd.DataFrame(d).head(10)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,DOC,Tokens,Tokens (con punteg.),Caratteri,Caratteri (con punt),Sillabe,Frasi,Types,Lemmi
0,D1 ORIGINAL,839,965,5037,5163,2098,29,467,377
1,D1 SEMPL,847,970,4752,4875,2019,51,427,339
2,D2 ORIGINAL,502,584,3030,3112,1244,22,312,243
3,D2 SEMPL,503,580,2946,3023,1222,31,298,232


## POS

In [6]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':                        LABELS[i],
    'Altro':                      processed.pos.n_other,
    'Nomi':                       processed.pos.n_nouns,
    'Verbi':                      processed.pos.n_verbs,
    'Numeri':                     processed.pos.n_number,
    'Simboli':                    processed.pos.n_symbols,
    'Avverbi':                    processed.pos.n_adverbs,
    'Articoli':                   processed.pos.n_articles,
    'Pronomi':                    processed.pos.n_pronouns,
    'Particelle':                 processed.pos.n_particles,
    'Agettivi':                   processed.pos.n_adjectives,
    'Preposizioni':               processed.pos.n_prepositions,
    'Nomi propri':                processed.pos.n_proper_nouns,
    'Punteggiatura':              processed.pos.n_punctuations,
    'Interiezioni':               processed.pos.n_interjections,
    'Cong. coord.':               processed.pos.n_coordinating_conjunctions,
    'Cong. sub.':                 processed.pos.n_subordinating_conjunctions,
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,D1 ORIGINAL,0,235,106,21,0,15,80,27,0,123,161,24,126,0,41,6
1,D1 SEMPL,0,218,132,21,0,20,119,26,0,107,134,20,123,0,46,4
2,D2 ORIGINAL,2,160,51,6,0,13,40,9,0,74,118,8,82,0,21,0
3,D2 SEMPL,0,149,75,7,0,13,75,15,0,62,76,5,77,0,20,6


In [7]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':                        LABELS[i],
    'Altro':                      round(processed.pos.n_other / processed.basic.n_tokens_all * 100, 2),
    'Nomi':                       round(processed.pos.n_nouns / processed.basic.n_tokens_all * 100, 2),
    'Verbi':                      round(processed.pos.n_verbs / processed.basic.n_tokens_all * 100, 2),
    'Numeri':                     round(processed.pos.n_number / processed.basic.n_tokens_all * 100, 2),
    'Simboli':                    round(processed.pos.n_symbols / processed.basic.n_tokens_all * 100, 2),
    'Avverbi':                    round(processed.pos.n_adverbs / processed.basic.n_tokens_all * 100, 2),
    'Articoli':                   round(processed.pos.n_articles / processed.basic.n_tokens_all * 100, 2),
    'Pronomi':                    round(processed.pos.n_pronouns / processed.basic.n_tokens_all * 100, 2),
    'Particelle':                 round(processed.pos.n_particles / processed.basic.n_tokens_all * 100, 2),
    'Agettivi':                   round(processed.pos.n_adjectives / processed.basic.n_tokens_all * 100, 2),
    'Preposizioni':               round(processed.pos.n_prepositions / processed.basic.n_tokens_all * 100, 2),
    'Nomi propri':                round(processed.pos.n_proper_nouns / processed.basic.n_tokens_all * 100, 2),
    'Punteggiatura':              round(processed.pos.n_punctuations / processed.basic.n_tokens_all * 100, 2),
    'Interiezioni':               round(processed.pos.n_interjections / processed.basic.n_tokens_all * 100, 2),
    'Cong. coord.':               round(processed.pos.n_coordinating_conjunctions / processed.basic.n_tokens_all * 100, 2),
    'Cong. sub.':                 round(processed.pos.n_subordinating_conjunctions / processed.basic.n_tokens_all * 100, 2),
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,Altro,Nomi,Verbi,Numeri,Simboli,Avverbi,Articoli,Pronomi,Particelle,Agettivi,Preposizioni,Nomi propri,Punteggiatura,Interiezioni,Cong. coord.,Cong. sub.
0,D1 ORIGINAL,0.0,24.35,10.98,2.18,0.0,1.55,8.29,2.8,0.0,12.75,16.68,2.49,13.06,0.0,4.25,0.62
1,D1 SEMPL,0.0,22.47,13.61,2.16,0.0,2.06,12.27,2.68,0.0,11.03,13.81,2.06,12.68,0.0,4.74,0.41
2,D2 ORIGINAL,0.34,27.4,8.73,1.03,0.0,2.23,6.85,1.54,0.0,12.67,20.21,1.37,14.04,0.0,3.6,0.0
3,D2 SEMPL,0.0,25.69,12.93,1.21,0.0,2.24,12.93,2.59,0.0,10.69,13.1,0.86,13.28,0.0,3.45,1.03


## Passive verbs

In [8]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':                        LABELS[i],
    'Verbi attivi':               processed.verbs.n_active_verbs,
    'Verbi passivi':              processed.verbs.n_passive_verbs
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,Verbi attivi,Verbi passivi
0,D1 ORIGINAL,89,17
1,D1 SEMPL,116,16
2,D2 ORIGINAL,37,14
3,D2 SEMPL,61,14


In [9]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':                        LABELS[i],
    'Verbi attivi':               round(processed.verbs.n_active_verbs / processed.pos.n_verbs * 100, 2),
    'Verbi passivi':              round(processed.verbs.n_passive_verbs / processed.pos.n_verbs * 100, 2)
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,Verbi attivi,Verbi passivi
0,D1 ORIGINAL,83.96,16.04
1,D1 SEMPL,87.88,12.12
2,D2 ORIGINAL,72.55,27.45
3,D2 SEMPL,81.33,18.67


## NVdB

In [10]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':        LABELS[i],
    'ALL':        processed.vdb.n_vdb_tokens,
    'FO':         processed.vdb.n_vdb_fo_tokens,
    'AU':         processed.vdb.n_vdb_au_tokens,
    'AD':         processed.vdb.n_vdb_ad_tokens,
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,ALL,FO,AU,AD
0,D1 ORIGINAL,616,496,123,82
1,D1 SEMPL,701,612,88,88
2,D2 ORIGINAL,365,284,71,66
3,D2 SEMPL,415,339,64,51


In [11]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':        LABELS[i],
    'ALL':        round(processed.vdb.n_vdb_tokens / processed.basic.n_tokens * 100, 2),
    'FO':         round(processed.vdb.n_vdb_fo_tokens / processed.basic.n_tokens * 100, 2),
    'AU':         round(processed.vdb.n_vdb_au_tokens / processed.basic.n_tokens * 100, 2),
    'AD':         round(processed.vdb.n_vdb_ad_tokens / processed.basic.n_tokens * 100, 2),
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,ALL,FO,AU,AD
0,D1 ORIGINAL,73.42,59.12,14.66,9.77
1,D1 SEMPL,82.76,72.26,10.39,10.39
2,D2 ORIGINAL,72.71,56.57,14.14,13.15
3,D2 SEMPL,82.5,67.4,12.72,10.14


## Readability

In [12]:
d = []
for i, doc in enumerate(docs):
  processed = TextAnalyzer(doc)
  d.append({
    'DOC':              LABELS[i],
    'ttr':              round(processed.readability.ttr, 2),
    'gulpease':         round(processed.readability.gulpease, 2),
    'flesch_vacca':     round(processed.readability.flesch_vacca, 2),
    'lexical_density':  round(processed.readability.lexical_density, 2),
  })

pd.DataFrame(d).head(10)

Unnamed: 0,DOC,ttr,gulpease,flesch_vacca,lexical_density
0,D1 ORIGINAL,55.66,39.33,14.53,0.57
1,D1 SEMPL,50.41,50.96,34.45,0.56
2,D2 ORIGINAL,62.15,41.79,22.11,0.59
3,D2 SEMPL,59.24,48.92,31.86,0.59


## Semantic similarity


In [13]:
d = []

a = SimplificationAnalyzer(docs[0], docs[1])
b = SimplificationAnalyzer(docs[2], docs[3])

d.append({
  'Corpus':               f'D1 ORIGINAL vs SEMPL',
  'semantic_similarity':  round(a.similarity.semantic_similarity, 2)
})

d.append({
  'Corpus':               f'D2 ORIGINAL vs SEMPL',
  'semantic_similarity':  round(b.similarity.semantic_similarity, 2)
})

pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,semantic_similarity
0,D1 ORIGINAL vs SEMPL,95.04
1,D2 ORIGINAL vs SEMPL,96.34


## Distance

In [14]:
d = []

a = SimplificationAnalyzer(docs[0], docs[1])
b = SimplificationAnalyzer(docs[2], docs[3])

d.append({
  'Corpus':                 f'D1 ORIGINAL vs SEMPL',
  'editdistance':           a.diff.editdistance,
  'added_tokens':           a.diff.n_added_tokens,
  'added_vdb_tokens':       a.diff.n_added_vdb_tokens,
  '%_added_vdb_tokens':     round(a.diff.n_added_vdb_tokens / a.diff.n_added_tokens * 100, 2),
  'deleted_tokens':         a.diff.n_deleted_tokens,
  'deleted_vdb_tokens':     a.diff.n_deleted_vdb_tokens,
  '%_deleted_vdb_tokens':   round(a.diff.n_deleted_vdb_tokens / a.diff.n_deleted_tokens * 100, 2),
})

d.append({
  'Corpus':                 f'D2 ORIGINAL vs SEMPL',
  'editdistance':           b.diff.editdistance,
  'added_tokens':           b.diff.n_added_tokens,
  'added_vdb_tokens':       b.diff.n_added_vdb_tokens,
  '%_added_vdb_tokens':     round(b.diff.n_added_vdb_tokens / b.diff.n_added_tokens * 100, 2),
  'deleted_tokens':         b.diff.n_deleted_tokens,
  'deleted_vdb_tokens':     b.diff.n_deleted_vdb_tokens,
  '%_deleted_vdb_tokens':   round(b.diff.n_deleted_vdb_tokens / b.diff.n_deleted_tokens * 100, 2),
})
pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,%_added_vdb_tokens,deleted_tokens,deleted_vdb_tokens,%_deleted_vdb_tokens
0,D1 ORIGINAL vs SEMPL,2778,238,203,85.29,266,185,69.55
1,D2 ORIGINAL vs SEMPL,1648,148,130,87.84,159,108,67.92


In [15]:
d = []

a = SimplificationAnalyzer(docs[0], docs[1])
b = SimplificationAnalyzer(docs[2], docs[3])

d.append({
  'Corpus':                 f'D1 ORIGINAL vs SEMPL',
  'editdistance':           a.diff.editdistance / max(a.reference.basic.n_chars, a.simplified.basic.n_chars) * 100,
  'added_tokens':           a.diff.n_added_tokens  / a.simplified.basic.n_tokens * 100,
  'added_vdb_tokens':       a.diff.n_added_vdb_tokens  / a.simplified.basic.n_tokens * 100,
  'deleted_tokens':         a.diff.n_deleted_tokens  / a.simplified.basic.n_tokens * 100,
  'deleted_vdb_tokens':     a.diff.n_deleted_vdb_tokens / a.simplified.basic.n_tokens * 100,
})

d.append({
  'Corpus':                 f'D2 ORIGINAL vs SEMPL',
  'editdistance':           b.diff.editdistance / max(b.reference.basic.n_chars, b.simplified.basic.n_chars) * 100,
  'added_tokens':           b.diff.n_added_tokens  / b.simplified.basic.n_tokens * 100,
  'added_vdb_tokens':       b.diff.n_added_vdb_tokens  / b.simplified.basic.n_tokens * 100,
  'deleted_tokens':         b.diff.n_deleted_tokens  / b.simplified.basic.n_tokens * 100,
  'deleted_vdb_tokens':     b.diff.n_deleted_vdb_tokens / b.simplified.basic.n_tokens * 100,
})
pd.DataFrame(d).head(20)

Unnamed: 0,Corpus,editdistance,added_tokens,added_vdb_tokens,deleted_tokens,deleted_vdb_tokens
0,D1 ORIGINAL vs SEMPL,55.151876,28.099174,23.966942,31.404959,21.841795
1,D2 ORIGINAL vs SEMPL,54.389439,29.423459,25.84493,31.610338,21.471173
