# NLP - Métricas textuais

## Carregamento dos arquivos

In [1]:
import os
# Directory that holds the documents to analyse.
path = "documents"

# Prefix every directory entry with the directory itself to get usable paths.
files_list = list(map(lambda entry: os.path.join(path, entry), os.listdir(path)))

print("Lista de arquivos:", files_list)

Lista de arquivos: ['documents\\A9_-_EACL23_Incorporating_context_into_subword_vocabularies.pdf']


## Definição das funções

In [2]:
# !python -m spacy download en_core_web_sm
# !pip install pymupdf spacy chardet


In [11]:
import os
import fitz  # PyMuPDF
import spacy
from collections import Counter
import chardet

# Load spaCy's small English pipeline ("en_core_web_sm"). The resulting
# module-level `nlp` object is used by remove_stopwords and get_doc_statistics.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Reading is best-effort: if opening or reading the file fails, the error
    is printed and whatever text was collected up to that point (possibly the
    empty string) is returned instead of raising.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_texts.append(page.get_text())
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the batch.
        print(f"Erro ao ler PDF {file_path}: {e}")
    return "".join(page_texts)

def remove_references(text):
    """Cut *text* at the last occurrence of a reference-section keyword.

    The keywords are "Referências" and "References" (case-sensitive). The
    text is truncated at whichever of them occurs latest; everything from
    that position onward is dropped. If neither keyword occurs, *text* is
    returned unchanged.
    """
    # str.rfind yields -1 when the keyword is absent, so the maximum is -1
    # only when no keyword was found at all.
    cut_at = max(text.rfind(heading) for heading in ("Referências", "References"))
    return text if cut_at == -1 else text[:cut_at]

def load_documents(files):
    """Extract the text of each PDF path in *files*, preserving their order.

    Returns a list with one string per input path, as produced by
    extract_text_from_pdf.
    """
    documents = []
    for file_path in files:
        documents.append(extract_text_from_pdf(file_path))
    return documents

def remove_stopwords(texts):
    """Return a copy of each text in *texts* with stop-word tokens removed.

    Every text is tokenized with the module-level `nlp` pipeline; tokens
    flagged as stop words are dropped and the remaining token texts are
    re-joined with single spaces.
    """
    return [
        " ".join(token.text for token in nlp(text) if not token.is_stop)
        for text in texts
    ]

def count_sentences(doc):
    """Return how many sentences `doc.sents` yields."""
    # Count while iterating; no intermediate list is needed.
    return sum(1 for _ in doc.sents)

def count_tokens(doc):
    """Return the lower-cased text of every alphabetic token in *doc*.

    Despite the name, this returns the list of tokens rather than a count;
    callers take its length or feed it to a frequency counter.
    """
    words = []
    for token in doc:
        if token.is_alpha:
            words.append(token.text.lower())
    return words

def count_pos_tags(doc):
    """Count the nouns, verbs and adpositions among the tokens of *doc*.

    Returns a tuple (num_nouns, num_verbs, num_adpositions), based on each
    token's `pos_` value being "NOUN", "VERB" or "ADP" respectively.
    """
    num_nouns = num_verbs = num_adpositions = 0
    for token in doc:
        tag = token.pos_
        if tag == "NOUN":
            num_nouns += 1
        elif tag == "VERB":
            num_verbs += 1
        elif tag == "ADP":
            num_adpositions += 1
    return num_nouns, num_verbs, num_adpositions

def get_lemmas(doc):
    """Return the lemma of every token in *doc* that is not whitespace."""
    return [token.lemma_ for token in doc if not token.is_space]

def compute_token_stats(tokens, num_docs):
    """Summarize a flat token list collected from *num_docs* documents.

    Returns a tuple (total_tokens, avg_tokens, top_10, down_10):
    the number of tokens, the average per document (0 when *num_docs* is
    falsy), and the ten most and ten least frequent (token, count) pairs.
    Both frequency lists are ordered from most to least frequent.
    """
    total_tokens = len(tokens)
    if num_docs:
        avg_tokens = total_tokens / num_docs
    else:
        avg_tokens = 0

    # Rank every distinct token once, then slice both ends of the ranking.
    ranked = Counter(tokens).most_common()
    top_10 = ranked[:10]
    down_10 = ranked[-10:]
    return total_tokens, avg_tokens, top_10, down_10



def get_doc_statistics(texts):
    """Compute corpus-level text metrics for a list of document strings.

    Each text is parsed with the module-level `nlp` pipeline and the
    per-document results are accumulated over the whole corpus.

    Returns a dict with these keys:
        num_sentences          total number of sentences
        avg_sentences_per_doc  sentences per document (0 for no documents)
        num_tokens             total number of alphabetic tokens
        avg_tokens_per_doc     alphabetic tokens per document (0 for none)
        top_10_tokens          ten most frequent (token, count) pairs
        down_10_tokens         ten least frequent (token, count) pairs
        num_nouns              tokens tagged NOUN
        num_verbs              tokens tagged VERB
        num_prepositions       tokens tagged ADP
        lemmas                 lemmas of all non-whitespace tokens, in
                               document order
    """
    total_sentences = 0
    total_tokens_list = []
    all_lemmas = []
    total_nouns = total_verbs = total_preps = 0

    for text in texts:
        doc = nlp(text)

        total_sentences += count_sentences(doc)
        total_tokens_list.extend(count_tokens(doc))

        nouns, verbs, preps = count_pos_tags(doc)
        total_nouns += nouns
        total_verbs += verbs
        total_preps += preps

        # Collect lemmas per document. Reading `doc` after the loop would
        # cover only the last document and fail when `texts` is empty.
        all_lemmas.extend(get_lemmas(doc))

    num_docs = len(texts)
    avg_sentences = total_sentences / num_docs if num_docs else 0
    total_tokens, avg_tokens, top_10, down_10 = compute_token_stats(
        total_tokens_list, num_docs
    )

    return {
        "num_sentences": total_sentences,
        "avg_sentences_per_doc": avg_sentences,
        "num_tokens": total_tokens,
        "avg_tokens_per_doc": avg_tokens,
        "top_10_tokens": top_10,
        "down_10_tokens": down_10,
        "num_nouns": total_nouns,
        "num_verbs": total_verbs,
        "num_prepositions": total_preps,
        "lemmas": all_lemmas,
    }
    

## Estatísticas

### Sem remoção de stopwords

In [12]:
# Load every document, drop its reference section, then compute the metrics.
texts = [remove_references(raw_text) for raw_text in load_documents(files_list)]
stats = get_doc_statistics(texts)

# Print each statistic as "name: value", one per line.
for key, value in stats.items():
    print(f"{key}: {value}")

num_sentences: 199
avg_sentences_per_doc: 199.0
num_tokens: 5056
avg_tokens_per_doc: 5056.0
top_10_tokens: [('the', 224), ('of', 152), ('in', 125), ('to', 117), ('a', 113), ('and', 110), ('sage', 70), ('vocabulary', 70), ('for', 65), ('we', 60)]
down_10_tokens: [('omer', 1), ('attendees', 1), ('iscol', 1), ('suggestions', 1), ('reviewers', 1), ('helpful', 1), ('kaj', 1), ('peter', 1), ('tamar', 1), ('operate', 1)]
num_nouns: 1397
num_verbs: 636
num_prepositions: 700
lemmas: ['proceeding', 'of', 'the', '17th', 'Conference', 'of', 'the', 'european', 'chapter', 'of', 'the', 'Association', 'for', 'Computational', 'Linguistics', ',', 'page', '623–635', 'May', '2', '-', '6', ',', '2023', '©', '2023', 'Association', 'for', 'Computational', 'Linguistics', 'incorporate', 'Context', 'into', 'Subword', 'Vocabularies', 'Shaked', 'Yehezkel', 'Blavatnik', 'School', 'of', 'Computer', 'Science', 'Tel', '-', 'Aviv', 'University', 'Tel', '-', 'Aviv', ',', 'Israel', 'shakedy@mail.tau.ac.il', 'Yuval', 'Pi

In [13]:
import pandas as pd
# Wrap the statistics dict in a one-row DataFrame so the notebook shows it as a table.
df_stats = pd.DataFrame([stats])
# Bare expression: the notebook displays the value of the cell's last line.
df_stats

Unnamed: 0,num_sentences,avg_sentences_per_doc,num_tokens,avg_tokens_per_doc,top_10_tokens,down_10_tokens,num_nouns,num_verbs,num_prepositions,lemmas
0,199,199.0,5056,5056.0,"[(the, 224), (of, 152), (in, 125), (to, 117), ...","[(omer, 1), (attendees, 1), (iscol, 1), (sugge...",1397,636,700,"[proceeding, of, the, 17th, Conference, of, th..."


### Com remoção de stopwords

In [14]:
# Load every document, strip stop words, then compute the metrics.
# NOTE(review): unlike the cell without stop-word removal, remove_references
# is not applied here, so these statistics also cover the reference section.
# Confirm whether that is intended before comparing the two result sets.
texts = remove_stopwords(load_documents(files_list))
stats = get_doc_statistics(texts)

# Print each statistic as "name: value", one per line.
for key, value in stats.items():
    print(f"{key}: {value}")

num_sentences: 457
avg_sentences_per_doc: 457.0
num_tokens: 4405
avg_tokens_per_doc: 4405.0
top_10_tokens: [('vocabulary', 73), ('sage', 71), ('tokens', 52), ('bpe', 50), ('al', 44), ('et', 42), ('association', 38), ('token', 38), ('size', 37), ('v', 35)]
down_10_tokens: [('loader', 1), ('dist', 1), ('seq', 1), ('strategy', 1), ('accumulation', 1), ('eval', 1), ('rate', 1), ('grad', 1), ('scheduler', 1), ('polynomial', 1)]
num_nouns: 1691
num_verbs: 653
num_prepositions: 31
lemmas: ['Proceedings', '17th', 'Conference', 'European', 'Chapter', 'Association', 'Computational', 'Linguistics', ',', 'page', '623–635', '2', '-', '6', ',', '2023', '©', '2023', 'Association', 'Computational', 'Linguistics', 'incorporate', 'Context', 'Subword', 'Vocabularies', 'Shaked', 'Yehezkel', 'Blavatnik', 'School', 'Computer', 'Science', 'Tel', '-', 'Aviv', 'University', 'Tel', '-', 'Aviv', ',', 'Israel', 'shakedy@mail.tau.ac.il', 'Yuval', 'Pinter', 'Department', 'Computer', 'Science', 'Ben', '-', 'Gurion',

In [15]:
import pandas as pd
# Wrap the stop-word-free statistics dict in a one-row DataFrame for tabular display.
df_stats = pd.DataFrame([stats])
# Bare expression: the notebook displays the value of the cell's last line.
df_stats

Unnamed: 0,num_sentences,avg_sentences_per_doc,num_tokens,avg_tokens_per_doc,top_10_tokens,down_10_tokens,num_nouns,num_verbs,num_prepositions,lemmas
0,457,457.0,4405,4405.0,"[(vocabulary, 73), (sage, 71), (tokens, 52), (...","[(loader, 1), (dist, 1), (seq, 1), (strategy, ...",1691,653,31,"[Proceedings, 17th, Conference, European, Chap..."
