# Procesamiento de Lenguaje Natural con spaCy
## Análisis: El Corazón Delator
Análisis completo usando spaCy en español

In [None]:
!pip install spacy
!python -m spacy download es_core_news_sm

In [None]:
import spacy
from spacy.matcher import Matcher
from collections import Counter
# Load the Spanish spaCy pipeline once; `nlp` is used below to parse the text
# and to supply the vocabulary for the Matcher.
NOMBRE_MODELO = "es_core_news_sm"
nlp = spacy.load(NOMBRE_MODELO)

In [None]:
# Read the whole input file as UTF-8 text and report how many characters it has.
with open("El_corazón_delator.txt", mode="r", encoding="utf-8") as archivo:
    texto = archivo.read()
num_caracteres = len(texto)
print(f"Longitud: {num_caracteres} caracteres")

In [None]:
# Run the loaded pipeline over the full text; the resulting `doc` is the input
# of every analysis cell that follows.
doc = nlp(texto)
total_tokens = len(doc)
print(f"Total tokens: {total_tokens}")

In [None]:
# Materialise the sentence spans of the document and preview the first three,
# truncating each preview to its first 60 characters.
sentences = list(doc.sents)
print(f"Total oraciones: {len(sentences)}")
for numero, oracion in enumerate(sentences[:3], start=1):
    vista_previa = oracion.text[:60]
    print(f"{numero}. {vista_previa}...")

In [None]:
# Show the third sentence followed by a table with one row per token
# (text, part-of-speech tag, dependency label, lemma).
tercera = sentences[2] if len(sentences) >= 3 else None
print("\nTercera oración:")
# Explicit None check so the branch depends only on whether a third sentence
# exists, not on the truthiness of the sentence object.
if tercera is not None:
    print(f"{tercera.text}")
    print("\nTokens de la tercera oración:")
    # The column headers use single-quoted literals: backslash-escaped quotes
    # inside an f-string replacement field are a SyntaxError.
    print(f"{'Token':<15} {'POS':<10} {'DEP':<12} {'LEMMA':<15}")
    for token in tercera:
        print(f"{token.text:<15} {token.pos_:<10} {token.dep_:<12} {token.lemma_:<15}")
else:
    # Avoid leaving a dangling header when the text is too short.
    print("(el texto tiene menos de tres oraciones)")

In [None]:
# Collect the noun chunks of the document, report how many there are and list
# the first 15 of them, numbered from 1.
noun_chunks = list(doc.noun_chunks)
print(f"\nSintagmas nominales: {len(noun_chunks)}")
print("\nPrimeros 15 sintagmas nominales:")
primeros_sintagmas = noun_chunks[:15]
for posicion, sintagma in enumerate(primeros_sintagmas, start=1):
    print(f"{posicion}. {sintagma.text}")

In [None]:
# Gather every token whose coarse part-of-speech tag is VERB.
verbos = []
for tok in doc:
    if tok.pos_ == "VERB":
        verbos.append(tok)
print(f"\nVerbos totales: {len(verbos)}")

# Reduce the verbs to their distinct lemmas, sorted, and list the first 20.
lemas_distintos = {tok.lemma_ for tok in verbos}
verbos_unicos = sorted(lemas_distintos)
print(f"Verbos únicos: {len(verbos_unicos)}")
print("\nVerbos encontrados:")
for numero, lema in enumerate(verbos_unicos[:20], start=1):
    print(f"{numero}. {lema}")

In [None]:
# Group the distinct named-entity texts by entity label, preserving the order
# in which each label and each text is first seen in the document.
print("\nEntidades Nombradas:")
entidades_dict = {}
for entidad in doc.ents:
    textos_de_etiqueta = entidades_dict.setdefault(entidad.label_, [])
    if entidad.text not in textos_de_etiqueta:
        textos_de_etiqueta.append(entidad.text)

# Print at most five example texts under each label.
for etiqueta, textos_de_etiqueta in entidades_dict.items():
    print(f"\n{etiqueta}:")
    for texto_entidad in textos_de_etiqueta[:5]:
        print(f"  - {texto_entidad}")

In [None]:
# Rule-based matching: register a two-token pattern (a VERB immediately
# followed by an ADV) and run it over the whole document.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "VERB"}, {"POS": "ADV"}]
matcher.add("Vigorous_Activities", [pattern])
matches = matcher(doc)
print(f"\nPatrones Vigorous encontrados: {len(matches)}")
print("\nPrimeros 5 matches:")
# Each match is a (match id, start index, end index) triple; only the token
# span is needed to display the matched text.
for match_id, inicio, fin in matches[:5]:
    tramo = doc[inicio:fin]
    print(f"  - {tramo.text}")

In [None]:
# Count how often each lemma occurs, considering only alphabetic tokens that
# are not stop words, then print the 20 most common lemmas with their counts.
print("\nPalabras más frecuentes:")
palabra_freq = Counter(
    tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop
)
for lema, veces in palabra_freq.most_common(20):
    print(f"{lema:<20} {veces:>5} veces")