In [59]:
import pandas as pd
import os
import glob
from KafNafParserPy import KafNafParser
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from nltk.corpus import stopwords as sw


### Exploracion

In [60]:


# Ruta al directorio con los archivos .naf
directory = "data/docs-raw-texts/"

# Buscar todos los archivos que coincidan con el patrón
files = glob.glob(directory + "wes2015.d*.naf")

# Lista para almacenar los datos
data = []

# Procesar cada archivo
for file in files:
    # Extraer el identificador del nombre del archivo
    identifier = file.split(".")[-2][-3:]  # Extrae '001', '002', etc.
    
    # Crear el objeto KafNafParser para el archivo actual
    naf_parser = KafNafParser(file)
    
    # Extraer el texto crudo
    raw_text = naf_parser.get_raw()

    title = naf_parser.root.find('nafHeader/fileDesc').get('title')
    
    # Agregar los datos a la lista
    data.append({"identifier": identifier, "text": raw_text, "title":title})

df = pd.DataFrame(data)


In [61]:
df

Unnamed: 0,identifier,text,title
0,038,Evangelista Torricelli and the Barometer.\n\nE...,Evangelista Torricelli and the Barometer
1,004,Walt Disney’s ‘Steamboat Willie’ and the Rise ...,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
2,010,Sir James Young Simpson and the Chloroform.\n\...,Sir James Young Simpson and the Chloroform
3,206,Amusing Ourselves to Death by Neil Postman.\n\...,Amusing Ourselves to Death by Neil Postman
4,212,George Boole – The Founder of Modern Logics.\n...,George Boole – The Founder of Modern Logics
...,...,...,...
326,237,Andrija Mohorovičić and the Mohorovičić Discon...,Andrija Mohorovičić and the Mohorovičić Discon...
327,223,"Well, I Didn’t Know it was Hard – Happy Birthd...","Well, I Didn’t Know it was Hard – Happy Birthd..."
328,009,Dorothea Erxleben – Germany’s First Female Med...,Dorothea Erxleben – Germany’s First Female Med...
329,035,Nicholas Culpeper and the Complete Herbs of En...,Nicholas Culpeper and the Complete Herbs of En...


In [62]:
def replace_title_on_text(dataFrame):
    for i in range(0,len(dataFrame["text"])):
        if dataFrame["title"][i] in dataFrame["text"][i]:
            dataFrame["text"][i] = dataFrame["text"][i].replace(dataFrame["title"][i]+".","")
        if "\n" in dataFrame["text"][i]:
            dataFrame["text"][i] = dataFrame["text"][i].replace("\n","")


replace_title_on_text(df)

In [63]:
df

Unnamed: 0,identifier,text,title
0,038,Evangelista Torricelli (1608-1647). On October...,Evangelista Torricelli and the Barometer
1,004,Mickey Mouse star in Walk of Fame Image by Fli...,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
2,010,"Sir James Young Simpson, 1st Baronet (1811-187...",Sir James Young Simpson and the Chloroform
3,206,"Neil Postman (1931 – 2003). On March 8, 1931,...",Amusing Ourselves to Death by Neil Postman
4,212,"George Boole (1815-1864). On December 8, 1864...",George Boole – The Founder of Modern Logics
...,...,...,...
326,237,Andrija Mohorovicic (1857 – 1936). On January...,Andrija Mohorovičić and the Mohorovičić Discon...
327,223,Ivan Sutherland’s Sketchpad (1963) Happy Birth...,"Well, I Didn’t Know it was Hard – Happy Birthd..."
328,009,Dorothea Christiane Erxleben (1715 – 1762). O...,Dorothea Erxleben – Germany’s First Female Med...
329,035,"Nicholas Culpeper. On October 18, 1616, Engli...",Nicholas Culpeper and the Complete Herbs of En...


# Procesamiento

* Tokenizacion
* Stopwords
* Normalizacion (uncased)
* Lematizacion
* Stemming

In [64]:
text = df[['text']]
text = np.array(text)

stops = stopwords.words('english')

count_vect = CountVectorizer(max_df=0.9,strip_accents='unicode', stop_words=stops)
X = count_vect.fit_transform(text.ravel())


In [65]:
#RESULTADO ESPERADO
print('Dimensiones de la matriz X:', X.shape)

print('Contenido de la matriz X:\n', X.toarray())

print('Cantidad de documentos:', X.shape[0])
print('Cantidad de palabras:', X.shape[1])
print('Cantidad de ocurrencias:', X.sum())

Dimensiones de la matriz X: (331, 18549)
Contenido de la matriz X:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
Cantidad de documentos: 331
Cantidad de palabras: 18549
Cantidad de ocurrencias: 120614
