In [31]:
import pandas as pd
import os
import glob
from KafNafParserPy import KafNafParser
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



### Exploracion

In [32]:

def read_files(directory: str = "data/docs-raw-texts/"):
    # Ruta al directorio con los archivos .naf

    # Buscar todos los archivos que coincidan con el patrón
    files = glob.glob(directory + "wes2015.d*.naf")

    # Lista para almacenar los datos
    data = []

    # Procesar cada archivo
    for file in files:
        # Extraer el identificador del nombre del archivo
        identifier = file.split(".")[-2][-3:]  # Extrae '001', '002', etc.
        
        # Crear el objeto KafNafParser para el archivo actual
        naf_parser = KafNafParser(file)
        
        # Extraer el texto crudo
        raw_text = naf_parser.get_raw()

        title = naf_parser.root.find('nafHeader/fileDesc').get('title')
        
        # Agregar los datos a la lista
        data.append({"identifier": identifier, "text": raw_text, "title":title})

    df = pd.DataFrame(data)
    return df

In [33]:
df = read_files()
df

Unnamed: 0,identifier,text,title
0,001,William Beaumont and the Human Digestion.\n\nW...,William Beaumont and the Human Digestion
1,002,Selma Lagerlöf and the wonderful Adventures of...,Selma Lagerlöf and the wonderful Adventures of...
2,003,Ferdinand de Lesseps and the Suez Canal.\n\nFe...,Ferdinand de Lesseps and the Suez Canal
3,004,Walt Disney’s ‘Steamboat Willie’ and the Rise ...,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
4,005,Eugene Wigner and the Structure of the Atomic ...,Eugene Wigner and the Structure of the Atomic ...
...,...,...,...
326,327,James Parkinson and Parkinson’s Disease.\n\nWo...,James Parkinson and Parkinson’s Disease
327,328,Juan de la Cierva and the Autogiro.\n\nDemonst...,Juan de la Cierva and the Autogiro
328,329,Squire Whipple – The Father of the Iron Bridge...,Squire Whipple – The Father of the Iron Bridge
329,330,William Playfair and the Beginnings of Infogra...,William Playfair and the Beginnings of Infogra...


In [34]:
def replace_title_on_text(dataFrame):
    for i in range(0,len(dataFrame["text"])):
        if dataFrame["title"][i] in dataFrame["text"][i]:
            dataFrame["text"][i] = dataFrame["text"][i].replace(dataFrame["title"][i]+".","")
        if "\n" in dataFrame["text"][i]:
            dataFrame["text"][i] = dataFrame["text"][i].replace("\n","")


replace_title_on_text(df)

In [35]:
df

Unnamed: 0,identifier,text,title
0,001,William Beaumont: Physiology of digestion Imag...,William Beaumont and the Human Digestion
1,002,Cover of The Wonderful Adventures of Nils. On...,Selma Lagerlöf and the wonderful Adventures of...
2,003,"Ferdinand Marie, Vicomte de Lesseps (1805-1894...",Ferdinand de Lesseps and the Suez Canal
3,004,Mickey Mouse star in Walk of Fame Image by Fli...,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
4,005,Eugene Paul Wigner (1902-1995). On November 17...,Eugene Wigner and the Structure of the Atomic ...
...,...,...,...
326,327,Woodcut of a man suffering from Parkinson‘s di...,James Parkinson and Parkinson’s Disease
327,328,Demonstration of Cierva C.6 autogiro at Farnbo...,Juan de la Cierva and the Autogiro
328,329,Truss Bridge patented by Squire Whipple. On S...,Squire Whipple – The Father of the Iron Bridge
329,330,"Playfair’s trade-balance time-series chart, fr...",William Playfair and the Beginnings of Infogra...


# Procesamiento

* Tokenizacion
* Stopwords
* Normalizacion (uncased)
* Lematizacion
* Stemming

In [36]:
from utils_processor.processor import Processor
import nltk
from nltk import word_tokenize

processor_ = Processor()

df["text"] = df["text"]+ " " + df["title"]
df.drop(columns=["title"],inplace=True)
for i in range(0,len(df["text"])):
    df["text"][i] = word_tokenize(df["text"][i])
    df["text"][i] = processor_.to_lowercase(df["text"][i])
    df["text"][i] = processor_.to_lowercase(df["text"][i])
    df["text"][i] = processor_.remove_punctuation(df["text"][i])
    df["text"][i] = processor_.remove_non_ascii(df["text"][i])
    df["text"][i] = processor_.remove_stopwords(df["text"][i])
    df["text"][i] = processor_.lemmatize_verbs(df["text"][i])
    df["text"][i] = ' '.join(df["text"][i])


In [39]:
df["text_list"] = [0] * 331
for i in range(0,len(df["text"])):
    df["text_list"][i] = df["text"][i].split()
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_list"][i] = df["text"][i].split()


Unnamed: 0,identifier,text,text_list
0,001,william beaumont physiology digestion image so...,"[william, beaumont, physiology, digestion, ima..."
1,002,cover wonderful adventure nils november 20 185...,"[cover, wonderful, adventure, nils, november, ..."
2,003,ferdinand marie vicomte de lesseps 18051894 no...,"[ferdinand, marie, vicomte, de, lesseps, 18051..."
3,004,mickey mouse star walk fame image flickr user ...,"[mickey, mouse, star, walk, fame, image, flick..."
4,005,eugene paul wigner 19021995 november 17 1902 h...,"[eugene, paul, wigner, 19021995, november, 17,..."
...,...,...,...
326,327,woodcut man suffer parkinson disease publish 1...,"[woodcut, man, suffer, parkinson, disease, pub..."
327,328,demonstration cierva c6 autogiro farnborough o...,"[demonstration, cierva, c6, autogiro, farnboro..."
328,329,truss bridge patent squire whipple september 1...,"[truss, bridge, patent, squire, whipple, septe..."
329,330,playfair tradebalance timeseries chart commerc...,"[playfair, tradebalance, timeseries, chart, co..."
