In [2]:
import pandas as pd
import os
import glob
from KafNafParserPy import KafNafParser
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



### Exploracion

In [3]:

def read_files(directory: str = "data/docs-raw-texts/"):
    # Ruta al directorio con los archivos .naf

    # Buscar todos los archivos que coincidan con el patrón
    files = glob.glob(directory + "wes2015.d*.naf")

    # Lista para almacenar los datos
    data = []

    # Procesar cada archivo
    for file in files:
        # Extraer el identificador del nombre del archivo
        identifier = file.split(".")[-2][-3:]  # Extrae '001', '002', etc.
        
        # Crear el objeto KafNafParser para el archivo actual
        naf_parser = KafNafParser(file)
        
        # Extraer el texto crudo
        raw_text = naf_parser.get_raw()

        title = naf_parser.root.find('nafHeader/fileDesc').get('title')
        
        # Agregar los datos a la lista
        data.append({"identifier": identifier, "text": raw_text, "title":title})

    df = pd.DataFrame(data)
    return df

In [4]:
df = read_files()
df

Unnamed: 0,identifier,text,title
0,001,William Beaumont and the Human Digestion.\n\nW...,William Beaumont and the Human Digestion
1,002,Selma Lagerlöf and the wonderful Adventures of...,Selma Lagerlöf and the wonderful Adventures of...
2,003,Ferdinand de Lesseps and the Suez Canal.\n\nFe...,Ferdinand de Lesseps and the Suez Canal
3,004,Walt Disney’s ‘Steamboat Willie’ and the Rise ...,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
4,005,Eugene Wigner and the Structure of the Atomic ...,Eugene Wigner and the Structure of the Atomic ...
...,...,...,...
326,327,James Parkinson and Parkinson’s Disease.\n\nWo...,James Parkinson and Parkinson’s Disease
327,328,Juan de la Cierva and the Autogiro.\n\nDemonst...,Juan de la Cierva and the Autogiro
328,329,Squire Whipple – The Father of the Iron Bridge...,Squire Whipple – The Father of the Iron Bridge
329,330,William Playfair and the Beginnings of Infogra...,William Playfair and the Beginnings of Infogra...


In [5]:
def replace_title_on_text(dataFrame):
    for i in range(0,len(dataFrame["text"])):
        if dataFrame["title"][i] in dataFrame["text"][i]:
            dataFrame["text"][i] = dataFrame["text"][i].replace(dataFrame["title"][i]+".","")
        if "\n" in dataFrame["text"][i]:
            dataFrame["text"][i] = dataFrame["text"][i].replace("\n","")


replace_title_on_text(df)

In [6]:
df

Unnamed: 0,identifier,text,title
0,001,William Beaumont: Physiology of digestion Imag...,William Beaumont and the Human Digestion
1,002,Cover of The Wonderful Adventures of Nils. On...,Selma Lagerlöf and the wonderful Adventures of...
2,003,"Ferdinand Marie, Vicomte de Lesseps (1805-1894...",Ferdinand de Lesseps and the Suez Canal
3,004,Mickey Mouse star in Walk of Fame Image by Fli...,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
4,005,Eugene Paul Wigner (1902-1995). On November 17...,Eugene Wigner and the Structure of the Atomic ...
...,...,...,...
326,327,Woodcut of a man suffering from Parkinson‘s di...,James Parkinson and Parkinson’s Disease
327,328,Demonstration of Cierva C.6 autogiro at Farnbo...,Juan de la Cierva and the Autogiro
328,329,Truss Bridge patented by Squire Whipple. On S...,Squire Whipple – The Father of the Iron Bridge
329,330,"Playfair’s trade-balance time-series chart, fr...",William Playfair and the Beginnings of Infogra...


# Procesamiento

* Tokenizacion
* Stopwords
* Normalizacion (uncased)
* Lematizacion
* Stemming

In [7]:
from utils.processor import Processor
processor_ = Processor()


text = df['text'] + " "+ df['title']
text = np.array(text).tolist()

len(text)



331

In [8]:
def apply_process(dataFrame: pd.DataFrame, processor_: Processor = Processor()):
    """ 
    
    """
    text = dataFrame['text'] + " "+ dataFrame['title']
    text = np.array(text).tolist()
    text_processed = []
    for i in text:
        text_processed.append(processor_.preprocessing_pipeline(i))
    return text_processed, dataFrame


text_process, dataFrame2 = apply_process(df)
text_process, dataFrame2

(['bear father follow research bear physician early wound 19yearold wind serious perforate really expect really around wind fuse research information really examine father publish observations early knowledge food another mechanical activity famous outlive though wind heal',
  'author bear first writer teachers teacher secondary write publish first literary publish first publish family teach completely write publish however famous publish family catch tomte meanwhile white please accomplish back around back',
  'french bear seas reduce east bear family french first father educate act barthelemy french french expedition follow profession father early french east french office read east directly thereby around first immediately first follow years finally officially finally treat buy effective choose however french french find guilty heavily fin father century thomas university history relate around famous architect',
  'freshwater2006 release famous character first city first sound thoug

In [9]:
# Paso 5: Vectorización
count_vect = CountVectorizer(lowercase=False,max_df=0.9)

print(f"count_vect: {count_vect}")
X = count_vect.fit_transform(text_process)

# X ahora contiene la matriz de características resultante
print(f"Data transformed: {X}")

count_vect: CountVectorizer(lowercase=False, max_df=0.9)
Data transformed:   (0, 221)	2
  (0, 693)	2
  (0, 750)	1
  (0, 1643)	2
  (0, 1461)	1
  (0, 596)	2
  (0, 2254)	1
  (0, 6)	1
  (0, 2226)	3
  (0, 1730)	1
  (0, 1410)	1
  (0, 1572)	3
  (0, 662)	1
  (0, 140)	1
  (0, 816)	1
  (0, 1057)	1
  (0, 653)	1
  (0, 1540)	1
  (0, 1323)	1
  (0, 1151)	1
  (0, 753)	1
  (0, 95)	1
  (0, 1226)	1
  (0, 37)	1
  (0, 683)	1
  :	:
  (329, 2046)	1
  (329, 1851)	1
  (329, 834)	1
  (330, 221)	1
  (330, 596)	1
  (330, 728)	3
  (330, 992)	1
  (330, 663)	5
  (330, 2149)	1
  (330, 407)	1
  (330, 645)	1
  (330, 638)	2
  (330, 1012)	1
  (330, 1405)	1
  (330, 1558)	1
  (330, 1561)	1
  (330, 2110)	1
  (330, 1803)	1
  (330, 779)	2
  (330, 329)	2
  (330, 1259)	1
  (330, 1141)	1
  (330, 1805)	1
  (330, 806)	1
  (330, 2086)	1


In [10]:
#RESULTADO ESPERADO
print('Dimensiones de la matriz X:', X.shape)

print('Contenido de la matriz X:\n', X.toarray())

print('Cantidad de documentos:', X.shape[0])
print('Cantidad de palabras:', X.shape[1])
print('Cantidad de ocurrencias:', X.sum())



Dimensiones de la matriz X: (331, 2274)
Contenido de la matriz X:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Cantidad de documentos: 331
Cantidad de palabras: 2274
Cantidad de ocurrencias: 18953


In [11]:
dataFrame2['identifier'].values

array(['001', '002', '003', '004', '005', '006', '007', '008', '009',
       '010', '011', '012', '013', '014', '015', '016', '017', '018',
       '019', '020', '021', '022', '023', '024', '025', '026', '027',
       '028', '029', '030', '031', '032', '033', '034', '035', '036',
       '037', '038', '039', '040', '041', '042', '043', '044', '045',
       '046', '047', '048', '049', '050', '051', '052', '053', '054',
       '055', '056', '057', '058', '059', '060', '061', '062', '063',
       '064', '065', '066', '067', '068', '069', '070', '071', '072',
       '073', '074', '075', '076', '077', '078', '079', '080', '081',
       '082', '083', '084', '085', '086', '087', '088', '089', '090',
       '091', '092', '093', '094', '095', '096', '097', '098', '099',
       '100', '101', '102', '103', '104', '105', '106', '107', '108',
       '109', '110', '111', '112', '113', '114', '115', '116', '117',
       '118', '119', '120', '121', '122', '123', '124', '125', '126',
       '127', '128',

In [42]:
vocabulario = count_vect.get_feature_names_out()
print(vocabulario)

# Convertir la matriz de conteos a un DataFrame
df = pd.DataFrame(X.toarray(), columns=[vocabulario])
df['identifier_files'] = dataFrame2['identifier'].values

df

['10foot' '11thcentury' '12thcentury' ... 'yellowstone' 'youngest' 'youth']


Unnamed: 0,10foot,11thcentury,12thcentury,18thcentury,1931with,1959the,19yearold,20round,20thcentury,36pwhipple,...,writers,wuthering,wwwyovistocomplay20275,years,yeast,yeats,yellowstone,youngest,youth,identifier_files
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,001
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,002
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,003
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,004
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,327
327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,328
328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,329
329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,330


In [13]:
dataFramePrueba = pd.DataFrame()

dataFramePrueba["identifier"] = ["001","002","003","004"]
dataFramePrueba["zum"] = [0, 0, 4, 3]
dataFramePrueba["zu"] = [15, 13, 0 ,1]
dataFramePrueba["hello"] = [0, 1 , 1 ,0]

In [14]:
dataFramePrueba

Unnamed: 0,identifier,zum,zu,hello
0,1,0,15,0
1,2,0,13,1
2,3,4,0,1
3,4,3,1,0


In [15]:
a = df["identifier_files"].iloc[0]
a


identifier_files    001
Name: 0, dtype: object

In [16]:
#Prueba funciona

inverted_index_dicc = {}
for i in dataFramePrueba.columns.tolist():
    list_variable = []
    if i != "identifier":
        for j in range(0,len(dataFramePrueba[i])):
            if dataFramePrueba[i].iloc[j] > 0:
                list_variable.append(dataFramePrueba["identifier"].iloc[j])
        print(i,list_variable)
        inverted_index_dicc[i] = sorted(list_variable)
print(inverted_index_dicc)

zum ['003', '004']
zu ['001', '002', '004']
hello ['002', '003']
{'zum': ['003', '004'], 'zu': ['001', '002', '004'], 'hello': ['002', '003']}


In [44]:
df.columns = [col[0] if isinstance(col, tuple) else col for col in df.columns]


In [49]:
inverted_index_dicc = {}
for i in df.columns.tolist():
    list_variable = []
    if i != "identifier_files":
        for j in range(0, len(df[i])):
            if df[i].iloc[j] > 0:
                list_variable.append(df["identifier_files"].iloc[j])  # .item() para obtener el valor real
        print(i, list_variable)
        inverted_index_dicc[i] = sorted(list_variable)
print(inverted_index_dicc)


10foot ['155']
11thcentury ['080']
12thcentury ['311']
18thcentury ['305']
1931with ['271']
1959the ['199']
19yearold ['001']
20round ['157']
20thcentury ['159', '244']
36pwhipple ['061']
37inch ['081']
3rdcentury ['284']
3sphere ['181']
61inch ['127']
abhor ['215']
abolish ['275']
absinthe ['052']
absorption ['030', '110', '121']
accompany ['014', '126', '127']
accomplish ['002', '006', '044', '054', '072', '115', '158', '239', '247', '277', '282', '298']
accomplishment ['243', '328']
accomplishments ['244']
account ['015', '025', '036', '067', '068', '069', '072', '079', '107', '117', '134', '137', '164', '175', '190', '195', '208', '231', '272', '275', '284', '298']
accuse ['009', '086', '105', '157', '158', '183', '188']
achieve ['328']
achromatic ['040']
acknowledge ['041', '056', '093', '140', '189', '191', '203', '222', '259', '260', '322']
acknowledgement ['211']
acknowledgment ['251']
acoustic ['052']
acoustical ['312']
acoustics ['016', '122', '312']
act ['003', '015', '038',

In [50]:
inverted_index_dicc

{'10foot': ['155'],
 '11thcentury': ['080'],
 '12thcentury': ['311'],
 '18thcentury': ['305'],
 '1931with': ['271'],
 '1959the': ['199'],
 '19yearold': ['001'],
 '20round': ['157'],
 '20thcentury': ['159', '244'],
 '36pwhipple': ['061'],
 '37inch': ['081'],
 '3rdcentury': ['284'],
 '3sphere': ['181'],
 '61inch': ['127'],
 'abhor': ['215'],
 'abolish': ['275'],
 'absinthe': ['052'],
 'absorption': ['030', '110', '121'],
 'accompany': ['014', '126', '127'],
 'accomplish': ['002',
  '006',
  '044',
  '054',
  '072',
  '115',
  '158',
  '239',
  '247',
  '277',
  '282',
  '298'],
 'accomplishment': ['243', '328'],
 'accomplishments': ['244'],
 'account': ['015',
  '025',
  '036',
  '067',
  '068',
  '069',
  '072',
  '079',
  '107',
  '117',
  '134',
  '137',
  '164',
  '175',
  '190',
  '195',
  '208',
  '231',
  '272',
  '275',
  '284',
  '298'],
 'accuse': ['009', '086', '105', '157', '158', '183', '188'],
 'achieve': ['328'],
 'achromatic': ['040'],
 'acknowledge': ['041',
  '056',
  '