# Dataset creation for classification

In [1]:
#!pip install tensorflow

In [3]:
import pandas as pd
import nltk
from utils_processor.processor import Processor
import logging

In [4]:
processor_ = Processor()

In [5]:
import os
import re

# Directorio donde están los archivos .txt
data_dir = 'data/books/'

# Lista para almacenar los textos y autores
texts = []
authors = []

# Leer todos los archivos .txt del directorio
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):

        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
            text = f.read()
            texts.append(text)  # Almacenar el texto
            print(f"Procesando archivo: {filename}, lineas: ", len(text))
            
            # Buscar el nombre del autor
            author_match = re.search(r'Author:\s*(.+)', text)
            if author_match:
                author_name = author_match.group(1).strip()
                authors.append(author_name.lower())  # Almacenar el autor
            else:
                authors.append("Autor no encontrado")  # En caso de no encontrarlo
            
            # Mostrar un fragmento del texto (opcional)
            #print(text[:2500])

# Mostrar los autores encontrados
print("Lista de autores en orden:")
for i, author in enumerate(authors):
    print(f"{i+1}. {author}")

# Ahora tienes dos listas: 'texts' con los textos y 'authors' con los autores en el mismo orden
authors

Procesando archivo: ATaleofTwoCities_Dickens.txt, lineas:  776878
Procesando archivo: BleakHouse_Dickens.txt, lineas:  1958792
Procesando archivo: CountofMonteCristo_Dumas.txt, lineas:  2646641
Procesando archivo: CrimeAndPunishment_dostoyevski.txt, lineas:  1154409
Procesando archivo: OliverTwist_Dickens.txt, lineas:  912421
Procesando archivo: TheGambler_dostoyevski.txt, lineas:  350954
Procesando archivo: TheIdiot_dostoyevski.txt, lineas:  1366983
Procesando archivo: TheThreeMusketeers_Dumas.txt, lineas:  1317339
Procesando archivo: TwentyYearsAfter_Dumas.txt, lineas:  1387344
Lista de autores en orden:
1. charles dickens
2. charles dickens
3. alexandre dumas
4. fyodor dostoyevsky
5. charles dickens
6. fyodor dostoyevsky
7. fyodor dostoyevsky
8. alexandre dumas
9. alexandre dumas


['charles dickens',
 'charles dickens',
 'alexandre dumas',
 'fyodor dostoyevsky',
 'charles dickens',
 'fyodor dostoyevsky',
 'fyodor dostoyevsky',
 'alexandre dumas',
 'alexandre dumas']

In [6]:
def process_all_texts(processor = Processor(), texts: list = []):
    """
    Processes a list of texts and logs progress for each one, using the Processor class.
    
    Args:
        processor (Processor): An instance of the Processor class.
        texts (list): A list of text strings to process.
    
    Returns:
        list: A list of processed texts.
    """
    total = len(texts)
    processed_texts = []
    
    for index, text in enumerate(texts):
        processed_text = processor.preprocessing_pipeline_as_chunks(text, index, total)
        processed_texts.append(processed_text)  # Guardamos el texto procesado como lista de tokens
    
    return processed_texts

In [7]:
# Procesar todos los textos con el sistema de logging
processed_texts = process_all_texts(processor_, texts)

In [8]:
text_chunks = []
chunk_authors = []

for i, text_list in enumerate(processed_texts):
    author = authors[i]
    for chunk in text_list:
        text_chunks.append(chunk)  # Agregar cada chunk de texto
        chunk_authors.append(author)  # Agregar el autor correspondiente

# Crear un DataFrame con las listas
df_chunks = pd.DataFrame({
    'text_chunk': text_chunks,
    'author': chunk_authors
})


In [9]:
df_chunks

Unnamed: 0,text_chunk,author
0,tale num citi tale num citi stori french revol...,charles dickens
1,chapter xv knit chapter xvi still knit chapter...,charles dickens
2,life chapter period best time worst time age w...,charles dickens
3,king larg jaw queen plain face throne england ...,charles dickens
4,london westminst even cocklan ghost laid round...,charles dickens
...,...,...
16890,critic reach project gutenberg goal ensur proj...,alexandre dumas
16891,num contribut project gutenberg literari archi...,alexandre dumas
16892,num num particular import maintain tax exempt ...,alexandre dumas
16893,us offer donat intern donat grate accept make ...,alexandre dumas


1. Prepare the text data

We already have the processed texts stored in a list called processed_texts. Each element in this list represents the chunks of text (after splitting) for a particular book.
Each entry in processed_texts is a list where each element is a chunk of text for that book, processed based on the method we implemented for splitting into chunks of 150 words with a 25-word overlap

2. Prepare the author labels

We have an authors list that stores the corresponding author for each book in processed_texts. Each author appears multiple times if they have multiple books in the dataset. For example:
python

authors = ['dostoyevski', 'poe', 'dostoyevski', 'dostoyevski', 'well', 'poe', 'poe', 'well', 'well']

3. Create the DataFrame structure

For each processed book (i.e., processed_texts[i]), we know that all the chunks of that book correspond to a specific author. So we can assign the same author to all the chunks in that list.
We will loop over each entry in processed_texts and for each chunk, add it to a DataFrame, along with the corresponding author.

4. Steps to build the DataFrame

* Initialize lists for the DataFrame: We will initialize two lists: one for text chunks and one for authors.
* Iterate over processed_texts: For each entry in processed_texts, we extract the list of chunks and the corresponding author.
* Add chunks and authors to the lists: For each chunk in the list of text chunks, we append it to the "text_chunk" list and the corresponding author to the "author" list.
* Create the DataFrame: Once the lists are filled, we create a pandas DataFrame with two columns: "text_chunk" and "author".

In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming df_chunks is the dataframe with the columns ['text_chunk', 'author']

# Step 1: Split the dataset into 70% training and 30% test
train_df, test_df = train_test_split(df_chunks, test_size=0.30, stratify=df_chunks['author'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['author'], random_state=42)


In [11]:
train_df

Unnamed: 0,text_chunk,author
10344,elbow beg pardon said dodger look air abstract...,charles dickens
2552,pursu say emphat william guppi drop mr guppi a...,charles dickens
13609,secret use know anyth said young woman instinc...,alexandre dumas
3369,look keep secret condescens present visit feel...,charles dickens
9896,jew sooner alon counten resum former express a...,charles dickens
...,...,...
5799,better inform know owner hors shut cri pit cho...,alexandre dumas
8183,began count third excus would say fanci made m...,fyodor dostoyevsky
13251,take away commiss give mademoisell de chemerau...,alexandre dumas
4851,must curios natur island mass rock contain acr...,alexandre dumas


In [12]:
val_df

Unnamed: 0,text_chunk,author
5881,deceiv play joke excel read ah true said mont ...,alexandre dumas
882,heel head wish wos still say prewar sir let bo...,charles dickens
16471,would choos num atho artagnan said noth silenc...,alexandre dumas
3294,littl earlier morn keep account attend houseke...,charles dickens
13959,shall get back upon lackey hors _pardieu_ anyb...,alexandre dumas
...,...,...
8226,heart long anoth father polya papa fear angri ...,fyodor dostoyevsky
11409,note often grow paler take princ took note fer...,fyodor dostoyevsky
7376,room first floor room whitewash custom prison ...,alexandre dumas
1898,time littl woman ad rub head signific settl ye...,charles dickens


In [13]:
test_df

Unnamed: 0,text_chunk,author
12339,quit sure reach culmin point happi num day saw...,fyodor dostoyevsky
4185,spain itali mercédè father could join fear liv...,alexandre dumas
15378,found pale fatigu inquir whether ill fact said...,alexandre dumas
9926,empti comfort said mrs corney much inde said b...,charles dickens
6920,crush singl touch word breath yes self thought...,alexandre dumas
...,...,...
2448,poor dear girl found much admir good disposit ...,charles dickens
14473,dispos convers reclin corner carriag num pass ...,alexandre dumas
14223,smile indic knew stori well wish relat recomme...,alexandre dumas
16470,shall begin portho arami drew back disappoint ...,alexandre dumas


In [14]:

def summary_by_author(train_df, validation_df, test_df):
    """
    Generates a summary table showing the number of samples per author for the training, validation, and testing sets.
    
    Args:
        train_df (pd.DataFrame): Training DataFrame.
        validation_df (pd.DataFrame): Validation DataFrame.
        test_df (pd.DataFrame): Testing DataFrame.
        
    Returns:
        pd.DataFrame: A summary DataFrame.
    """
    
    summary_data = {
        'Author': train_df['author'].unique(),
        'Train': train_df['author'].value_counts(),
        'Validation': validation_df['author'].value_counts(),
        'Test': test_df['author'].value_counts()
    }
    
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.fillna(0)  # Replace NaN with 0 if no samples exist for some authors
    
    return summary_df

In [17]:
summary_by_author(train_df, val_df, test_df)

Unnamed: 0_level_0,Author,Train,Validation,Test
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
alexandre dumas,charles dickens,4744,527,2260
charles dickens,alexandre dumas,3307,368,1575
fyodor dostoyevsky,fyodor dostoyevsky,2592,288,1234


Feed Forward

In [18]:
from gensim.models import Word2Vec

# Cargar el modelo Word2Vec
word2vec_model = Word2Vec.load('data/answers/Books_1000_6.model')

# Obtener la matriz de embeddings
embedding_matrix = word2vec_model.wv.vectors  # Matriz de vectores

# Tamaño del vocabulario y dimensión de los embeddings
vocab_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]

print(f"Vocab Size: {vocab_size}, Embedding Dimension: {embedding_dim}")


Vocab Size: 27219, Embedding Dimension: 1000


In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Crear el diccionario de palabras a índices basado en el modelo Word2Vec
tokenizer = Tokenizer()
tokenizer.word_index = {word: idx for idx, word in enumerate(word2vec_model.wv.index_to_key)}

# Convertir los textos en secuencias de índices
train_sequences = tokenizer.texts_to_sequences(train_df['text_chunk'].tolist())
val_sequences = tokenizer.texts_to_sequences(val_df['text_chunk'].tolist())
test_sequences = tokenizer.texts_to_sequences(test_df['text_chunk'].tolist())

# Rellenar las secuencias para que todas tengan la misma longitud
maxlen = 200 
train_sequences_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=maxlen, padding='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post')


In [29]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Inicializar el codificador
label_encoder = LabelEncoder()

# Codificar las etiquetas de los autores como enteros
train_labels_encoded = label_encoder.fit_transform(train_df['author'])
val_labels_encoded = label_encoder.transform(val_df['author'])
test_labels_encoded = label_encoder.transform(test_df['author'])

# Convertir a formato de una-hot (one-hot encoding)
train_labels_onehot = to_categorical(train_labels_encoded)
val_labels_onehot = to_categorical(val_labels_encoded)
test_labels_onehot = to_categorical(test_labels_encoded)


In [30]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten, Dropout

def model_1(vocab_size, embedding_dim, embedding_matrix, maxlen):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=maxlen, trainable=False))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # Tres clases
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def model_2(vocab_size, embedding_dim, embedding_matrix, maxlen):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=maxlen, trainable=False))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def model_3(vocab_size, embedding_dim, embedding_matrix, maxlen):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=maxlen, trainable=False))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [31]:
from keras.callbacks import EarlyStopping

# Definir EarlyStopping para que detenga el entrenamiento si no hay mejora en 5 épocas
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


In [32]:
# Entrenar la primera red
model1 = model_1(vocab_size, embedding_dim, embedding_matrix, maxlen)
model1.fit(train_sequences_padded, train_labels_onehot, 
           validation_data=(val_sequences_padded, val_labels_onehot), epochs=1000, callbacks=[early_stopping])


Epoch 1/1000




[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 51ms/step - accuracy: 0.4268 - loss: 1.7509 - val_accuracy: 0.4455 - val_loss: 1.0726
Epoch 2/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 50ms/step - accuracy: 0.4445 - loss: 1.0716 - val_accuracy: 0.4455 - val_loss: 1.0679
Epoch 3/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 49ms/step - accuracy: 0.4430 - loss: 1.0674 - val_accuracy: 0.4455 - val_loss: 1.0675
Epoch 4/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 46ms/step - accuracy: 0.4402 - loss: 1.0714 - val_accuracy: 0.4455 - val_loss: 1.0674
Epoch 5/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 52ms/step - accuracy: 0.4465 - loss: 1.0676 - val_accuracy: 0.4455 - val_loss: 1.0674
Epoch 6/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 50ms/step - accuracy: 0.4544 - loss: 1.0622 - val_accuracy: 0.4455 - val_loss: 1.0674
Epoch 7/1000
[1m

<keras.src.callbacks.history.History at 0x1c6614e5d10>

In [33]:
# Entrenar la segunda red
model2 = model_2(vocab_size, embedding_dim, embedding_matrix, maxlen)
model2.fit(train_sequences_padded, train_labels_onehot, 
           validation_data=(val_sequences_padded, val_labels_onehot), epochs=1000, callbacks=[early_stopping])



Epoch 1/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 89ms/step - accuracy: 0.4277 - loss: 2.1973 - val_accuracy: 0.4455 - val_loss: 1.0690
Epoch 2/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 87ms/step - accuracy: 0.4500 - loss: 1.0667 - val_accuracy: 0.4455 - val_loss: 1.0677
Epoch 3/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 85ms/step - accuracy: 0.4415 - loss: 1.0699 - val_accuracy: 0.4455 - val_loss: 1.0674
Epoch 4/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 88ms/step - accuracy: 0.4366 - loss: 1.0709 - val_accuracy: 0.4455 - val_loss: 1.0675
Epoch 5/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 86ms/step - accuracy: 0.4427 - loss: 1.0690 - val_accuracy: 0.4455 - val_loss: 1.0674
Epoch 6/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 88ms/step - accuracy: 0.4462 - loss: 1.0678 - val_accuracy: 0.4455 - val_loss: 1.0674
Epoc

<keras.src.callbacks.history.History at 0x1c664f3f850>

In [34]:
# Entrenar la tercera red
model3 = model_3(vocab_size, embedding_dim, embedding_matrix, maxlen)
model3.fit(train_sequences_padded, train_labels_onehot, 
           validation_data=(val_sequences_padded, val_labels_onehot), epochs=1000, callbacks=[early_stopping])

Epoch 1/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 162ms/step - accuracy: 0.4060 - loss: 4.1586 - val_accuracy: 0.4455 - val_loss: 1.0682
Epoch 2/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 170ms/step - accuracy: 0.4449 - loss: 1.0749 - val_accuracy: 0.4455 - val_loss: 1.0675
Epoch 3/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 171ms/step - accuracy: 0.4462 - loss: 1.0693 - val_accuracy: 0.4455 - val_loss: 1.0676
Epoch 4/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 161ms/step - accuracy: 0.4508 - loss: 1.0656 - val_accuracy: 0.4455 - val_loss: 1.0675
Epoch 5/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 160ms/step - accuracy: 0.4449 - loss: 1.0684 - val_accuracy: 0.4455 - val_loss: 1.0675
Epoch 6/1000
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 159ms/step - accuracy: 0.4419 - loss: 1.0699 - val_accuracy: 0.4455 - val_loss: 1.067

<keras.src.callbacks.history.History at 0x1c665852950>

Metricas frente a test


In [35]:
# Obtener las predicciones del modelo sobre el conjunto de prueba
predictions = model1.predict(test_sequences_padded)


[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [36]:
import numpy as np

# Convertir las probabilidades a etiquetas (seleccionando la clase con la mayor probabilidad)
predicted_classes = np.argmax(predictions, axis=1)


In [37]:
predicted_classes

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calcular la precisión (accuracy)
accuracy = accuracy_score(test_labels_encoded, predicted_classes)
print(f"Accuracy: {accuracy}")

# Imprimir un informe de clasificación detallado
print("Classification Report:")
print(classification_report(test_labels_encoded, predicted_classes))

# Imprimir la matriz de confusión
print("Confusion Matrix:")
print(confusion_matrix(test_labels_encoded, predicted_classes))


Accuracy: 0.44584730716117577
Classification Report:
              precision    recall  f1-score   support

           0       0.45      1.00      0.62      2260
           1       0.00      0.00      0.00      1575
           2       0.00      0.00      0.00      1234

    accuracy                           0.45      5069
   macro avg       0.15      0.33      0.21      5069
weighted avg       0.20      0.45      0.27      5069

Confusion Matrix:
[[2260    0    0]
 [1575    0    0]
 [1234    0    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


: 