# Dataset creation for classification

In [1]:
#!pip install tensorflow

In [2]:
import pandas as pd
import nltk
from utils_processor.processor import Processor
import logging

In [3]:
processor_ = Processor()

In [4]:
import os
import re

# Directorio donde están los archivos .txt
data_dir = 'data/books/'

# Lista para almacenar los textos y autores
texts = []
authors = []

# Leer todos los archivos .txt del directorio
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):

        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
            text = f.read()
            texts.append(text)  # Almacenar el texto
            print(f"Procesando archivo: {filename}, lineas: ", len(text))
            
            # Buscar el nombre del autor
            author_match = re.search(r'Author:\s*(.+)', text)
            if author_match:
                author_name = author_match.group(1).strip()
                authors.append(author_name.lower())  # Almacenar el autor
            else:
                authors.append("Autor no encontrado")  # En caso de no encontrarlo
            
            # Mostrar un fragmento del texto (opcional)
            #print(text[:2500])

# Mostrar los autores encontrados
print("Lista de autores en orden:")
for i, author in enumerate(authors):
    print(f"{i+1}. {author}")

# Ahora tienes dos listas: 'texts' con los textos y 'authors' con los autores en el mismo orden
authors

Procesando archivo: CrimeAndPunishment_dostoyevski.txt
Procesando archivo: TheFallOfTheHouseOfUsher_EdgarAllanPoe.txt
Procesando archivo: TheGambler_dostoyevski.txt
Procesando archivo: TheIdiot_dostoyevski.txt
Procesando archivo: TheInvisibleMan_Wells.txt
Procesando archivo: TheMasqueOfTheRedDeath_EdgarAllanPoe.txt
Procesando archivo: TheRaven_EdgarAllanPoe.txt
Procesando archivo: TheSleeperAwakes_Wells.txt
Procesando archivo: TheWarOfTheWorlds_Wells.txt
Lista de autores en orden:
1. fyodor dostoyevsky
2. edgar allan poe
3. fyodor dostoyevsky
4. fyodor dostoyevsky
5. h. g. wells
6. edgar allan poe
7. edgar allan poe
8. h. g. wells
9. h. g. wells


['fyodor dostoyevsky',
 'edgar allan poe',
 'fyodor dostoyevsky',
 'fyodor dostoyevsky',
 'h. g. wells',
 'edgar allan poe',
 'edgar allan poe',
 'h. g. wells',
 'h. g. wells']

In [5]:
def process_all_texts(processor = Processor(), texts: list = []):
    """
    Processes a list of texts and logs progress for each one, using the Processor class.
    
    Args:
        processor (Processor): An instance of the Processor class.
        texts (list): A list of text strings to process.
    
    Returns:
        list: A list of processed texts.
    """
    total = len(texts)
    processed_texts = []
    
    for index, text in enumerate(texts):
        processed_text = processor.preprocessing_pipeline_as_chunks(text, index, total)
        processed_texts.append(processed_text)  # Guardamos el texto procesado como lista de tokens
    
    return processed_texts

In [6]:
# Procesar todos los textos con el sistema de logging
processed_texts = process_all_texts(processor_, texts)

In [7]:
text_chunks = []
chunk_authors = []

for i, text_list in enumerate(processed_texts):
    author = authors[i]
    for chunk in text_list:
        text_chunks.append(chunk)  # Agregar cada chunk de texto
        chunk_authors.append(author)  # Agregar el autor correspondiente

# Crear un DataFrame con las listas
df_chunks = pd.DataFrame({
    'text_chunk': text_chunks,
    'author': chunk_authors
})


In [8]:
df_chunks

Unnamed: 0,text_chunk,author
0,crime punish crime punish fyodor dostoevski tr...,fyodor dostoyevsky
1,acclam shi unknown youth found instant someth ...,fyodor dostoyevsky
2,semyonovski squar shot write brother mihail do...,fyodor dostoyevsky
3,went mad soon unti never regain saniti intens ...,fyodor dostoyevsky
4,develop violent attack epilepsi suffer rest li...,fyodor dostoyevsky
...,...,...
5860,num volunt donat peopl walk life volunt financ...,h. g. wells
5861,state mississippi grant tax exempt status inte...,h. g. wells
5862,licens work freeli distribut machineread form ...,h. g. wells
5863,met solicit requir know prohibit accept unsoli...,h. g. wells


1. Prepare the text data

We already have the processed texts stored in a list called processed_texts. Each element in this list represents the chunks of text (after splitting) for a particular book.
Each entry in processed_texts is a list where each element is a chunk of text for that book, processed based on the method we implemented for splitting into chunks of 150 words with a 25-word overlap

2. Prepare the author labels

We have an authors list that stores the corresponding author for each book in processed_texts. Each author appears multiple times if they have multiple books in the dataset. For example:
python

authors = ['dostoyevski', 'poe', 'dostoyevski', 'dostoyevski', 'well', 'poe', 'poe', 'well', 'well']

3. Create the DataFrame structure

For each processed book (i.e., processed_texts[i]), we know that all the chunks of that book correspond to a specific author. So we can assign the same author to all the chunks in that list.
We will loop over each entry in processed_texts and for each chunk, add it to a DataFrame, along with the corresponding author.

4. Steps to build the DataFrame

* Initialize lists for the DataFrame: We will initialize two lists: one for text chunks and one for authors.
* Iterate over processed_texts: For each entry in processed_texts, we extract the list of chunks and the corresponding author.
* Add chunks and authors to the lists: For each chunk in the list of text chunks, we append it to the "text_chunk" list and the corresponding author to the "author" list.
* Create the DataFrame: Once the lists are filled, we create a pandas DataFrame with two columns: "text_chunk" and "author".

In [9]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming df_chunks is the dataframe with the columns ['text_chunk', 'author']

# Step 1: Split the dataset into 70% training and 30% test
train_df, test_df = train_test_split(df_chunks, test_size=0.30, stratify=df_chunks['author'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['author'], random_state=42)


In [10]:
train_df

Unnamed: 0,text_chunk,author
2328,_not_ presum eh strike princ ask gania sudden ...,fyodor dostoyevsky
5225,man assur propraiet r assur propraiet r propri...,h. g. wells
2100,larg soon obtain stand room among ring gambler...,fyodor dostoyevsky
4328,slightest doubt could kill get away quit easil...,h. g. wells
620,last right path pyotr petrovitch right path th...,fyodor dostoyevsky
...,...,...
1523,heart rodya num num day ago food cloth way liv...,fyodor dostoyevsky
3564,think pistol bound go consist whole affair sur...,fyodor dostoyevsky
3522,let becom attach last illus life love mean tri...,fyodor dostoyevsky
1640,email within num day receipt agre term full pr...,fyodor dostoyevsky


In [11]:
val_df

Unnamed: 0,text_chunk,author
3720,evid mind hippolyt look furious restrain quit ...,fyodor dostoyevsky
982,mother beg found mother live surround children...,fyodor dostoyevsky
2436,_now_ die listen rasp iron head lay certain li...,fyodor dostoyevsky
4451,although blow realli hurt found someth irresis...,h. g. wells
5049,openwork stage distanc start time peac servic ...,h. g. wells
...,...,...
5621,way tri clutch bit hors could get heard scream...,h. g. wells
2778,abl give littl inform princ away num month eve...,fyodor dostoyevsky
2397,hungri yes come along princ said mother hungri...,fyodor dostoyevsky
2489,sens well perhap sens realli great thing smile...,fyodor dostoyevsky


In [12]:
test_df

Unnamed: 0,text_chunk,author
2166,omit caress depart condit would refus say want...,fyodor dostoyevsky
4879,good purpos good purpos warn warn consequ come...,h. g. wells
388,touch food num day must tell rodya dine like e...,fyodor dostoyevsky
2087,twist face innumer wrinkl caus eye almost disa...,fyodor dostoyevsky
4292,room close crowd invis eh said huxter ignor st...,h. g. wells
...,...,...
688,everyon frown raskolnikov sat seem pay attent ...,fyodor dostoyevsky
4413,light would reflect refract would get brillian...,h. g. wells
3687,time would never mention gania attitud modest ...,fyodor dostoyevsky
4859,ah said graham forgot everyth els sat chair wi...,h. g. wells


In [13]:

def summary_by_author(train_df, validation_df, test_df):
    """
    Generates a summary table showing the number of samples per author for the training, validation, and testing sets.
    
    Args:
        train_df (pd.DataFrame): Training DataFrame.
        validation_df (pd.DataFrame): Validation DataFrame.
        test_df (pd.DataFrame): Testing DataFrame.
        
    Returns:
        pd.DataFrame: A summary DataFrame.
    """
    
    summary_data = {
        'Author': train_df['author'].unique(),
        'Train': train_df['author'].value_counts(),
        'Validation': validation_df['author'].value_counts(),
        'Test': test_df['author'].value_counts()
    }
    
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.fillna(0)  # Replace NaN with 0 if no samples exist for some authors
    
    return summary_df

In [14]:
summary_by_author(train_df, val_df, test_df)

Unnamed: 0_level_0,Author,Train,Validation,Test
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fyodor dostoyevsky,fyodor dostoyevsky,2591,288,1235
h. g. wells,h. g. wells,974,108,464
edgar allan poe,edgar allan poe,129,15,61


Feed Forward

In [15]:
from gensim.models import Word2Vec

# Cargar el modelo Word2Vec
word2vec_model = Word2Vec.load('data/answers/Books_1000_6.model')

# Obtener la matriz de embeddings
embedding_matrix = word2vec_model.wv.vectors  # Matriz de vectores

# Tamaño del vocabulario y dimensión de los embeddings
vocab_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]

print(f"Vocab Size: {vocab_size}, Embedding Dimension: {embedding_dim}")


Vocab Size: 15888, Embedding Dimension: 1000


In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Crear el diccionario de palabras a índices basado en el modelo Word2Vec
tokenizer = Tokenizer()
tokenizer.word_index = {word: idx for idx, word in enumerate(word2vec_model.wv.index_to_key)}

# Convertir los textos en secuencias de índices
train_sequences = tokenizer.texts_to_sequences(train_df['text_chunk'].tolist())
val_sequences = tokenizer.texts_to_sequences(val_df['text_chunk'].tolist())
test_sequences = tokenizer.texts_to_sequences(test_df['text_chunk'].tolist())

# Rellenar las secuencias para que todas tengan la misma longitud
maxlen = 100  # Puedes ajustar este valor según la longitud típica de tus secuencias
train_sequences_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=maxlen, padding='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post')


In [31]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Inicializar el codificador
label_encoder = LabelEncoder()

# Codificar las etiquetas de los autores como enteros
train_labels_encoded = label_encoder.fit_transform(train_df['author'])
val_labels_encoded = label_encoder.transform(val_df['author'])
test_labels_encoded = label_encoder.transform(test_df['author'])

# Convertir a formato de una-hot (one-hot encoding)
train_labels_onehot = to_categorical(train_labels_encoded)
val_labels_onehot = to_categorical(val_labels_encoded)
test_labels_onehot = to_categorical(test_labels_encoded)


In [38]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten, Dropout

def model_1(vocab_size, embedding_dim, embedding_matrix, maxlen):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=maxlen, trainable=False))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # Tres clases
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def model_2(vocab_size, embedding_dim, embedding_matrix, maxlen):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=maxlen, trainable=False))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def model_3(vocab_size, embedding_dim, embedding_matrix, maxlen):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=maxlen, trainable=False))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [43]:
from keras.callbacks import EarlyStopping

# Definir EarlyStopping para que detenga el entrenamiento si no hay mejora en 5 épocas
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


In [44]:
# Entrenar la primera red
model1 = model_1(vocab_size, embedding_dim, embedding_matrix, maxlen)
model1.fit(train_sequences_padded, train_labels_onehot, 
           validation_data=(val_sequences_padded, val_labels_onehot), epochs=1000, callbacks=[early_stopping])


Epoch 1/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 46ms/step - accuracy: 0.6674 - loss: 1.1671 - val_accuracy: 0.7007 - val_loss: 0.6407
Epoch 2/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.7018 - loss: 0.8272 - val_accuracy: 0.7007 - val_loss: 0.7052
Epoch 3/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.6988 - loss: 0.8160 - val_accuracy: 0.7007 - val_loss: 0.6608
Epoch 4/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.7075 - loss: 0.7824 - val_accuracy: 0.7007 - val_loss: 0.6396
Epoch 5/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.6990 - loss: 0.7689 - val_accuracy: 0.7007 - val_loss: 0.6554
Epoch 6/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.6919 - loss: 0.7518 - val_accuracy: 0.7007 - val_loss: 0.6573
Epoch 7/10

<keras.src.callbacks.history.History at 0x2c73858be90>

In [45]:
# Entrenar la segunda red
model2 = model_2(vocab_size, embedding_dim, embedding_matrix, maxlen)
model2.fit(train_sequences_padded, train_labels_onehot, 
           validation_data=(val_sequences_padded, val_labels_onehot), epochs=1000, callbacks=[early_stopping])



Epoch 1/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 72ms/step - accuracy: 0.5914 - loss: 2.4121 - val_accuracy: 0.7007 - val_loss: 0.9292
Epoch 2/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 68ms/step - accuracy: 0.6993 - loss: 0.8914 - val_accuracy: 0.7007 - val_loss: 0.7994
Epoch 3/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 71ms/step - accuracy: 0.7108 - loss: 0.7819 - val_accuracy: 0.7007 - val_loss: 0.7470
Epoch 4/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 71ms/step - accuracy: 0.6982 - loss: 0.7516 - val_accuracy: 0.7007 - val_loss: 0.7294
Epoch 5/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 70ms/step - accuracy: 0.7092 - loss: 0.7336 - val_accuracy: 0.7007 - val_loss: 0.7245
Epoch 6/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 72ms/step - accuracy: 0.7030 - loss: 0.7316 - val_accuracy: 0.7007 - val_loss: 0.7225
Epoch 7/1

<keras.src.callbacks.history.History at 0x2c739288790>

In [46]:
# Entrenar la tercera red
model3 = model_3(vocab_size, embedding_dim, embedding_matrix, maxlen)
model3.fit(train_sequences_padded, train_labels_onehot, 
           validation_data=(val_sequences_padded, val_labels_onehot), epochs=1000, callbacks=[early_stopping])

Epoch 1/1000




[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 124ms/step - accuracy: 0.5892 - loss: 2.6750 - val_accuracy: 0.7007 - val_loss: 0.9042
Epoch 2/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 123ms/step - accuracy: 0.6967 - loss: 0.8730 - val_accuracy: 0.7007 - val_loss: 0.7713
Epoch 3/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 123ms/step - accuracy: 0.7094 - loss: 0.7631 - val_accuracy: 0.7007 - val_loss: 0.7316
Epoch 4/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 138ms/step - accuracy: 0.7052 - loss: 0.7453 - val_accuracy: 0.7007 - val_loss: 0.7248
Epoch 5/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 129ms/step - accuracy: 0.7082 - loss: 0.7198 - val_accuracy: 0.7007 - val_loss: 0.7224
Epoch 6/1000
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 133ms/step - accuracy: 0.6995 - loss: 0.7403 - val_accuracy: 0.7007 - val_loss: 0.7220
Epoch 7/100

<keras.src.callbacks.history.History at 0x2c739b7be90>

Metricas frente a test


In [56]:
# Obtener las predicciones del modelo sobre el conjunto de prueba
predictions = model1.predict(test_sequences_padded)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [57]:
import numpy as np

# Convertir las probabilidades a etiquetas (seleccionando la clase con la mayor probabilidad)
predicted_classes = np.argmax(predictions, axis=1)


In [58]:
predicted_classes

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [59]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calcular la precisión (accuracy)
accuracy = accuracy_score(test_labels_encoded, predicted_classes)
print(f"Accuracy: {accuracy}")

# Imprimir un informe de clasificación detallado
print("Classification Report:")
print(classification_report(test_labels_encoded, predicted_classes))

# Imprimir la matriz de confusión
print("Confusion Matrix:")
print(confusion_matrix(test_labels_encoded, predicted_classes))


Accuracy: 0.7017045454545454
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.70      1.00      0.82      1235
           2       0.00      0.00      0.00       464

    accuracy                           0.70      1760
   macro avg       0.23      0.33      0.27      1760
weighted avg       0.49      0.70      0.58      1760

Confusion Matrix:
[[   0   61    0]
 [   0 1235    0]
 [   0  464    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
