# Dataset creation for classification

In [51]:
#!pip install tensorflow
#!pip install seaborn

In [1]:
import pandas as pd
import nltk
from utils_processor.processor import Processor
import logging

In [2]:
processor_ = Processor()

In [3]:
import os
import re

# Directorio donde están los archivos .txt
data_dir = 'data/books/'

# Lista para almacenar los textos y autores
texts = []
authors = []

# Leer todos los archivos .txt del directorio
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):

        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
            text = f.read()
            texts.append(text)  # Almacenar el texto
            print(f"Procesando archivo: {filename}, lineas: ", len(text))
            
            # Buscar el nombre del autor
            author_match = re.search(r'Author:\s*(.+)', text)
            if author_match:
                author_name = author_match.group(1).strip()
                authors.append(author_name.lower())  # Almacenar el autor
            else:
                authors.append("Autor no encontrado")  # En caso de no encontrarlo
            
            # Mostrar un fragmento del texto (opcional)
            #print(text[:2500])

# Mostrar los autores encontrados
print("Lista de autores en orden:")
for i, author in enumerate(authors):
    print(f"{i+1}. {author}")

# Ahora tienes dos listas: 'texts' con los textos y 'authors' con los autores en el mismo orden
authors

Procesando archivo: ATaleofTwoCities_Dickens.txt, lineas:  776878
Procesando archivo: BleakHouse_Dickens.txt, lineas:  1958792
Procesando archivo: CountofMonteCristo_Dumas.txt, lineas:  2646641
Procesando archivo: CrimeAndPunishment_dostoyevski.txt, lineas:  1154409
Procesando archivo: OliverTwist_Dickens.txt, lineas:  912421
Procesando archivo: TheGambler_dostoyevski.txt, lineas:  350954
Procesando archivo: TheIdiot_dostoyevski.txt, lineas:  1366983
Procesando archivo: TheThreeMusketeers_Dumas.txt, lineas:  1317339
Procesando archivo: TwentyYearsAfter_Dumas.txt, lineas:  1387344
Lista de autores en orden:
1. charles dickens
2. charles dickens
3. alexandre dumas
4. fyodor dostoyevsky
5. charles dickens
6. fyodor dostoyevsky
7. fyodor dostoyevsky
8. alexandre dumas
9. alexandre dumas


['charles dickens',
 'charles dickens',
 'alexandre dumas',
 'fyodor dostoyevsky',
 'charles dickens',
 'fyodor dostoyevsky',
 'fyodor dostoyevsky',
 'alexandre dumas',
 'alexandre dumas']

In [4]:
def process_all_texts(processor = Processor(), texts: list = []):
    """
    Processes a list of texts and logs progress for each one, using the Processor class.
    
    Args:
        processor (Processor): An instance of the Processor class.
        texts (list): A list of text strings to process.
    
    Returns:
        list: A list of processed texts.
    """
    total = len(texts)
    processed_texts = []
    
    for index, text in enumerate(texts):
        processed_text = processor.preprocessing_pipeline_as_chunks(text, index, total)
        processed_texts.append(processed_text)  # Guardamos el texto procesado como lista de tokens
    
    return processed_texts

In [5]:
# Procesar todos los textos con el sistema de logging
processed_texts = process_all_texts(processor_, texts)

In [6]:
text_chunks = []
chunk_authors = []

for i, text_list in enumerate(processed_texts):
    author = authors[i]
    for chunk in text_list:
        text_chunks.append(chunk)  # Agregar cada chunk de texto
        chunk_authors.append(author)  # Agregar el autor correspondiente

# Crear un DataFrame con las listas
df_chunks = pd.DataFrame({
    'text_chunk': text_chunks,
    'author': chunk_authors
})


In [7]:
df_chunks

Unnamed: 0,text_chunk,author
0,tale num citi tale num citi stori french revol...,charles dickens
1,chapter xv knit chapter xvi still knit chapter...,charles dickens
2,life chapter period best time worst time age w...,charles dickens
3,king larg jaw queen plain face throne england ...,charles dickens
4,london westminst even cocklan ghost laid round...,charles dickens
...,...,...
16890,critic reach project gutenberg goal ensur proj...,alexandre dumas
16891,num contribut project gutenberg literari archi...,alexandre dumas
16892,num num particular import maintain tax exempt ...,alexandre dumas
16893,us offer donat intern donat grate accept make ...,alexandre dumas


1. Prepare the text data

We already have the processed texts stored in a list called processed_texts. Each element in this list represents the chunks of text (after splitting) for a particular book.
Each entry in processed_texts is a list where each element is a chunk of text for that book, processed based on the method we implemented for splitting into chunks of 150 words with a 25-word overlap

2. Prepare the author labels

We have an authors list that stores the corresponding author for each book in processed_texts. Each author appears multiple times if they have multiple books in the dataset. For example:
python

authors = ['dostoyevski', 'poe', 'dostoyevski', 'dostoyevski', 'well', 'poe', 'poe', 'well', 'well']

3. Create the DataFrame structure

For each processed book (i.e., processed_texts[i]), we know that all the chunks of that book correspond to a specific author. So we can assign the same author to all the chunks in that list.
We will loop over each entry in processed_texts and for each chunk, add it to a DataFrame, along with the corresponding author.

4. Steps to build the DataFrame

* Initialize lists for the DataFrame: We will initialize two lists: one for text chunks and one for authors.
* Iterate over processed_texts: For each entry in processed_texts, we extract the list of chunks and the corresponding author.
* Add chunks and authors to the lists: For each chunk in the list of text chunks, we append it to the "text_chunk" list and the corresponding author to the "author" list.
* Create the DataFrame: Once the lists are filled, we create a pandas DataFrame with two columns: "text_chunk" and "author".

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming df_chunks is the dataframe with the columns ['text_chunk', 'author']

# Step 1: Split the dataset into 70% training and 30% test
train_df, test_df = train_test_split(df_chunks, test_size=0.30, stratify=df_chunks['author'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['author'], random_state=42)


In [9]:
train_df

Unnamed: 0,text_chunk,author
10344,elbow beg pardon said dodger look air abstract...,charles dickens
2552,pursu say emphat william guppi drop mr guppi a...,charles dickens
13609,secret use know anyth said young woman instinc...,alexandre dumas
3369,look keep secret condescens present visit feel...,charles dickens
9896,jew sooner alon counten resum former express a...,charles dickens
...,...,...
5799,better inform know owner hors shut cri pit cho...,alexandre dumas
8183,began count third excus would say fanci made m...,fyodor dostoyevsky
13251,take away commiss give mademoisell de chemerau...,alexandre dumas
4851,must curios natur island mass rock contain acr...,alexandre dumas


In [10]:
val_df

Unnamed: 0,text_chunk,author
5881,deceiv play joke excel read ah true said mont ...,alexandre dumas
882,heel head wish wos still say prewar sir let bo...,charles dickens
16471,would choos num atho artagnan said noth silenc...,alexandre dumas
3294,littl earlier morn keep account attend houseke...,charles dickens
13959,shall get back upon lackey hors _pardieu_ anyb...,alexandre dumas
...,...,...
8226,heart long anoth father polya papa fear angri ...,fyodor dostoyevsky
11409,note often grow paler take princ took note fer...,fyodor dostoyevsky
7376,room first floor room whitewash custom prison ...,alexandre dumas
1898,time littl woman ad rub head signific settl ye...,charles dickens


In [11]:
test_df

Unnamed: 0,text_chunk,author
12339,quit sure reach culmin point happi num day saw...,fyodor dostoyevsky
4185,spain itali mercédè father could join fear liv...,alexandre dumas
15378,found pale fatigu inquir whether ill fact said...,alexandre dumas
9926,empti comfort said mrs corney much inde said b...,charles dickens
6920,crush singl touch word breath yes self thought...,alexandre dumas
...,...,...
2448,poor dear girl found much admir good disposit ...,charles dickens
14473,dispos convers reclin corner carriag num pass ...,alexandre dumas
14223,smile indic knew stori well wish relat recomme...,alexandre dumas
16470,shall begin portho arami drew back disappoint ...,alexandre dumas


In [12]:

def summary_by_author(train_df, validation_df, test_df):
    """
    Generates a summary table showing the number of samples per author for the training, validation, and testing sets.
    
    Args:
        train_df (pd.DataFrame): Training DataFrame.
        validation_df (pd.DataFrame): Validation DataFrame.
        test_df (pd.DataFrame): Testing DataFrame.
        
    Returns:
        pd.DataFrame: A summary DataFrame.
    """
    
    summary_data = {
        'Author': train_df['author'].unique(),
        'Train': train_df['author'].value_counts(),
        'Validation': validation_df['author'].value_counts(),
        'Test': test_df['author'].value_counts()
    }
    
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.fillna(0)  # Replace NaN with 0 if no samples exist for some authors
    
    return summary_df

In [13]:
summary_by_author(train_df, val_df, test_df)

Unnamed: 0_level_0,Author,Train,Validation,Test
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
alexandre dumas,charles dickens,4744,527,2260
charles dickens,alexandre dumas,3307,368,1575
fyodor dostoyevsky,fyodor dostoyevsky,2592,288,1234


Feed Forward

Embedding 1000

In [14]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import Word2Vec


In [16]:
embedding_size = 300
word2vec_model = Word2Vec.load(f'data/answers/Books_{embedding_size}_6.model')

In [17]:
def prepare_data(df, processor_):
    X = []
    print("Procesando textos...")
    
    for text in df['text_chunk']:
        processed_tokens = processor_.preprocessing_pipeline(text)
        X.append(processed_tokens)
    
    # Convertir tokens a índices
    vocab = word2vec_model.wv.key_to_index
    X_indices = []
    for tokens in X:
        indices = []
        for token in tokens:
            indices.append(vocab.get(token, 0))  # 0 para tokens desconocidos
        X_indices.append(indices)
    
    return X_indices

In [18]:
X_train = prepare_data(train_df, processor_)
X_val = prepare_data(val_df, processor_)
X_test = prepare_data(test_df, processor_)

Procesando textos...
Procesando textos...
Procesando textos...


In [19]:
max_length = max(
    max(len(x) for x in X_train),
    max(len(x) for x in X_val),
    max(len(x) for x in X_test)
)
print(f"Longitud máxima de secuencia: {max_length}")

Longitud máxima de secuencia: 121


In [20]:
X_train_padded = pad_sequences(X_train, maxlen=max_length, padding='post')
X_val_padded = pad_sequences(X_val, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test, maxlen=max_length, padding='post')

In [21]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['author'])

In [22]:
y_train = label_encoder.transform(train_df['author'])
y_val = label_encoder.transform(val_df['author'])
y_test = label_encoder.transform(test_df['author'])

In [50]:
from tensorflow.keras.utils import to_categorical


In [52]:
y_train_onehot = to_categorical(y_train)
y_val_onehot = to_categorical(y_val)
y_test_onehot = to_categorical(y_test)

In [53]:
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))

In [77]:
# 6. Crear el modelo
def create_model(vocab_size, embedding_dim, embedding_matrix):
    model = models.Sequential([
        # Capa de Embedding
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=False
        ),
        
        # Capas de procesamiento bidireccional
        layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
        layers.GlobalMaxPooling1D(),
        
        # Capas densas
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.BatchNormalization(),
        
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.4),
        layers.BatchNormalization(),
        
        # Capa de salida
        layers.Dense(3, activation='softmax')
    ])
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [78]:
vocab_size = len(word2vec_model.wv.key_to_index) + 1  # +1 para el token de padding
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, i in word2vec_model.wv.key_to_index.items():
    embedding_matrix[i] = word2vec_model.wv[word]

In [79]:
embedding_matrix

array([[ 4.91890199e-02,  5.21713018e-01, -1.65519714e-02, ...,
        -3.39471728e-01, -2.32708510e-02,  2.90578604e-01],
       [ 4.55714166e-02,  5.29075682e-01, -4.92591932e-02, ...,
        -3.48181039e-01,  1.85044250e-03,  2.48613954e-01],
       [ 5.07456213e-02,  5.03904164e-01, -4.64888029e-02, ...,
        -3.29032212e-01,  8.04999610e-04,  2.47850284e-01],
       ...,
       [ 3.24375462e-03,  1.43650617e-03,  8.10264726e-04, ...,
         1.30300445e-03,  2.12241639e-03, -1.63700184e-04],
       [-1.00655516e-03, -2.20268336e-03,  1.28425634e-03, ...,
         2.25621625e-03, -4.39817901e-04, -2.68187234e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [80]:
print("\nPesos de las clases:")
for class_idx, weight in class_weight_dict.items():
    print(f"Clase {class_idx}: {weight}")



Pesos de las clases:
Clase 0: 0.7478218100056211
Clase 1: 1.0727749218828748
Clase 2: 1.368698559670782


In [81]:
model = create_model(vocab_size, embedding_size, embedding_matrix)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)


In [82]:
print("\nEntrenando el modelo...")
history = model.fit(
    X_train_padded, 
    y_train,
    validation_data=(X_val_padded, y_val),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
    verbose=1
)


Entrenando el modelo...
Epoch 1/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 86ms/step - accuracy: 0.3274 - loss: 1.4371 - val_accuracy: 0.3111 - val_loss: 1.1197
Epoch 2/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 81ms/step - accuracy: 0.3285 - loss: 1.1699 - val_accuracy: 0.2434 - val_loss: 1.1163
Epoch 3/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 86ms/step - accuracy: 0.3520 - loss: 1.1113 - val_accuracy: 0.2434 - val_loss: 1.2703
Epoch 4/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 89ms/step - accuracy: 0.4225 - loss: 1.0476 - val_accuracy: 0.2325 - val_loss: 1.1012
Epoch 5/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 82ms/step - accuracy: 0.4223 - loss: 1.0379 - val_accuracy: 0.4455 - val_loss: 1.1708
Epoch 6/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 82ms/step - accuracy: 0.4487 - loss: 1.0170 - val_accuracy: 0.3111 - val_loss

In [83]:
print("\nEvaluando en el conjunto de prueba...")
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"\nPrecisión en el conjunto de prueba: {test_accuracy:.4f}")


Evaluando en el conjunto de prueba...
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.4331 - loss: 0.9867

Precisión en el conjunto de prueba: 0.4172


In [84]:
# Hacer predicciones
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

# Mostrar reporte de clasificación
from sklearn.metrics import classification_report
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred_classes, 
                          target_names=label_encoder.classes_))

[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step

Reporte de clasificación:
                    precision    recall  f1-score   support

   alexandre dumas       0.56      0.12      0.20      2260
   charles dickens       0.91      0.48      0.63      1575
fyodor dostoyevsky       0.29      0.88      0.44      1234

          accuracy                           0.42      5069
         macro avg       0.59      0.49      0.42      5069
      weighted avg       0.60      0.42      0.39      5069

