# PUNTO 3 - ARQUITECTURA 1

# Descarga de librerias necesarias

In [1]:
# !pip3 install tensorflow

## Importar librerias necesarias

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import os
from gensim.utils import simple_preprocess
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Embedding, Flatten
from keras.callbacks import EarlyStopping




## PREPROCESAMIENTO

## Carga de datos

In [3]:


books_folder = './books/'


book_files = [f for f in os.listdir(books_folder) if f.endswith('.txt')]

names = []
books_texts = []
for book_file in book_files:
    with open(os.path.join(books_folder, book_file), 'r', encoding='utf-8') as file:
        books_texts.append(file.read())
        names.append(book_file.split('_')[0])

print(f'Se encontraron {len(book_files)} libros en la carpeta /books')
processed_books = [" ".join(simple_preprocess(text)) for text in books_texts]



Se encontraron 9 libros en la carpeta /books


## Dividir textos en fragmentos con su respectiva etiqueta

In [4]:
def split_into_fragments(text, fragment_size=150):
    words = text.split()
    fragments = [words[i:i + fragment_size] for i in range(0, len(words), fragment_size)]
    return [' '.join(fragment) for fragment in fragments]


fragment_size = 150  
fragmented_books = []
fragment_labels = []

for i, book_text in enumerate(books_texts):
    fragments = split_into_fragments(book_text, fragment_size=fragment_size)
    fragmented_books.extend(fragments)
    fragment_labels.extend([names[i]] * len(fragments))  


## Revision de fragmentos por etiqueta

In [5]:
print(len(fragment_labels))
dic = {}
for i in fragment_labels:
    dic[i] = dic.get(i, 0)+ 1

print(dic)

7553
{'tolstoy': 4544, 'forster': 1520, 'vonarnin': 1489}


## Tokenizacion

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(fragmented_books)
X = tokenizer.texts_to_sequences(fragmented_books)

X = pad_sequences(X, maxlen=fragment_size)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(fragment_labels)


## Division entre test y train

In [7]:
!pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable


In [8]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


In [9]:
vocab_size = len(tokenizer.word_index) + 1  


## Creacion de la matriz de embeddings pre entrenados

## Embeddings de tamaño 300

In [10]:
from gensim.models import Word2Vec  
embedding_model_3 = Word2Vec.load('Books_300_EMF.model')
vocab_size = len(tokenizer.word_index) + 1  
embedding_dim_3 = embedding_model_3.vector_size  
embedding_matrix_3 = np.zeros((vocab_size, embedding_dim_3))

In [11]:
for word, idx in tokenizer.word_index.items():
    if word in embedding_model_3.wv:
        embedding_matrix_3[idx] = embedding_model_3.wv[word]
    else:
        embedding_matrix_3[idx] = np.zeros(embedding_dim_3)

## Embedding de tamaño 200


In [12]:
from gensim.models import Word2Vec  
embedding_model_2 = Word2Vec.load('Books_200_EMF.model')
vocab_size = len(tokenizer.word_index) + 1  
embedding_dim_2 = embedding_model_2.vector_size  
embedding_matrix_2 = np.zeros((vocab_size, embedding_dim_2))

In [13]:
for word, idx in tokenizer.word_index.items():
    if word in embedding_model_2.wv:
        embedding_matrix_2[idx] = embedding_model_2.wv[word]
    else:
        embedding_matrix_2[idx] = np.zeros(embedding_dim_2)


# Embeddings de tamaño 100

In [14]:
from gensim.models import Word2Vec  
embedding_model = Word2Vec.load('Books_100_EMF.model')
vocab_size = len(tokenizer.word_index) + 1  
embedding_dim = embedding_model.vector_size  
embedding_matrix = np.zeros((vocab_size, embedding_dim))

In [15]:
for word, idx in tokenizer.word_index.items():
    if word in embedding_model.wv:
        embedding_matrix[idx] = embedding_model.wv[word]
    else:\
        embedding_matrix[idx] = np.zeros(embedding_dim)


## Creacion de la red feed-forward: Arquitectura 1

### Embeddings tamaño 300

In [16]:
model_3 = Sequential()
model_3.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim_3, 
                    weights=[embedding_matrix_3],  
                    input_length=fragment_size,  
                    trainable=False))  

model_3.add(Flatten()) 
model_3.add(Dense(128, activation='relu')) 
model_3.add(Dense(64, activation='relu'))   
model_3.add(Dense(len(set(names)), activation='softmax'))  #

In [17]:
model_3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


### Embeddings tamaño 200

In [18]:
set(names)

{'forster', 'tolstoy', 'vonarnin'}

In [19]:
unique, counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(unique, counts))
print(class_distribution)

{0: 3635, 1: 3635, 2: 3635}


In [20]:
model_2 = Sequential()
model_2.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim_2, 
                    weights=[embedding_matrix_2],  
                    input_length=fragment_size,  
                    trainable=False))  

model_2.add(Flatten()) 
model_2.add(Dense(128, activation='relu')) 
model_2.add(Dense(64, activation='relu'))   
model_2.add(Dense(len(set(names)), activation='softmax'))  #

In [21]:
model_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


### Embeddings tamaño 100

In [22]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim, 
                    weights=[embedding_matrix],  
                    input_length=fragment_size,  
                    trainable=False))  

model.add(Flatten()) 
model.add(Dense(128, activation='relu')) 
model.add(Dense(64, activation='relu'))   
model.add(Dense(len(set(names)), activation='softmax'))  #


In [23]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


## Entrenamiento de la red

### Embeddings tamaño 100

In [24]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<keras.src.callbacks.History at 0x317776100>

### Embeddings tamaño 200

In [25]:
early_stopping_2 = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model_2.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.src.callbacks.History at 0x322088e80>

In [26]:
loss, accuracy = model_2.evaluate(X_test, y_test)



### Embeddings tamaño 300

In [27]:
early_stopping_3 = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model_3.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


<keras.src.callbacks.History at 0x358474d60>

In [28]:
loss, accuracy = model_3.evaluate(X_test, y_test)



# Resultados  y metricas

### Embeddings tamaño 100

In [29]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 150, 100)          3507800   
                                                                 
 flatten_2 (Flatten)         (None, 15000)             0         
                                                                 
 dense_6 (Dense)             (None, 128)               1920128   
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dense_8 (Dense)             (None, 3)                 195       
                                                                 
Total params: 5436379 (20.74 MB)
Trainable params: 1928579 (7.36 MB)
Non-trainable params: 3507800 (13.38 MB)
_________________________________________________________________


In [30]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  



In [31]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       909
           1       0.72      0.58      0.64       909
           2       0.77      0.88      0.82       909

    accuracy                           0.76      2727
   macro avg       0.76      0.76      0.76      2727
weighted avg       0.76      0.76      0.76      2727



### Embeddings tamaño 200

In [32]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 200)          7015600   
                                                                 
 flatten_1 (Flatten)         (None, 30000)             0         
                                                                 
 dense_3 (Dense)             (None, 128)               3840128   
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 3)                 195       
                                                                 
Total params: 10864179 (41.44 MB)
Trainable params: 3848579 (14.68 MB)
Non-trainable params: 7015600 (26.76 MB)
_________________________________________________________________


In [33]:
y_pred = model_2.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1) 



In [34]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

           0       0.76      0.90      0.82       909
           1       0.77      0.53      0.63       909
           2       0.77      0.87      0.81       909

    accuracy                           0.76      2727
   macro avg       0.77      0.76      0.76      2727
weighted avg       0.77      0.76      0.76      2727



### Embeddings tamaño 300

In [35]:
model_3.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          10523400  
                                                                 
 flatten (Flatten)           (None, 45000)             0         
                                                                 
 dense (Dense)               (None, 128)               5760128   
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 16291979 (62.15 MB)
Trainable params: 5768579 (22.01 MB)
Non-trainable params: 10523400 (40.14 MB)
_________________________________________________________________


In [36]:
y_pred = model_3.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1) 



In [37]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_classes)) 

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       909
           1       0.74      0.60      0.67       909
           2       0.81      0.88      0.85       909

    accuracy                           0.78      2727
   macro avg       0.78      0.78      0.77      2727
weighted avg       0.78      0.78      0.77      2727

