In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import torch
import pickle

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [1]:
from datasets import load_dataset
import torch

# Fetch the DBpedia-14 dataset through the `datasets` library.
dataset = load_dataset(path="dbpedia_14")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pickle
# One-off step: uncomment to cache the downloaded dataset to dbpedia.pkl.
# with open('dbpedia.pkl', 'wb') as file:
#     pickle.dump(dataset, file)

In [4]:
# Load the cached copy of the dataset. On a fresh checkout the cache file does
# not exist yet, so instead of failing, create it from the `dataset` object
# that was loaded earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only use
# this with a cache file that you created yourself.
try:
    with open('dbpedia.pkl', 'rb') as file:
        dataset = pickle.load(file)
except FileNotFoundError:
    with open('dbpedia.pkl', 'wb') as file:
        pickle.dump(dataset, file)

In [5]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 70000
    })
})


In [6]:
# Handles for the two splits stored in the dataset.
train_dataset, test_dataset = dataset['train'], dataset['test']

# Pull the title, content and label columns out of each split.
train_data_title, train_data_content, train_labels = (
    train_dataset[column] for column in ('title', 'content', 'label')
)
test_data_title, test_data_content, test_labels = (
    test_dataset[column] for column in ('title', 'content', 'label')
)



In [6]:
# print(train_data_title[0])
# print(train_data_content[0])
# print(train_labels[0])


In [7]:
# Build one input string per example: the title, the literal marker "mid",
# then the content, concatenated without any extra whitespace.
train_data = [
    title + "mid" + content
    for title, content in zip(train_data_title, train_data_content)
]

test_data = [
    title + "mid" + content
    for title, content in zip(test_data_title, test_data_content)
]

In [None]:
# Sanity check: first combined text, then its label on the next line.
print(train_data[0], train_labels[0], sep="\n")

In [None]:
# Sequence-length budget, measured in CHARACTERS.
# The tokenizer used further down is created with char_level=True, so every
# character becomes one token; counting whitespace-separated words here would
# make the padded length far too short and cut off most of each text.
# The 99th percentile is used instead of the absolute maximum so that a few
# very long texts do not inflate the padded array for every example.
char_lengths = [len(s) for s in train_data]
max_len = int(np.percentile(char_lengths, 99))
print(max_len)

In [None]:
# --- Input / label dimensions ---
# Padded input length: max_len plus a margin of 10.
max_sequence_length = max_len + 10
# Number of distinct label values present in the training labels.
num_classes = len(set(train_labels))

# --- Network hyperparameters ---
# Size of each character embedding vector.
embedding_dim = 128
# Number of convolution filters.
num_filters = 128
# Width of each convolution kernel.
kernel_size = 5
# Window size of the max-pooling layer.
pool_size = 4
# Dropout rate applied before the output layer.
dropout_rate = 0.5

In [None]:
# Hold out 10% of the training examples for validation, with a fixed seed so
# the split is repeatable.
# Caution: the result is assigned back to train_data / train_labels, so running
# this cell a second time splits the already-reduced training set again.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data,
    train_labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Character-level tokenizer; its vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_data)

# Encode every split with the vocabulary learned from the training texts.
sequences_train, sequences_val, sequences_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_data, val_data, test_data)
)

In [None]:
# Bring every encoded sequence to the same length (max_sequence_length) so the
# splits can be fed to the network as rectangular arrays.
X_train, X_val, X_test = (
    pad_sequences(encoded, maxlen=max_sequence_length)
    for encoded in (sequences_train, sequences_val, sequences_test)
)

In [None]:
# One-hot encode the integer labels of each split (num_classes columns).
y_train, y_val, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (train_labels, val_labels, test_labels)
)

In [None]:
# Network graph: embedding -> 1-D convolution -> max-pooling -> flatten
# -> dropout -> softmax classifier.
input_layer = Input(shape=(max_sequence_length,))

embedding_layer = Embedding(
    # One embedding row per token index, plus one extra row because the
    # tokenizer's indices start at 1.
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=embedding_dim,
)(input_layer)

conv1d_layer = Conv1D(
    filters=num_filters,
    kernel_size=kernel_size,
    activation='relu',
)(embedding_layer)

maxpool_layer = MaxPooling1D(pool_size=pool_size)(conv1d_layer)
flatten_layer = Flatten()(maxpool_layer)
dropout_layer = tf.keras.layers.Dropout(dropout_rate)(flatten_layer)

# One softmax output per class.
output_layer = Dense(num_classes, activation='softmax')(dropout_layer)

In [None]:
# Wrap the layer graph (input_layer -> output_layer) in a Keras functional Model.
model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Adam optimizer with categorical cross-entropy (the labels are one-hot
# encoded); accuracy is reported during training and evaluation.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training-loop settings passed to model.fit below.
# Number of full passes over the training set.
num_epochs = 3
# Number of examples per gradient update.
batch_size = 128

In [None]:
# Train on the training split and evaluate on the validation split after
# every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
# Evaluate the trained model on the test set, then print a per-class report
# and the confusion matrix.
from sklearn.metrics import classification_report, confusion_matrix

test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test accuracy: ", test_accuracy)

# Convert probabilities and one-hot labels to class indices under NEW names.
# `y_test` must stay one-hot: overwriting it with indices would make this cell
# fail when re-run (model.evaluate expects one-hot labels, and argmax over
# axis=1 is invalid on a 1-D array).
y_prob = model.predict(X_test)
y_pred = np.argmax(y_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))


In [None]:
# Save the full trained model to a single HDF5 file.
model.save('dbpedia_cnn.h5')

# Also save a weights-only checkpoint. `model` is a Keras model and exposes no
# PyTorch-style state_dict(), so the Keras API is used here rather than
# torch.save. The ".weights.h5" suffix is the one newer Keras releases require.
model.save_weights('dbpedia_cnn.weights.h5')