In [1]:
import pandas as pd
import tensorflow as tf
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [5]:
# Read the dataset from disk (replace the path with the location of your own CSV file)
df = pd.read_csv("./balenced_data.csv")

# Raw documents and their source names, pulled out of the frame as arrays
texts, labels = df['text'].values, df['Source'].values

In [6]:
# Convert the class names in the 'Source' column to integer ids.
# The encoder is fitted on the untouched `df['Source']` column rather than on
# `labels` itself: re-encoding `labels` in place made this cell non-idempotent,
# because a second run would fit the encoder on the integer ids and
# `label_encoder.classes_` / `inverse_transform` would then yield integers
# instead of the original class names.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Source'].values)

In [7]:
# Hold out 20% of the examples for evaluation; the fixed seed keeps the split repeatable
texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build the vocabulary from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_train)

# Fixed sequence length (alternative: the length of the longest training sequence)
max_seq_length = 100

# Map every text to its token ids, then pad/truncate each one to `max_seq_length`
sequences_train = pad_sequences(
    tokenizer.texts_to_sequences(texts_train),
    maxlen=max_seq_length,
)
sequences_test = pad_sequences(
    tokenizer.texts_to_sequences(texts_test),
    maxlen=max_seq_length,
)

In [5]:
# Wrap the padded sequences and their labels as tf.data pipelines.

# Training split: shuffle with a buffer covering the whole set, then batch and prefetch
train_dataset = (
    tf.data.Dataset.from_tensor_slices((sequences_train, labels_train))
    .shuffle(len(sequences_train))
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

# Held-out split: no shuffling, only batch and prefetch
test_dataset = (
    tf.data.Dataset.from_tensor_slices((sequences_test, labels_test))
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

In [6]:
# Model hyper-parameters
embedding_dim = 50  # previously tried: 100
vocab_size = len(tokenizer.word_index) + 1  # one slot more than the number of known words

# Embedding -> LSTM -> dense hidden layer with dropout -> softmax over the encoded classes
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_seq_length),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

In [7]:
# Compile the model: Adam optimizer, sparse categorical cross-entropy loss
# (the labels are integer ids), accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# Fit for 10 epochs, passing the held-out split as validation data
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a7659c2bc0>

In [2]:
# Define the file path for saving / loading the model.
# This single relative path is read by both the `model.save(...)` cell and the
# `load_model(...)` cell, so changing it here redirects both.
model_path = './models/nn.h5'

In [None]:
# Write the current `model` to the location stored in `model_path`
model.save(filepath=model_path)

print("Model saved successfully.")

In [9]:
# Read the model stored at `model_path` back in; this rebinds `model` to the loaded copy
model = load_model(filepath=model_path)

print("Model loaded successfully.")

Model loaded successfully.


In [19]:
# Pick one held-out example uniformly at random
random_index = random.randrange(len(sequences_test))
random_text = texts_test[random_index]
true_label = labels_test[random_index]

# Run the raw text through the same tokenize-and-pad steps used for the training data
random_sequence = pad_sequences(
    tokenizer.texts_to_sequences([random_text]),
    maxlen=max_seq_length,
)

# Score the example and map the highest-scoring class id back to its name
prediction = model.predict(random_sequence)
predicted_label = label_encoder.inverse_transform(prediction.argmax(axis=1))

# Show the first 100 characters of the text next to the true and predicted class names
print("Text:", random_text[:100])
print("True Label:", label_encoder.inverse_transform([true_label])[0])
print("Predicted Label:", predicted_label[0])

Text: You guys make me laugh. What do you think? All you need is my number? OK, it's 06 12 23 23 41. You'l
True Label: movie
Predicted Label: blog
