In [59]:
import functools
import io
import os
import re
import zipfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, Flatten, Embedding, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM, SimpleRNN, Reshape
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.layers import Concatenate
from keras.utils import np_utils
from keras.utils import to_categorical




## Part 3 - BBC Dataset

In [60]:
# Download the zip file from a URL.
# NOTE(review): the query string carries X-Goog-Date / X-Goog-Expires, so this
# appears to be a time-limited signed link -- replace it with a stable source
# if the request starts failing.
url = 'https://storage.googleapis.com/kaggle-data-sets/30569/38997/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230508%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230508T172143Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=4e609a86d435ed376361a76cdd011b242c015e86fc2d0a8f415fecbb53a0ee3ef71e4f0b12222c0f4952572db08e96f6588a430a1acae8924a9a484181337f2013e3a68dfc73e0e1f96ad153c62d25b36d40b33992033027fde63c06e2b1c6291fc699b1817c83fd0c5ca52745b15247b5389b6030db1a51c9183cf5cdd4e41aeb78182b7ff83ff06a494c03ba09243b87a1aeda606bb1928e772f6c420ea6c7d1fdc54714ef2b07cf849fa533c9ef797584d44a05d036877966c99a35a48052e7a46877b17943f692a74fb3e82bd7c20fcef9ccdda38481291835c58668beaf481776dca8b1bc2921bef4567b11b4618213bff468e8884647638b48accb7f7d'

# Skip the network round-trip when the CSV read by the next cell is already on
# disk, so "Restart & Run All" works offline once the data has been fetched.
if not os.path.exists('datasets/bbc-text.csv'):
    r = requests.get(url, timeout=60)
    # Surface HTTP failures directly instead of letting ZipFile choke on an
    # error page with a confusing BadZipFile.
    r.raise_for_status()

    # Extract the files to a directory; the context manager closes the archive.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall('datasets')

In [61]:
# Read the extracted BBC articles into a DataFrame and preview the first rows
dataset = pd.read_csv(filepath_or_buffer='datasets/bbc-text.csv')
dataset.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### Preprocess Text

In [None]:
@functools.lru_cache(maxsize=1)
def _nlp_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a piece of text for tokenisation.

    The input is coerced to ``str``, lower-cased, stripped of every character
    that is not an ASCII letter or whitespace, tokenised with
    ``word_tokenize``, filtered against the NLTK English stop-word list and
    lemmatised with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : Any
        Value to clean; it is converted with ``str()`` first, so non-string
        inputs (e.g. NaN) are processed as their string form.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (possibly empty).
    """
    # Convert the text to a lowercase string
    text = str(text).lower()

    # Remove unwanted characters using regular expressions.
    # Characters are deleted, not replaced by a space, so tokens separated
    # only by punctuation are merged (e.g. "don't" -> "dont").
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text into words
    words = word_tokenize(text)

    # The stop-word set and lemmatizer are loop-invariant; fetching them from
    # the cached helper avoids rebuilding them for every row during .apply().
    stop_words, lemmatizer = _nlp_resources()

    # Remove stop words, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    return ' '.join(words)


# Preprocess the article text in the dataframe
dataset['text'] = dataset['text'].apply(preprocess_text)

# Class labels only need case/whitespace normalisation. Running them through
# stop-word removal and lemmatisation could silently alter or empty a label.
dataset['category'] = dataset['category'].astype(str).str.strip().str.lower()

In [None]:
# Keep only the rows whose category is 'business' or 'sport'
dataset = dataset.loc[lambda frame: frame['category'].isin(['business', 'sport'])]

### Create fixed-length input sequences and one-hot encode the target class

In [None]:
# Define maximum number of token ids kept per article
maxlen = 20

# Tokenize input sequences (the tokenizer is configured with num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(dataset['text'])
sequences = tokenizer.texts_to_sequences(dataset['text'])
word_index = tokenizer.word_index
# Size of the full fitted vocabulary plus one; used as the Embedding input
# dimension further down.
max_words = len(tokenizer.word_index) + 1

# Pad (or truncate) every sequence to exactly `maxlen` ids
data = pad_sequences(sequences, maxlen=maxlen)

# Create one-hot encoded labels.
# The dtype is pinned because pandas >= 2.0 makes get_dummies return booleans
# by default; float32 keeps the targets numeric across pandas versions.
labels = pd.get_dummies(dataset['category'], dtype=np.float32).values

### Create training, testing, and validation sets

In [None]:
# Work out how many rows go to each split: 10% test, 10% validation and the
# remainder for training.
n_rows = data.shape[0]
test_sample = int(0.1 * n_rows)
validation_sample = int(0.1 * n_rows)
train_sample = n_rows - test_sample - validation_sample

# Cut inputs and labels at the same two boundaries, in order:
# [0, train_sample) -> train, next test_sample rows -> test, the rest -> validation.
# Rows are taken in their existing order; nothing is shuffled here.
split_points = [train_sample, train_sample + test_sample]
x_train, x_test, x_val = np.split(data, split_points)
y_train, y_test, y_val = np.split(labels, split_points)

# Define hyperparameters
embedding_dim = 16
epochs = 5

### LSTM with multiple layers for creating the generative model

In [None]:
# Define modelGenerate architecture: embedding -> two stacked LSTMs with
# dropout after each -> 2-unit softmax output layer.
modelGenerate = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    LSTM(embedding_dim, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(embedding_dim),
    Dropout(0.2),
    Dense(2, activation='softmax'),
])

# Compile modelGenerate
modelGenerate.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the training split, tracking loss/accuracy on the validation split
history = modelGenerate.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=32,
)



In [None]:
# Persist the trained model to disk.
# NOTE(review): this is an absolute, user-specific path; the cell will fail on
# machines where that directory does not exist -- consider a configurable
# directory.
modelGenerate.save(filepath="/Users/tobi/SavedModels/modelGenerate.keras")

### Use Generative Model to generate text

In [None]:
# Define the seed text: pick one random article from a single category.
# `category` was previously referenced without ever being defined in this
# notebook, which raises NameError on a fresh kernel.
category = 'sport'
category_texts = dataset.loc[dataset['category'] == category, 'text']

# randint's upper bound is exclusive, so passing the row count itself makes
# every row (including the last one) eligible and also works for one row.
start_index = np.random.randint(len(category_texts))
seed_text = category_texts.iloc[start_index]

# Tokenize the seed text
seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]

# Pad the sequence to the same fixed length used for the training inputs
seed_sequence = pad_sequences([seed_sequence], maxlen=maxlen, padding='pre', truncating='pre')

# generate a sequence from a language model
def generate_text(model, tokenizer, seed_text, n_words, max_len=20):
    """Greedily extend ``seed_text`` one word at a time.

    On every step the text produced so far is re-encoded with ``tokenizer``,
    pre-padded/pre-truncated to ``max_len`` ids, and passed to
    ``model.predict``. The arg-max of the prediction is looked up in
    ``tokenizer.index_word`` and the resulting word is appended.

    NOTE(review): this only yields meaningful text if the model's output
    positions correspond to tokenizer word ids. ``modelGenerate`` as built in
    this notebook ends in a 2-unit softmax, so its arg-max can only be 0 or 1
    -- confirm that a next-word model is what is intended.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch, verbose=0)``.
    tokenizer : object
        Fitted tokenizer exposing ``texts_to_sequences`` and ``index_word``.
    seed_text : str
        Text to start from.
    n_words : int
        Maximum number of words to append.
    max_len : int, default 20
        Length of the id window handed to the model.

    Returns
    -------
    str
        The seed text followed by the generated words (also printed).
    """
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # Re-encode the text generated so far so each prediction sees the
        # latest words rather than a frozen, global seed sequence.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        window = pad_sequences([encoded], maxlen=max_len, padding='pre', truncating='pre')

        # predict scores and take the highest-scoring index
        yhat = model.predict(window, verbose=0)
        out_word_index = int(np.argmax(yhat))

        # map predicted word index to word (direct lookup instead of a scan)
        out_word = tokenizer.index_word.get(out_word_index, "")
        if not out_word:
            # No word is mapped to this index (e.g. 0). The input would not
            # change, so further steps would repeat the same prediction.
            break

        # append to input
        in_text += " " + out_word

    print(in_text)
    return in_text

# Ask generate_text for 50 more words after the seed and keep whatever it returns
generated = generate_text(
    model=modelGenerate,
    tokenizer=tokenizer,
    seed_text=seed_text,
    n_words=50,
)

### Evaluate model 

In [None]:
# Score the model on the held-out test split (loss first, then accuracy)
test_scores = modelGenerate.evaluate(x_test, y_test)
loss, accuracy = test_scores

# Perplexity is reported as the exponential of the test loss
perplexity = np.exp(loss)

# Print each figure to four decimal places
report = [
    ("Test Loss: {:.4f}", loss),
    ("Test Accuracy: {:.4f}", accuracy),
    ("Perplexity: {:.4f}", perplexity),
]
for template, value in report:
    print(template.format(value))

In [None]:
# Plot the training and validation curves over the epochs, one figure per
# metric: loss first, then accuracy.
curves = [
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
]
for train_key, val_key, title, ylabel, legend_loc in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc=legend_loc)
    plt.show()
