In [8]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# %load nn.py
"""nn.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1nncHMaUAaheWr5ATvX32QOENaWY_YMJ0
"""

from typing import Tuple, List, Dict
import tensorflow as tf
from tensorflow.keras import layers, models

def create_toy_rnn(input_shape: tuple, n_outputs: int) -> Tuple[tf.keras.models.Model, Dict]:
    """Creates a recurrent neural network for a toy problem.

    The network keeps the time axis all the way to the output, so it emits
    ``n_outputs`` values for every timestep of the input sequence.

    Args:
        input_shape: shape of a single example, passed to the first layer
            (the accompanying test passes ``(n_timesteps, n_features)``).
        n_outputs: number of linear output units per timestep.

    Returns:
        A tuple of the compiled model and a dict of keyword arguments
        ('batch_size', 'epochs') intended for ``model.fit``.
    """
    model = models.Sequential([
        layers.SimpleRNN(32, activation='relu', return_sequences=True, input_shape=input_shape),
        # return_sequences=True here as well: the toy targets hold one value
        # per timestep, so the time axis must not be collapsed before Dense.
        layers.SimpleRNN(32, activation='relu', return_sequences=True),
        # Dense acts on the last axis only, giving (batch, timesteps, n_outputs).
        layers.Dense(n_outputs, activation='linear')
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model, {'batch_size': 32, 'epochs': 20}

def create_mnist_cnn(input_shape: tuple, n_outputs: int) -> Tuple[tf.keras.models.Model, Dict]:
    """Creates a convolutional neural network for digit classification.

    Args:
        input_shape: shape of a single image, passed to the first Conv2D layer.
        n_outputs: number of classes, i.e. the width of the one-hot label
            vectors and of the softmax output.

    Returns:
        A tuple of the compiled model and a dict of keyword arguments
        ('batch_size', 'epochs') intended for ``model.fit``.
    """
    model = models.Sequential([
        layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(64, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(n_outputs, activation='softmax')
    ])
    # Labels arrive one-hot encoded (one column per class), so the dense
    # categorical loss is required; the sparse variant expects integer class ids.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model, {'batch_size': 64, 'epochs': 10}

def create_youtube_comment_rnn(vocabulary: List[str], n_outputs: int) -> Tuple[tf.keras.models.Model, Dict]:
    """Builds and compiles a stacked-LSTM spam classifier over token indices.

    Args:
        vocabulary: the known tokens; its length sizes the embedding table.
        n_outputs: number of sigmoid units in the final layer.

    Returns:
        The compiled model together with a dict holding 'batch_size' and
        'epochs' values.
    """
    fit_settings = {'batch_size': 32, 'epochs': 15}

    model = models.Sequential()
    # mask_zero=True asks the embedding to mask index 0 for downstream layers.
    model.add(layers.Embedding(input_dim=len(vocabulary), output_dim=128, mask_zero=True))
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(layers.LSTM(64, return_sequences=True))
    model.add(layers.LSTM(64))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(n_outputs, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model, fit_settings

def create_youtube_comment_cnn(vocabulary: List[str], n_outputs: int) -> Tuple[tf.keras.models.Model, Dict]:
    """Creates a convolutional neural network for spam classification.

    The model accepts token-index sequences of any length; the sequence axis
    is removed by global max pooling before the dense layers.

    Args:
        vocabulary: the known tokens; its length sizes the embedding table.
        n_outputs: number of sigmoid units in the final layer.

    Returns:
        A tuple of the compiled model and a dict of keyword arguments
        ('batch_size', 'epochs') intended for ``model.fit``.
    """
    vocab_size = len(vocabulary)
    model = models.Sequential([
        # Variable-length input: the former hard-coded input_length=100 did
        # not match the 200-token sequences this model is trained on, and
        # input_length is a deprecated Embedding argument.
        layers.Input(shape=(None,)),
        layers.Embedding(input_dim=vocab_size, output_dim=128),
        layers.Conv1D(64, 5, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_outputs, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model, {'batch_size': 32, 'epochs': 15}



In [10]:
# Read the module source explicitly as UTF-8 (the file declares that coding)
# instead of depending on the platform's default text encoding.
with open('nn.py', 'r', encoding='utf-8') as file:
    code = file.read()

In [11]:
# Show the text read from nn.py above.
print(code)

# -*- coding: utf-8 -*-
"""nn-1.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1P7dIWHZ4MR65nH0xIZmt55yxQq9umBam
"""

"""
The main code for the recurrent and convolutional networks assignment.
See README.md for details.
"""
from typing import Tuple, List, Dict

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Conv2D, MaxPool2D, Flatten, Embedding, Conv1D, GlobalMaxPool1D, Dropout, BatchNormalization, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

def create_toy_rnn(input_shape: tuple, n_outputs: int) \
        -> Tuple[tensorflow.keras.models.Model, Dict]:
    """Creates a recurrent neural network for a toy problem."""
    model = Sequential([
        Input(shape=input_shape),
        LSTM(64, return_sequences=True),
        BatchNormalization(),
        Dropout(0.2),
        LSTM(32, return_sequences=True),
        BatchNorma

In [12]:
# %load test_nn.py
"""test_nn.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/17ZUKbqFIq3Z_2rGyAuV4yX3z8UaOCSOS
"""

import os
import numpy as np
import h5py
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Load MNIST dataset
(train_input, train_output), (test_input, test_output) = mnist.load_data()

# Append a trailing axis to the images to make them compatible with CNN input shape
train_input = np.expand_dims(train_input, axis=-1)
test_input = np.expand_dims(test_input, axis=-1)

# Convert labels to one-hot encoded float32 vectors
train_output = to_categorical(train_output).astype(np.float32)
test_output = to_categorical(test_output).astype(np.float32)

# Create 'data' directory if it doesn't exist; exist_ok=True makes this a
# single idempotent call instead of a separate check followed by a create.
os.makedirs('data', exist_ok=True)  # For Assignment data is provided in hdf5 format

# Save preprocessed data to HDF5 file
with h5py.File('data/mnist.hdf5', 'w') as f:
    # Include only every 100th training/testing example to limit dataset size
    train = f.create_group("train")
    train.create_dataset("input", compression="gzip", data=train_input[::100])
    train.create_dataset("output", compression="gzip", data=train_output[::100])
    test = f.create_group("test")
    test.create_dataset("input", compression="gzip", data=test_input[::100])
    test.create_dataset("output", compression="gzip", data=test_output[::100])

# Ad-hoc check for a /mnt/data directory.
# NOTE(review): nothing else in the visible code reads from /mnt/data, and the
# recorded output of this cell is False -- confirm whether this check is still needed.
import os
print(os.path.exists("/mnt/data/"))  # Should print True if the directory exists

import json
import re
import h5py
import numpy as np
import pandas as pd

# Download the .csv files from
#     https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection
# and place them under /content/ (see the file paths built below).
import pandas as pd

# Per-video file name stems; they also become the HDF5 group names below
names = ["1-Psy", "2-KatyPerry", "3-LMFAO", "4-Eminem", "5-Shakira"]

# One CSV per video, expected under /content
file_paths = ["/content/Youtube0{0}.csv".format(name) for name in names]

# Read every CSV into its own DataFrame
dfs = list(map(pd.read_csv, file_paths))

#print("Files loaded successfully!")

# A token is a run of digits, a run of non-digit word characters,
# or any single remaining non-whitespace character
tokenize = re.compile(r"\d+|[^\d\W]+|\S").findall
dfs_tokenized = [list(map(tokenize, df["CONTENT"])) for df in dfs]

# Vocabulary: the empty string at index 0, then all distinct tokens in sorted order
index_to_token = [''] + sorted({token
                                for comments in dfs_tokenized
                                for tokens in comments
                                for token in tokens})

token_to_index = {token: index for index, token in enumerate(index_to_token)}

# Length of the longest tokenized comment across all files
max_tokens = max(map(len, (tokens
                           for comments in dfs_tokenized
                           for tokens in comments)))

with h5py.File('data/youtube-comments.hdf5', 'w') as f:
    f.attrs["vocabulary"] = json.dumps(index_to_token)
    for name, df, comments in zip(names, dfs, dfs_tokenized):
        # One row per comment; positions past the end of a comment stay 0,
        # which is the index of the empty string in the vocabulary
        matrix_in = np.zeros(shape=(len(comments), max_tokens))
        for row, tokens in enumerate(comments):
            matrix_in[row, :len(tokens)] = [token_to_index[token] for token in tokens]
        # Labels as a single column
        matrix_out = df["CLASS"].values.reshape((-1, 1))
        group = f.create_group(name)
        group.create_dataset("input", compression="gzip", data=matrix_in)
        group.create_dataset("output", compression="gzip", data=matrix_out)

import os
import json

import h5py
import numpy as np
import pytest
import tensorflow

import nn


@pytest.fixture(autouse=True)
def set_seeds():
    """Seeds the random number generators and limits TensorFlow threading.

    Runs automatically before every test (autouse) so that both the randomly
    generated test data and the model training start from the same state.
    """
    import random  # local import: only this fixture needs the module

    os.environ["TF_DETERMINISTIC_OPS"] = "1"
    # test_toy_rnn draws its inputs with np.random, which the TensorFlow seed
    # does not control, so NumPy must be seeded separately.
    np.random.seed(42)
    # NOTE(review): some Keras versions appear to derive default initializer
    # seeds from Python's `random`; seeding it as well is harmless -- confirm.
    random.seed(42)
    tensorflow.random.set_seed(42)
    # Single-threaded op execution on both levels of TensorFlow parallelism.
    tensorflow.config.threading.set_intra_op_parallelism_threads(1)
    tensorflow.config.threading.set_inter_op_parallelism_threads(1)


def test_toy_rnn(capsys):
    """Trains the toy RNN on random integer sequences and requires RMSE < 2.

    The target at timestep j is the first feature from three steps earlier
    (0 for the first three steps) minus the second feature at step j.
    """
    n_train, n_test = 20, 10
    n_timesteps, n_features = 20, 2

    # random integer inputs in [1, 10] for the train and test splits
    train_in = np.random.randint(1, 11, (n_train, n_timesteps, n_features))
    test_in = np.random.randint(1, 11, (n_test, n_timesteps, n_features))

    # targets are a deterministic function of the inputs
    def out(matrix_in):
        matrix_out = np.zeros(shape=matrix_in.shape[:-1] + (1,))
        for i, example in enumerate(matrix_in):
            for j in range(len(example)):
                lagged_first = example[j - 3][0] if j >= 3 else 0.
                matrix_out[i, j] = lagged_first - example[j][1]
        return matrix_out

    train_out, test_out = out(train_in), out(test_in)

    # ask the module under test for a model and its fit arguments
    input_shape = train_in.shape[1:]
    n_outputs = train_out.shape[-1]
    model, kwargs = nn.create_toy_rnn(input_shape, n_outputs)

    model_layers = layers(model)

    # a recurrent layer is required, convolutional layers are forbidden
    assert any(map(is_recurrent, model_layers))
    assert not any(map(is_convolution, model_layers))

    # regression setup: a "mean ..." loss and a linear output layer
    assert "mean" in loss_name(model)
    assert output_activation(model) == tensorflow.keras.activations.linear

    # the test fixes the data, the epoch count and the validation split
    kwargs.update(dict(x=train_in, y=train_out,
                       epochs=20, validation_data=(test_in, test_out)))

    # fit with the model-supplied arguments plus the overrides above
    model.fit(**kwargs)

    # the error on the held-out sequences must be small enough
    rmse = root_mean_squared_error(model.predict(test_in), test_out)
    with capsys.disabled():
        print("\n{:.1f} RMSE for RNN on toy problem".format(rmse))
    assert rmse < 2


def test_image_cnn(capsys):
    """Trains the MNIST CNN on data/mnist.hdf5 and requires accuracy > 0.8."""
    with h5py.File("data/mnist.hdf5", 'r') as hdf5_file:
        train_group = hdf5_file["train"]
        train_out = np.array(train_group["output"])
        train_in = np.array(train_group["input"])
        test_group = hdf5_file["test"]
        test_out = np.array(test_group["output"])
        test_in = np.array(test_group["input"])

    # ask the module under test for a model and its fit arguments
    input_shape = train_in.shape[1:]
    _, n_outputs = train_out.shape
    model, kwargs = nn.create_mnist_cnn(input_shape, n_outputs)

    model_layers = layers(model)

    # a convolutional layer is required, recurrent layers are forbidden
    assert any(map(is_convolution, model_layers))
    assert not any(map(is_recurrent, model_layers))

    # multi-class setup: a "categorical" loss and a softmax output layer
    assert "categorical" in loss_name(model)
    assert output_activation(model) == tensorflow.keras.activations.softmax

    # the test fixes the data, the epoch count and the validation split
    kwargs.update(dict(x=train_in, y=train_out,
                       epochs=10, validation_data=(test_in, test_out)))

    # fit with the model-supplied arguments plus the overrides above
    model.fit(**kwargs)

    # the accuracy on the test split must be high enough
    accuracy = multi_class_accuracy(model.predict(test_in), test_out)
    with capsys.disabled():
        print("\n{:.1%} accuracy for CNN on MNIST sample".format(accuracy))
    assert accuracy > 0.8


def test_text_rnn(capsys):
    """Trains the comment RNN on "1-Psy", tests on "5-Shakira", requires accuracy > 0.8."""
    with h5py.File("data/youtube-comments.hdf5", 'r') as hdf5_file:
        vocabulary = json.loads(hdf5_file.attrs["vocabulary"])
        # only the first 200 token positions of every comment are used
        train_group = hdf5_file["1-Psy"]
        train_in = np.array(train_group["input"])[:, :200]
        train_out = np.array(train_group["output"])
        test_group = hdf5_file["5-Shakira"]
        test_in = np.array(test_group["input"])[:, :200]
        test_out = np.array(test_group["output"])

    # ask the module under test for a model and its fit arguments
    model, kwargs = nn.create_youtube_comment_rnn(vocabulary=vocabulary,
                                                  n_outputs=1)

    model_layers = layers(model)

    # a recurrent layer is required, convolutional layers are forbidden
    assert any(map(is_recurrent, model_layers))
    assert not any(map(is_convolution, model_layers))

    # binary setup: a hinge or cross-entropy loss and a sigmoid output layer
    name_of_loss = loss_name(model)
    assert "hinge" in name_of_loss or "crossentropy" in name_of_loss
    assert output_activation(model) == tensorflow.keras.activations.sigmoid

    # the test fixes the data, the epoch count and the validation split
    kwargs.update(dict(x=train_in, y=train_out,
                       epochs=10, validation_data=(test_in, test_out)))

    # fit with the model-supplied arguments plus the overrides above
    model.fit(**kwargs)

    # the accuracy on the held-out video must be high enough
    accuracy = binary_accuracy(model.predict(test_in), test_out)
    with capsys.disabled():
        print("\n{:.1%} accuracy for RNN on Youtube comments".format(accuracy))
    assert accuracy > 0.8


def test_text_cnn(capsys):
    """Trains the comment CNN on "1-Psy", tests on "5-Shakira", requires accuracy > 0.8."""
    # Same data loading as in test_text_rnn
    with h5py.File("data/youtube-comments.hdf5", 'r') as hdf5_file:
        vocabulary = json.loads(hdf5_file.attrs["vocabulary"])
        # only the first 200 token positions of every comment are used
        train_group = hdf5_file["1-Psy"]
        train_in = np.array(train_group["input"])[:, :200]
        train_out = np.array(train_group["output"])
        test_group = hdf5_file["5-Shakira"]
        test_in = np.array(test_group["input"])[:, :200]
        test_out = np.array(test_group["output"])

    # ask the module under test for a model and its fit arguments
    model, kwargs = nn.create_youtube_comment_cnn(vocabulary=vocabulary,
                                                  n_outputs=1)

    model_layers = layers(model)

    # a convolutional layer is required, recurrent layers are forbidden
    assert any(map(is_convolution, model_layers))
    assert not any(map(is_recurrent, model_layers))

    # binary setup: a hinge or cross-entropy loss and a sigmoid output layer
    name_of_loss = loss_name(model)
    assert "hinge" in name_of_loss or "crossentropy" in name_of_loss
    assert output_activation(model) == tensorflow.keras.activations.sigmoid

    # the test fixes the data, the epoch count and the validation split
    kwargs.update(dict(x=train_in, y=train_out,
                       epochs=10, validation_data=(test_in, test_out)))

    # fit with the model-supplied arguments plus the overrides above
    model.fit(**kwargs)

    # the accuracy on the held-out video must be high enough
    accuracy = binary_accuracy(model.predict(test_in), test_out)
    with capsys.disabled():
        print("\n{:.1%} accuracy for CNN on Youtube comments".format(accuracy))
    assert accuracy > 0.8


def layers(model: tensorflow.keras.models.Model):
    """Returns the model's layers, with each Wrapper replaced by the layer it wraps."""
    wrapper_type = tensorflow.keras.layers.Wrapper
    unwrapped = []
    for candidate in model.layers:
        if isinstance(candidate, wrapper_type):
            unwrapped.append(candidate.layer)
        else:
            unwrapped.append(candidate)
    return unwrapped


def is_convolution(layer: tensorflow.keras.layers.Layer):
    """Tells whether the layer's class name begins with 'Conv'."""
    class_name = layer.__class__.__name__
    return class_name.startswith('Conv')


def is_recurrent(layer: tensorflow.keras.layers.Layer):
    """Tells whether the layer is an instance of the Keras RNN base class."""
    rnn_base = tensorflow.keras.layers.RNN
    return isinstance(layer, rnn_base)


def loss_name(model):
    """Returns the lower-cased name of the loss the model was compiled with.

    Supports the three ways a loss can be given: a string identifier (looked
    up in ``tensorflow.keras.losses``), a loss function (its ``__name__``),
    or a loss object without ``__name__`` (its ``name`` attribute, falling
    back to its class name).

    :param model: an object whose ``loss`` attribute holds the loss
    :return: the loss name in lower case
    """
    loss = model.loss
    if isinstance(loss, str):
        # resolve the identifier to the callable of that name
        loss = getattr(tensorflow.keras.losses, loss)
    name = getattr(loss, "__name__", None)
    if name is None:
        # instances have no __name__; previously this raised AttributeError
        name = getattr(loss, "name", None) or type(loss).__name__
    return name.lower()


def output_activation(model: tensorflow.keras.models.Model):
    """Returns the ``activation`` attribute of the model's last layer."""
    final_layer = model.layers[-1]
    return final_layer.activation


def root_mean_squared_error(system: np.ndarray, human: np.ndarray):
    """Returns the square root of the mean squared difference of the two arrays."""
    squared_error = np.square(system - human)
    return squared_error.mean() ** 0.5


def multi_class_accuracy(system: np.ndarray, human: np.ndarray):
    """Returns the fraction of rows whose arg-max column agrees in both arrays."""
    predicted_classes = np.argmax(system, axis=1)
    expected_classes = np.argmax(human, axis=1)
    agreement = np.equal(predicted_classes, expected_classes)
    return np.mean(agreement)


def binary_accuracy(system: np.ndarray, human: np.ndarray):
    """Returns the fraction of rounded system values that equal the human labels."""
    rounded = np.round(system)
    agreement = np.equal(rounded, human)
    return np.mean(agreement)



False


In [13]:
# Run pytest from the notebook's working directory via the shell.
!pytest

platform linux -- Python 3.11.11, pytest-8.3.4, pluggy-1.5.0
rootdir: /content
plugins: anyio-3.7.1, langsmith-0.3.8, typeguard-4.4.2
collected 4 items                                                                                  [0m

test_nn.py 
1.3 RMSE for RNN on toy problem
[32m.[0m
88.0% accuracy for CNN on MNIST sample
[32m.[0m
86.5% accuracy for RNN on Youtube comments
[32m.[0m
85.7% accuracy for CNN on Youtube comments
[32m.[0m[32m                                                                              [100%][0m

