In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle

2023-11-30 22:22:57.815091: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-30 22:22:57.838681: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-30 22:22:57.952184: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-30 22:22:57.953098: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Turn on on-demand GPU memory allocation when a GPU is visible; otherwise just say so.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPU is detected")
else:
    try:
        # Memory growth has to be configured the same way on every GPU.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is requested after the GPUs were already initialized.
        print(err)

No GPU is detected


2023-11-30 22:23:01.160387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [3]:
from datasets import load_dataset

# Load the Yelp full-review dataset (served from the local Hugging Face cache when present).
dataset = load_dataset(path="yelp_review_full")

Found cached dataset yelp_review_full (/home/sriteja/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Show the available splits together with their feature names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [5]:
# Pick the two splits, then pull the review text and label columns out of each one.
train_dataset, test_dataset = dataset['train'], dataset['test']
train_data, train_labels = train_dataset['text'], train_dataset['label']
test_data, test_labels = test_dataset['text'], test_dataset['label']

In [6]:
# print(train_data[0])
# print(train_labels[0])


In [7]:
# Longest training review, measured in whitespace-separated words.
# NOTE(review): the tokenizer built later in this notebook is character-level, so this
# word count is not the length of the encoded sequences — confirm which one is intended.
max_len = max(len(review.split()) for review in train_data)
print(max_len)

1052


In [8]:
# Hold out 10% of the training split for validation (fixed seed for reproducibility).
# The inputs are read from `train_dataset` instead of `train_data` / `train_labels`:
# those names are overwritten by this very cell, so splitting them directly would carve
# another 10% out of an already-reduced training set every time the cell is re-run.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_dataset['text'],
    train_dataset['label'],
    test_size=0.1,
    random_state=42,
)

In [9]:
# Character-level vocabulary, fitted on the training texts only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_data)

# Encode every split with the fitted tokenizer (train, then validation, then test).
sequences_train, sequences_val, sequences_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_data, val_data, test_data)
)

In [10]:
# Fixed input length fed to the network.
# NOTE(review): the sequences are character-level, while the only length measured earlier
# in this notebook is a word count — confirm that 1100 characters is the intended limit.
max_sequence_length = 1100

# `padding` / `truncating` are written out at their Keras defaults ('pre'): shorter
# sequences are zero-padded at the front, longer ones keep only their trailing tokens.
X_train, X_val, X_test = (
    pad_sequences(seqs, maxlen=max_sequence_length, padding='pre', truncating='pre')
    for seqs in (sequences_train, sequences_val, sequences_test)
)

In [11]:
# The class count is the number of distinct labels present in the training split.
num_classes = len(set(train_labels))

# One-hot encode the labels of every split against that class count.
y_train, y_val, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (train_labels, val_labels, test_labels)
)

In [12]:
import wandb
from wandb.keras import WandbCallback
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Model
import tensorflow as tf

# Authenticate this session with Weights & Biases before any run or sweep is created.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msilvertongue1729[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [51]:
# Epoch count and mini-batch size used by both the sweep trials and the final fit.
num_epochs, batch_size = 1, 128

In [None]:
# Random-search sweep over the CNN hyperparameters.
# The metric name has to match the key that is actually logged: Keras reports validation
# accuracy as `val_accuracy` (underscore), which is what WandbCallback forwards to W&B.
# The previous value 'val accuracy' (with a space) never matched any logged metric.
sweep_config = {
    'method': 'random',
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize',
    },
    'parameters': {
        'embedding_dim': {'values': [64, 128, 256]},
        'num_filters': {'values': [128, 256]},
        'kernel_size': {'values': [5, 7]},
        'pool_size': {'values': [4]},
        'dropout_rate': {'values': [0.3, 0.5]},
    },
    'name': 'cnn_yelp_sweep',
}

In [None]:
def train():
    """Run a single sweep trial: build, compile and fit the character-level CNN.

    The hyperparameters `embedding_dim`, `num_filters`, `kernel_size`, `pool_size`
    and `dropout_rate` are read from the config of the W&B run started here.
    Everything else comes from notebook globals: the arrays `X_train`, `y_train`,
    `X_val`, `y_val`, plus `tokenizer`, `max_sequence_length`, `num_classes`,
    `num_epochs` and `batch_size`.

    Returns nothing; training metrics are reported through `WandbCallback`.
    """
    # wandb.agent invokes this function once per trial, so release the Keras global
    # state left behind by the previous trial's model before building a new one.
    tf.keras.backend.clear_session()

    # Initialize a new wandb run; the context manager finishes it on exit.
    with wandb.init() as run:
        # Hyperparameters chosen by the sweep for this trial.
        config = run.config

        # One embedding row per known character, plus one for index 0.
        vocab_size = len(tokenizer.word_index) + 1

        # Define the model: embedding -> Conv1D -> max-pool -> flatten -> dropout -> softmax.
        input_layer = Input(shape=(max_sequence_length,))
        embedding_layer = Embedding(input_dim=vocab_size, output_dim=config.embedding_dim)(input_layer)
        conv1d_layer = Conv1D(filters=config.num_filters, kernel_size=config.kernel_size, activation='relu')(embedding_layer)
        maxpool_layer = MaxPooling1D(pool_size=config.pool_size)(conv1d_layer)
        flatten_layer = Flatten()(maxpool_layer)
        dropout_layer = tf.keras.layers.Dropout(config.dropout_rate)(flatten_layer)
        output_layer = Dense(num_classes, activation='softmax')(dropout_layer)
        model = Model(inputs=input_layer, outputs=output_layer)

        # Compile the model (targets are one-hot, hence categorical cross-entropy).
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        # Train the model; WandbCallback logs the per-epoch metrics to the run.
        model.fit(X_train, y_train, validation_data=(X_val, y_val),
                  epochs=num_epochs, batch_size=batch_size,
                  callbacks=[WandbCallback()])

In [None]:
# Register the sweep under the project, then let an agent execute 12 trials of `train`.
sweep_id = wandb.sweep(sweep=sweep_config, project="SMAI_project_cnn_yelp")
wandb.agent(sweep_id, function=train, count=12)

In [None]:
import sys
# Raises SystemExit, which halts execution at this cell.
# NOTE(review): presumably a deliberate guard so that "Run All" stops after the sweep and
# does not continue into the final-model cells below — confirm, and consider removing it.
sys.exit()

In [43]:
# Hyperparameters of the final (non-sweep) model.
embedding_dim = num_filters = 128  # character-embedding width and Conv1D filter count
kernel_size, pool_size = 5, 4      # convolution kernel width and max-pooling window
dropout_rate = 0.5                 # dropout applied before the output layer

In [48]:
# Final model graph: embedding -> Conv1D -> max-pool -> flatten -> dropout -> softmax.
vocab_size = len(tokenizer.word_index) + 1  # one row per known character, plus index 0

input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
)(input_layer)
conv1d_layer = Conv1D(
    filters=num_filters,
    kernel_size=kernel_size,
    activation='relu',
)(embedding_layer)
maxpool_layer = MaxPooling1D(pool_size=pool_size)(conv1d_layer)
flatten_layer = Flatten()(maxpool_layer)
dropout_layer = tf.keras.layers.Dropout(rate=dropout_rate)(flatten_layer)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer)

In [49]:
# Wrap the layer graph (input tensor to softmax output) into a trainable Keras model.
model = Model(inputs=input_layer, outputs=output_layer)

In [50]:
# Adam optimiser with categorical cross-entropy (the targets are one-hot), tracking accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [52]:
# Fit the final model, validating on the held-out 10% after every epoch.
# NOTE(review): the saved output of this cell reports 3 epochs although `num_epochs` is
# set to 1 in this notebook — the output stems from an earlier kernel state.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=num_epochs,
    batch_size=batch_size,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
 204/4571 [>.............................] - ETA: 17:08 - loss: 1.0415 - accuracy: 0.5522

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model on the test set (loss and accuracy against the one-hot targets).
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test accuracy: ", test_accuracy)

# Turn the softmax outputs and the one-hot targets into class indices for the reports.
# `y_test` is deliberately left untouched: overwriting it with class indices would make
# this cell fail on a second run (evaluate expects one-hot targets, argmax needs 2-D).
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Print the per-class precision/recall/F1 report and the confusion matrix.
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
