## Install WandB

In [1]:
# Install the Weights & Biases client into the runtime (pip output suppressed with --quiet).
!pip install wandb --quiet
# Authenticate with W&B; this prompts for an API key and stores it in the netrc file.
!wandb login

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Dependencies

In [2]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

import pandas as pd
from wandb.keras import WandbCallback
import wandb
import numpy as np
import zipfile
import matplotlib.pyplot as plt
import sys
import os

## Extract

In [3]:
# Archives stored on the mounted Google Drive.
GLOVE_ZIP_PATH = "/content/drive/MyDrive/Datasets/Toxic Comments/glove.6B.zip"
ZIP_PATH = "/content/drive/MyDrive/Datasets/Toxic Comments/jigsaw-toxic-comment-classification-challenge.zip"

# Unpack the competition archive into /content/data.
os.makedirs("/content/data", exist_ok=True)
with zipfile.ZipFile(ZIP_PATH, "r") as zipref:
    zipref.extractall("/content/data")

# The outer archive is expected to contain further zip files; unpack those in place.
# Only real zip archives are opened: once the cell has run, the folder also holds
# the extracted files, and feeding those to ZipFile would raise BadZipFile on re-run.
SUB_ZIP_PATHS = [
    entry
    for entry in os.listdir("/content/data")
    if zipfile.is_zipfile(os.path.join("/content/data", entry))
]
for zip_file in SUB_ZIP_PATHS:
    full_path = os.path.join("/content/data", zip_file)
    with zipfile.ZipFile(full_path, "r") as zipref:
        zipref.extractall("/content/data")

# Extract GLOVE
os.makedirs("/content/glove", exist_ok=True)
with zipfile.ZipFile(GLOVE_ZIP_PATH, "r") as zipref:
    zipref.extractall("/content/glove")

In [4]:
# Run metadata and hyperparameters, registered with W&B so they are stored with the run.
config = dict(
    ARCHITECTURE="CNN",
    DATASET="jigsaw_toxic_comments",
    WEIGHTS="STANFORD GLOVE.6B",
    MAX_SEQUENCE_LENGTH=100,
    MAX_VOCAB_SIZE=20000,
    EMBED_SIZE=100,
    VALIDATION_SPLIT=0.2,
    BATCH_SIZE=128,
    EPOCHS=10,
)
wandb.init(config=config, project="Toxic_Comment_with_CNN")
# From here on `config` is the W&B config object; later cells read it via attributes
# (e.g. config.EMBED_SIZE).
config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mtaiduc1001[0m ([33mduckyhome[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
print("Load word vectors...")
# Maps each token in the GloVe file to its embedding vector (float32 array).
word2vec = {}
# Explicit encoding: without it open() falls back to the platform locale, which
# makes the parse machine-dependent. UTF-8 is what the Colab runtime this notebook
# targets would have used by default.
with open(f"/content/glove/glove.6B.{config.EMBED_SIZE}d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vec = np.array(values[1:], dtype="float32")
        word2vec[word] = vec
# Report the vector length from the stored data rather than from the loop variable,
# so the summary line also works when the file yields no vectors.
vector_length = len(next(iter(word2vec.values()), ()))
print(f"Found {len(word2vec)} word vectors with length {vector_length}.")

Load word vectors...
Found 400000 word vectors with length 100.


In [6]:
print("Load comments...")
train = pd.read_csv("/content/data/train.csv")

# Every column from the third one onward is used as a label column.
possible_labels = train.columns.values[2:]
targets = train[possible_labels].values

# Comment text as an array, with missing entries replaced by a placeholder string.
comment_text = train["comment_text"].fillna("DUMMY_VALUE")
sentences = comment_text.values

Load comments...


In [7]:
# Tokenizer capped at config.MAX_VOCAB_SIZE words.
tokenizer = Tokenizer(num_words=config.MAX_VOCAB_SIZE)
# Build the vocabulary from the comments ...
tokenizer.fit_on_texts(sentences)
# ... then turn each comment into a sequence of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [8]:
# Length of every tokenized comment, computed once and reused for all three statistics.
sequence_lengths = [len(seq) for seq in sequences]
print(f"Max sequence length: {max(sequence_lengths)}")
print(f"Min sequence length: {min(sequence_lengths)}")
# Median taken as the middle element of the sorted lengths.
s = sorted(sequence_lengths)
print(f"Median sequence length: {s[len(s)//2]}")

Max sequence length: 1400
Min sequence length: 0
Median sequence length: 35


In [9]:
# Word -> integer index mapping learned by the tokenizer.
word2idx = tokenizer.word_index
# Reports the size of the full mapping.
print(f"Found {len(word2idx)} unique tokens.")

Found 210337 unique tokens.


In [10]:
# Bring every sequence to exactly config.MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(
    sequences,
    maxlen=config.MAX_SEQUENCE_LENGTH,
)
print("Shape of data tensor: ", data.shape)

Shape of data tensor:  (159571, 100)


In [11]:
# 159571 is the number of training examples.
# 1400 is the length of the longest example.
# 210337 is the number of unique tokens, i.e. the number of unique words.
# (159571, 100) is the shape of the dataset after tokenization: rows are examples, columns are the maximum number of words per example after cropping and padding.

In [12]:
print("Filling pre-trained embeddings...")
# One row per word index that the model can see.
num_words = min(config.MAX_VOCAB_SIZE, len(word2idx)+1)
# Rows start as zeros; words without a pre-trained vector keep the zero row.
embedding_matrix = np.zeros((num_words, config.EMBED_SIZE))
for word, i in word2idx.items():
    if i >= config.MAX_VOCAB_SIZE:
        # Index lies outside the vocabulary cap.
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word.
        continue
    embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [13]:
class CNNToxicComment(Model):
    """1D-CNN classifier on top of a frozen, pre-initialised embedding layer.

    Reads the notebook globals `num_words`, `config` and `embedding_matrix`
    at construction time. The final layer has 6 sigmoid units (presumably one
    per label column -- confirm against `possible_labels`).
    """

    def __init__(self):
        """Create the embedding layer and the convolutional body."""
        super().__init__()
        # Embedding weights come from `embedding_matrix` and are not trained.
        self.embed_layer = Embedding(
            input_dim=num_words,
            output_dim=config.EMBED_SIZE,
            weights=[embedding_matrix],
            input_length=config.MAX_SEQUENCE_LENGTH,
            trainable=False,
        )
        # Three conv layers with pooling in between, ending in a global max pool.
        conv_stack = [
            Conv1D(128, 3, activation='relu'),
            MaxPooling1D(3),
            Conv1D(128, 3, activation='relu'),
            MaxPooling1D(3),
            Conv1D(128, 3, activation='relu'),
            GlobalMaxPooling1D(),
        ]
        # Dense head producing 6 independent sigmoid outputs.
        dense_head = [
            Dense(128, activation='relu'),
            Dense(6, activation='sigmoid'),
        ]
        self.body = keras.Sequential(conv_stack + dense_head)

    def call(self, x):
        """Embed the input `x` and return the output of the CNN body."""
        return self.body(self.embed_layer(x))

In [None]:
# One WandbCallback is enough: a single instance already sends all Keras metrics
# (loss, accuracy and their val_ counterparts) to W&B each epoch. Registering one
# callback per metric would log the same values repeatedly and give competing
# "best model" criteria. `monitor` only selects the metric used to judge the best epoch.
callbacks = [
    WandbCallback(
        monitor='val_loss',
        mode='min'
    )
]

model = CNNToxicComment()
# Sigmoid outputs with binary cross-entropy: each output is scored independently.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy']
)
# Train with the hyperparameters stored in the W&B config; a fraction of the data
# (config.VALIDATION_SPLIT) is held out for validation.
model.fit(
    data,
    targets,
    batch_size=config.BATCH_SIZE,
    epochs=config.EPOCHS,
    validation_split=config.VALIDATION_SPLIT,
    callbacks=callbacks
)