In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile
# Unpack the three competition archives into the current working directory.
# Each archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes (previously all three ZipFile objects were left open).
jigsaw_input_dir = '/kaggle/input/jigsaw-toxic-comment-classification-challenge'
for archive_name in ('test.csv.zip', 'train.csv.zip', 'test_labels.csv.zip'):
    with zipfile.ZipFile(jigsaw_input_dir + '/' + archive_name) as z:
        z.extractall()

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the extracted train/test CSVs from the working directory (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Preview the first rows of the test set.
test_df.head()

In [None]:
# Preview the first rows of the training set.
train_df.head()

In [None]:
# Word-level tokenizer: lowercases, strips the punctuation listed in `filters`,
# splits on spaces and caps the vocabulary used for encoding via `num_words`.
#
# Out-of-vocabulary words are replaced by the OOV token. Keras never assigns
# index 0 to a word and gives the OOV token index 1, so unknown words are
# encoded as 1 (index 0 stays free for padding).
# The OOV token is a string here: the previous integer 0 became a non-string
# key in word_index, which presumably does not survive a JSON round-trip
# (to_json / tokenizer_from_json) -- the key comes back as "0" while
# oov_token stays the int 0. Encoded sequences are unchanged by this fix.
tokenizer = Tokenizer(
    num_words=5000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token="<OOV>",
)

In [None]:
# Build the vocabulary from the training comments only.
tokenizer.fit_on_texts(train_df["comment_text"].tolist())

In [None]:
# Encode every comment as a list of integer word indices (train first, then test).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame["comment_text"].to_list())
    for frame in (train_df, test_df)
)

In [None]:
# Show the encoded form of the first training comment.
train_sequences[0]

In [None]:
# Snapshot the tokenizer's configuration for inspection in the next cells.
tok_config = tokenizer.get_config()

In [None]:
# List the entries available in the tokenizer configuration.
tok_config.keys()

In [None]:
# Peek at the index -> word mapping.
# NOTE(review): Keras' Tokenizer.get_config appears to return this mapping as a
# JSON-encoded string, so the slice shows its first 100 characters rather than
# 100 entries -- confirm against the installed Keras version.
tok_config["index_word"][:100]

In [None]:
# Peek at the word -> index mapping.
# NOTE(review): Keras' Tokenizer.get_config appears to return this mapping as a
# JSON-encoded string, so the slice shows its first 100 characters rather than
# 100 entries -- confirm against the installed Keras version.
tok_config["word_index"][:100]

In [None]:
# Number of tokens in each encoded training comment (before padding).
lengths = list(map(len, train_sequences))

In [None]:
from collections import Counter
from matplotlib import pyplot as plt

In [None]:
# Histogram of sequence lengths, used to pick a sensible padding length.
# Uses the explicit fig/ax interface and labels the figure so it can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(18, 6))
ax.hist(lengths, bins=500)
ax.set(
    title="Distribution of sequence lengths",
    xlabel="Tokens per comment",
    ylabel="Number of comments",
)
plt.show()

In [None]:
# Bring every sequence to exactly 200 tokens. Shorter sequences are padded with
# zeros at the front; longer ones are truncated at the front. Train and test
# share the same settings, so the options are written once and reused.
padding_options = dict(
    maxlen=200,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0.0,
)

train_sequences_padded = pad_sequences(train_sequences, **padding_options)

test_sequences_padded = pad_sequences(test_sequences, **padding_options)

In [None]:
# Sanity check after padding: recompute the per-row lengths of the padded
# training sequences and plot them (note: this rebinds `lengths`).
lengths = list(map(len, train_sequences_padded))
plt.figure(figsize=(18, 6))
plt.hist(lengths, bins=500)
plt.show()

In [None]:
# Show the first padded test sequence.
test_sequences_padded[0]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Start the model with an embedding layer: 5001 possible indices mapped to
# 64-dimensional vectors over 200-token inputs; mask_zero=True makes index 0
# (the padding value) a masked position.
model = Sequential([
    Embedding(5001, 64, mask_zero=True, input_length=200),
])
model.summary()

In [None]:
# Stack two 100-unit LSTMs: the first emits the full sequence so the second
# can consume it; the second emits only its final state.
# NOTE(review): Keras documents `unroll` as suitable only for short sequences;
# with 200 timesteps it may be worth dropping -- confirm memory/speed impact.
# Re-running this cell appends the layers again; rebuild `model` first.
for recurrent_layer in (
    LSTM(100, return_sequences=True, unroll=True),
    LSTM(100, unroll=True),
):
    model.add(recurrent_layer)
model.summary()

In [None]:
# Classification head: a 100-unit ReLU layer followed by six independent
# sigmoid outputs (one per label).
# Re-running this cell appends the layers again; rebuild `model` first.
for dense_layer in (
    Dense(100, activation="relu"),
    Dense(6, activation="sigmoid"),
):
    model.add(dense_layer)
model.summary()

In [None]:
from tensorflow.keras.metrics import AUC

# ROC-AUC metric tracked during training.
# The name is pinned to "auc" so the validation metric is always logged as
# "val_auc". With name=None Keras auto-generates a name, which can become
# "auc_1", "auc_2", ... when the metric is re-created in the same session and
# then no longer matches a callback monitoring "val_auc".
# multi_label=False keeps the original behaviour: per the Keras docs, all
# labels are flattened into a single pool before the AUC is computed.
# Arguments that were passed as None (dtype, thresholds, num_labels,
# label_weights) are omitted; None is their default.
auc = AUC(
    num_thresholds=200,
    curve='ROC',
    summation_method='interpolation',
    name="auc",
    multi_label=False,
    from_logits=False,
)
# Binary cross-entropy treats each of the sigmoid outputs as its own yes/no problem.
model.compile(loss="binary_crossentropy", optimizer="RMSProp", metrics=[auc, "accuracy"])

In [None]:
# Training targets as a 2-D array with one column per label.
# Columns are selected by name rather than by position (previously iloc[:, 2:])
# so the targets stay correct, and in this exact order, even if the frame gains
# or reorders columns. The order matches the one used when predictions are
# written back to the test frame.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train_df[label_columns].values

In [None]:
# Stop training once validation AUC has not improved for 5 epochs and roll the
# model back to the best epoch's weights.
# mode="max" is explicit because AUC is higher-is-better: with the default
# mode="auto" some Keras versions infer the direction from the metric name and
# only recognise accuracy-like names, so "val_auc" could be minimised and the
# worst weights restored.
es = EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train for up to 10 epochs, holding out 20% of the training data for
# validation; the early-stopping callback may end training sooner.
model.fit(
    x=train_sequences_padded,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[es],
)

In [None]:
# Run the model on the padded test sequences. Despite its name, `y_test` holds
# the model's predictions, not ground-truth labels.
y_test = model.predict(test_sequences_padded, verbose=1)

In [None]:
# Check the shape of the prediction array.
y_test.shape

In [None]:
# Load the extracted test labels file and preview it.
# NOTE(review): test_labels_df is not used by any later cell visible here.
test_labels_df = pd.read_csv("test_labels.csv")
test_labels_df.head()

In [None]:
# Attach the predictions to test_df as six label columns (modifies test_df in place).
# Assumes y_test has exactly six columns in this label order -- TODO confirm it
# matches the column order of the training targets.
test_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_test

In [None]:
# Preview the test frame again, now including the prediction columns.
test_df.head()

In [None]:
# Write the submission file: every column of test_df except the raw comment
# text, without the DataFrame index.
test_df.drop(columns="comment_text").to_csv("submission.csv",index=False)