### 0. Install Dependencies

In [29]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [30]:
df = pd.read_csv(
    os.path.join('data','train.csv','train.csv')
)

In [31]:
df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
26981,477397a3f5343587,Personally I'd have nothing there (i.e. no spa...,0,0,0,0,0,0
37278,6388a502e378e2b4,Reasons to KEEP my FrontPoint system==\nCurren...,0,0,0,0,0,0
122893,917756a7ad6c74b7,"2007 litter act sect 11 is shit, grab qld leg ...",1,0,0,0,0,0
106321,38def99b8c1f123e,"""\nOK, so now I'm even more baffled. Is sarcas...",0,0,0,0,0,0
62167,a64de15dd498c6a8,"Your ass \n\nHey, Okay, you don't want to put ...",1,0,1,0,1,0


In [32]:
df.iloc[3]['comment_text']

'"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "'

In [33]:
df[df.columns[2:]].iloc[155251]

toxic            1
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 155251, dtype: int64

### 1. Preprocess

In [34]:
from tensorflow.keras.layers import TextVectorization

In [35]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [36]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [37]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [38]:
MAX_WORDS = 200000 # number of words in the vocab

In [39]:
vectorizer = TextVectorization(max_tokens=MAX_WORDS, output_sequence_length=1800,output_mode = 'int')

In [40]:
vectorizer.adapt(X.values)

In [41]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [42]:
vectorizer("Hi earth this is mars")[:8]

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([ 171, 1252,   14,    9, 6249,    0,    0,    0], dtype=int64)>

In [43]:
vectorized_text = vectorizer(X.values)

In [44]:
vectorized_text[5][:10]

<tf.Tensor: shape=(10,), dtype=int64, numpy=
array([ 2522,    31,    36,    18,   104,    84,     2,  2190,   104,
       14746], dtype=int64)>

In [45]:
# MCSHBAP - map, cache, shuffle, batch, prefetch from_tensor_slices, list_file

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks


In [46]:
batch_x,batch_y=dataset.as_numpy_iterator().next()

In [47]:
batch_x

array([[   67,    70,   470, ...,     0,     0,     0],
       [57681,     7,    51, ...,     0,     0,     0],
       [  480, 11616,    61, ...,     0,     0,     0],
       ...,
       [    8,  4457,    12, ...,     0,     0,     0],
       [34125,   143,   273, ...,     0,     0,     0],
       [    8,    55,    69, ...,     0,     0,     0]], dtype=int64)

In [48]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [49]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [50]:
len(train),len(val),len(test) # length of batches

(6981, 1994, 997)

In [51]:
x = lambda i : i*16
for i in [len(train),len(val),len(test)]:
    print(x(i))

111696
31904
15952


### 2. Create Sequential Model

In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [55]:
model = Sequential()
model.add(Embedding(MAX_WORDS+1, 32)) # +1 is for unknown
model.add(Bidirectional(LSTM(32, activation='tanh'))) # for gpu we use tanh instead of relu
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6,activation='sigmoid'))


In [56]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [57]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                      

In [58]:
history = model.fit(train,epochs=1, validation_data=val)

1601/6981 [=====>........................] - ETA: 1:40:57 - loss: 0.0857

KeyboardInterrupt: 

In [None]:
history.history

NameError: name 'history' is not defined

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show

### 3. Make Predictions

### 4. Evaluate Model

### 5. Test and Gradio