In [49]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

import tensorflow as tf

In [34]:
# loading data

train_data = pd.read_csv('train.csv', encoding='utf-8')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [35]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [36]:
train_data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [37]:
train_data.iloc[6]['comment_text']

'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [38]:
train_data[train_data.columns[2:]].iloc[6]

toxic            1
severe_toxic     1
obscene          1
threat           0
insult           1
identity_hate    0
Name: 6, dtype: int64

# Text Normalization and Cleaning

Methods for text normalization and cleaning include:


In [68]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
import re

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def Text_Preprocessing_module(text):
    #Converting Object Data type into String Datatype

    text = str(text)
    # Removal of Punctuation - !"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`
    text = re.sub(r'[^\w\s]', '', text)
    #Remove URLS
    text = remove_urls(text)
    # Removing Emojis
    text = remove_emoji(text)
    # Lower casing the Text
    text = text.lower()
    # Stopwords removing
    text = [word for word in word_tokenize(text) if word not in stop_words]
    # # Removing Frequent Words
    # cnt = Counter()
    # for word in text:
    #     cnt[word] += 1
    # FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
    text = " ".join([word for word in text])
    #Tokenization
    text = word_tokenize(text)
    # Stemming and Lemmenization
    text = [lemmatizer.lemmatize(word) for word in text]
    # text = [stemmer.stem(word) for word in text] 
    
    return text



In [69]:
train_data['comment_text'] = train_data['comment_text'].apply(lambda x: Text_Preprocessing_module(x))
# train_data.to_csv('/content/drive/MyDrive/train_sample_processed1.csv', index=False)


In [70]:
train_data['comment_text'] = train_data['comment_text'].apply(lambda x: ' '.join(x))

In [71]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour im seemingly stuc...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,0001b41b1c6bb37e,cant make real suggestion improvement wondered...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0


In [72]:
from tensorflow.keras.layers import TextVectorization

In [73]:
X = train_data['comment_text']
y = train_data[train_data.columns[2:]].values

In [74]:
MAX_FEATURES = 200000 # max number of words for tokenizer or vocabulary size

In [75]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, 
                               output_sequence_length=1800,
                               output_mode='int')

In [76]:
vectorizer.adapt(X.values)

In [77]:
vectorizer('Hello World how are you doing today?')

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([208, 157,   1, ...,   0,   0,   0], dtype=int64)>

In [79]:
vectorized_text = vectorizer(X.values)
len(X)

159571

In [80]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  461,    55,    58, ...,     0,     0,     0],
       [    1,   963,  1207, ...,     0,     0,     0],
       [  320,   317,    16, ...,     0,     0,     0],
       ...,
       [28800,  6739,   273, ...,     0,     0,     0],
       [   50,     9,   125, ...,     0,     0,     0],
       [   61,    10,    13, ...,     0,     0,     0]], dtype=int64)>

In [81]:
#   Making a tensorflow dataset from the vectorized text
# tensorflow dataset is a data structure that is optimized for training on a GPU or TPU
# MCSHBAP - map cache shuffle batch prefetch 
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16) # batch size of 16
dataset = dataset.prefetch(tf.data.AUTOTUNE) # prefetching data to optimize training and AUTOtune is used to automatically determine the number of batches to prefetch


In [82]:
batch_x , batch_y = dataset.as_numpy_iterator().next()

In [83]:
batch_x.shape

(16, 1800)

In [84]:
batch_y.shape

(16, 6)

In [85]:
train = dataset.take(int(len(X)*0.7))
val = dataset.skip(int(len(X)*0.7)).take(int(len(X)*0.2))
test = dataset.skip(int(len(X)*0.9)).take(int(len(X)*0.1))

In [86]:
train_generator = train.as_numpy_iterator()

In [87]:
train_generator.next()

(array([[  74,   13, 1224, ...,    0,    0,    0],
        [ 836, 3722, 5227, ...,    0,    0,    0],
        [ 946, 7974,  112, ...,    0,    0,    0],
        ...,
        [   7,  146,   80, ...,    0,    0,    0],
        [  43,  515,  236, ...,    0,    0,    0],
        [ 631,  500,   33, ...,    0,    0,    0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 0, 1, 0, 1, 0]], dtype=int64))

2. Create Sequential Model

In [88]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Embedding

In [89]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1,32)) # embedding layer to convert words to vectors of size 32
model.add(Bidirectional(LSTM(32,activation='tanh',return_sequences=True))) # bidirectional LSTM layer
model.add(Bidirectional(LSTM(32,activation='tanh'))) # bidirectional LSTM layer
model.add(Dense(128,activation='relu')) # dense layer
model.add(Dense(256,activation='relu')) # dense layer
model.add(Dense(128,activation='relu')) # dense layer
model.add(Dense(6,activation='relu')) # dense layer


In [90]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_6 (Bidirectio  (None, None, 64)         16640     
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, 64)               24832     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)              

In [91]:
history = model.fit(train,epochs=5,validation_data=val)

Epoch 1/5
1656/9974 [===>..........................] - ETA: 5:50:55 - loss: 0.2489 - accuracy: 0.9675

KeyboardInterrupt: 