# **An NLP model that uses a CNN to classify comments as toxic or not toxic.**

In [None]:
# Print the full path of every file found under the Kaggle input directory.
import os

for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

<h2 align=center> Toxic Comments Classification using 1D CNN with Keras</h2>

### Task 1: Import Packages and Functions

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
print(tf.__version__)

### Task 2: Load and Explore Data

In [None]:
# Read the training CSV and replace every missing value with a single space.
train_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
train_df = pd.read_csv(train_path).fillna(' ')

# Show a reproducible random sample of 10 rows.
train_df.sample(10, random_state=1)

In [None]:
 # Comment texts as an array; this is what the tokenizer is later fitted on.
 # NOTE(review): these two lines start with a stray space. Notebook cells
 # appear to tolerate it, but a plain .py script would raise IndentationError
 # -- confirm and dedent.
 x = train_df['comment_text'].values
 print(x)

In [None]:
# Show a reproducible random sample of 10 rows whose 'toxic' label is 1.
toxic_rows = train_df[train_df['toxic'] == 1]
toxic_rows.sample(10, random_state=10)

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Text of every comment whose 'toxic' label is 1.
comments = train_df['comment_text'].loc[train_df['toxic'] == 1].values

# Join the comments into one string. str(comments) must not be used here:
# for arrays above NumPy's print threshold it returns an abbreviated repr
# ("[... ... ...]"), so the cloud would be built from only a few comments
# plus repr punctuation instead of from the whole toxic subset.
toxic_text = ' '.join(comments)

wordcloud = WordCloud(
    width=640,
    height=640,
    background_color='black',
    stopwords=STOPWORDS).generate(toxic_text)

# Black figure background so it blends with the black word cloud.
fig = plt.figure(
    figsize=(12, 8),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Target labels: the values of the 'toxic' column, used later as the
# training/validation targets.
y = train_df['toxic'].values
print(y)

In [None]:
# Histogram of the 'toxic' label column, showing how the labels are distributed.
train_df['toxic'].plot(
    kind='hist',
    title='Distribution Of Toxic Comments',
)

In [None]:
# Number of rows for each distinct value of the 'toxic' label.
train_df['toxic'].value_counts()

### Task 3: Data Prep — Tokenize and Pad Text Data

In [None]:
# Vocabulary size: only the 20,000 most frequent words are kept by the
# tokenizer; all other words are ignored.
max_features = 20000

# Fixed sequence length: every comment is converted to exactly 400 tokens.
# Comments that are not 400 tokens long are padded (or cut) to this length
# by pad_sequences.
max_text_length = 400

In [None]:
# Tokenizer that maps each word to an integer index, keeping only the
# max_features most frequent words.
x_tokenizer = text.Tokenizer(num_words=max_features)

# Learn the vocabulary from the training comments.
x_tokenizer.fit_on_texts(list(x))

# Convert every comment into a list of word indices.
x_tokenized = x_tokenizer.texts_to_sequences(x)

# Bring every sequence to the same length (max_text_length).
x_train_val = sequence.pad_sequences(x_tokenized, maxlen=max_text_length)

### Task 4: Prepare Embedding Matrix with Pre-trained GloVe Embeddings

In [None]:
# # !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip.1

In [None]:
# Dimensionality of the GloVe vectors in the file read below.
embedding_dim = 100

# Maps each word to its GloVe vector (a float32 NumPy array).
embeddings_index = {}

# Each line of the GloVe file is "<word> <coef_1> ... <coef_n>".
# The context manager guarantees the file is closed even if a line fails to
# parse, and the explicit encoding avoids platform-dependent decoding.
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors. ')

In [None]:
# One row per vocabulary index; rows stay all-zero for words without a
# pre-trained vector.
embedding_matrix = np.zeros((max_features, embedding_dim))

for word, index in x_tokenizer.word_index.items():
    # Stop at the first index that does not fit in the matrix.
    # NOTE(review): this assumes word_index is iterated in ascending index
    # order, so no later entry could still fit -- confirm.
    if index >= max_features:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

### Task 5: Create the Embedding Layer

In [None]:
# Embedding layer initialised from the pre-built embedding matrix and frozen
# (trainable=False), so its weights are not updated during training.
pretrained_init = tf.keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    max_features,
    embedding_dim,
    embeddings_initializer=pretrained_init,
    trainable=False,
)

model = Sequential()
model.add(embedding_layer)

# Dropout directly on the embedded sequence.
model.add(Dropout(0.2))

### Task 6: Build the Model

In [None]:
# Number of filters passed to both Conv1D layers.
filters = 250
# Kernel size of the first Conv1D layer (the second one uses a literal 5).
kernel_size = 3 
# Number of units in the hidden Dense layer.
hidden_dims = 250

In [None]:
# Convolutional feature extractor followed by the classification head.
# NOTE(review): the first Conv1D has no activation while the second uses
# 'relu' -- confirm this is intentional.
remaining_layers = (
    Conv1D(filters, kernel_size, padding='valid'),
    MaxPooling1D(),
    Conv1D(filters, 5, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(hidden_dims, activation='relu'),
    Dropout(0.2),
    # Single sigmoid unit: one output value per comment.
    Dense(1, activation='sigmoid'),
)
for layer in remaining_layers:
    model.add(layer)

model.summary()

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Task 7: Train Model

In [None]:
# Hold out 15% of the padded sequences (and their labels) for validation;
# random_state makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y,
    test_size=0.15,
    random_state=1,
)

In [None]:
# Number of samples per gradient update.
batch_size = 32

# Train for 3 epochs, evaluating on the held-out validation split after each
# epoch. The batch size is taken from the variable above rather than from a
# duplicated literal, so changing it in one place is enough.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(x_val, y_val))

### Task 8: Evaluate Model

In [None]:
# Read the test CSV. Missing values are replaced with a single space, exactly
# as for the training data, so that every comment reaching the tokenizer is a
# string rather than a float NaN.
test_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip').fillna(' ')

In [None]:
# Test comment texts as an array, in the same form as the training texts.
x_test = test_df['comment_text'].values

In [None]:
# Encode the test comments with the tokenizer fitted on the training data,
# then bring them to the same fixed length as the training sequences.
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(
    x_test_tokenized,
    maxlen=max_text_length,
)

In [None]:
# Run the trained model on the padded test sequences, showing a progress bar.
y_testing = model.predict(
    x_testing,
    batch_size=32,
    verbose=1,
)

In [None]:
# Inspect the shape of the prediction array.
y_testing.shape

In [None]:
# Inspect the prediction for the first test comment.
y_testing[0]

In [None]:
# Flatten the predictions to one score per test row (presumably shape
# (n_rows, 1), given the single-unit output layer -- flattening is harmless
# either way). This avoids relying on the implicit truth value of one-element
# arrays and on a loop variable named like the training-text array `x`.
toxic_scores = y_testing.ravel()

# Scores below 0.5 are labelled 'not toxic'; everything else is 'toxic'.
test_df['Toxic'] = np.where(toxic_scores < .5, 'not toxic', 'toxic')
test_df[['comment_text', 'Toxic']].head(20)