### Importing data and libraries

In [1]:
import os
import numpy as np
import pandas as pd

import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Dense, Embedding, Input, Activation, Masking
from tensorflow.python.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras import optimizers, initializers, layers

import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('./jigsaw-toxic-comment-train.csv.zip')
test = pd.read_csv('./test.csv.zip')

In [3]:
print('Train shape: ' + str(train.shape))
print('Test shape: ' + str(test.shape))

Train shape: (223549, 8)
Test shape: (63812, 3)


### Data Exploration and Analysis

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
test.head()

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr


In [6]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,223549.0,223549.0,223549.0,223549.0,223549.0,223549.0
mean,0.095657,0.008777,0.054306,0.003082,0.050566,0.00947
std,0.294121,0.093272,0.226621,0.055431,0.21911,0.096852
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223549 entries, 0 to 223548
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             223549 non-null  object
 1   comment_text   223549 non-null  object
 2   toxic          223549 non-null  int64 
 3   severe_toxic   223549 non-null  int64 
 4   obscene        223549 non-null  int64 
 5   threat         223549 non-null  int64 
 6   insult         223549 non-null  int64 
 7   identity_hate  223549 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 13.6+ MB


### Cleaning the Dataset

In [8]:
review = train['comment_text'].loc[0]
review

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [9]:
def clean_text(text):
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\[[^]]*\]', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    return text

In [10]:
train['comment_text'] = train['comment_text'].apply(lambda x: clean_text(x))
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanationwhy the edits made under my usernam...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it...,0,0,0,0,0,0
3,0001b41b1c6bb37e,morei can t make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember...,0,0,0,0,0,0


In [11]:
test['content'] = test['content'].apply(lambda x: clean_text(x))
test.head()

Unnamed: 0,id,content,lang
0,0,doctor who adl viki ba l na doctor olarak...,tr
1,1,...,ru
2,2,quindi tu sei uno di quelli conservativi ...,it
3,3,malesef ger ekle tirilmedi ancak yle bir ey...,tr
4,4,resim seldabagcan jpg resminde kaynak sorunu ...,tr


In [12]:
def remove_stopwords(text):
    text = text.split()
    text = [word for word in text if not word in set(stopwords.words('english'))]
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

In [13]:
import sys
print(sys.getrecursionlimit())
sys.setrecursionlimit(10000000)

3000


In [14]:
train['comment_text'] = train['comment_text'].apply(lambda x: clean_text(x))
test['content'] = test['content'].apply(lambda x: clean_text(x))
train.to_csv('clean_train.csv')
test.to_csv('clean_test.csv')

In [15]:
train_df = train['comment_text']
test_df = test['content']

### LSTM

Steps for the Model:
- Tokenize
- Pad
- Create Model
- Fit the Model
- Evaluate

In [29]:
max_features = 20000
maxlen = 200
tokenizer = Tokenizer(num_words = max_features)

In [30]:
# Tokenize

tokenizer.fit_on_texts(train_df)
X_train_token = tokenizer.texts_to_sequences(train_df)

tokenizer.fit_on_texts(test_df)
X_test_token = tokenizer.texts_to_sequences(test_df)

In [31]:
# Pad

X_train = pad_sequences(X_train_token, maxlen = maxlen, padding = 'post')
X_test  = pad_sequences(X_test_token, maxlen = maxlen, padding = 'post')
print(X_train.shape, X_test.shape)

(223549, 200) (63812, 200)


In [32]:
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
targets = train[cols].values

In [37]:
# Create the Model

model = Sequential([Input(shape=(maxlen, )),
                    Embedding(max_features, 128, mask_zero = True),
                    LSTM(64, return_sequences = True, dropout = 0.2),
                    GlobalMaxPool1D(),
                    Dropout(0.2),
                    Dense(64, activation = 'relu'),
                    Dropout(0.2),
                    Dense(6, activation = 'softmax')])

In [38]:
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [39]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 200, 64)           49408     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 6)                

In [40]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

monitor = EarlyStopping(monitor = 'val_loss', 
                        min_delta = 1e-3, 
                        patience = 5, verbose = 1, 
                        restore_best_weights = True)

history = model.fit(X_train, targets,
                    batch_size = 32,
                    epochs = 3, validation_split = 0.1,
                    callbacks = [monitor])

Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [None]:
# Evaluate

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'], '')
plt.xlabel("Epochs")
plt.ylabel('Accuracy')
plt.title('Change of Accuracy over Epochs')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'], '')
plt.xlabel("Epochs")
plt.ylabel('Loss')
plt.title('Change of Loss over Epochs')
plt.legend(['loss', 'val_loss'])
plt.show()

### Prediction on test data

In [2]:
prediction = model.predict(X_test)
prediction

NameError: name 'model' is not defined

In [None]:
import json
from keras.models import model_from_json, load_model

model.save('toxicity_model.h5')

# Save Weights + Architecture
model.save_weights('toxicity_model_weights.h5')
with open('toxicity_model_architecture.json', 'w') as f:
    f.write(model.to_json())

### Test on custom data

In [4]:
!pip install 'tensorflow==2.4.1'
!pip install 'keras==2.4.0'





In [6]:
from tensorflow import keras
model = keras.models.load_model('./server/models/model')

In [7]:
from tensorflow import keras
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer
import numpy as np

In [8]:
CLASSES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\[[^]]*\]', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokenizer = TreebankWordTokenizer()
    comment_tokens = tokenizer.tokenize(text)
    return comment_tokens

def remove_stopwords(text):
    stemmer=PorterStemmer()
    words=[]

    for word in text:
        if(word not in stopwords.words('english')):
            stem_word=stemmer.stem(word)
            words.append(stem_word)
    return words

def output_prediction(text, max_features = 22000, maxlen = 200):
    tokenizer=Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(text)
    tokenized_train=tokenizer.texts_to_sequences(text)
    x_train=pad_sequences(tokenized_train,maxlen=maxlen)
    if len(x_train) == 0:
        return {
            "error" : "Not Found"
        }
    prediction=model.predict(x_train)
    prediction = np.sum(prediction, axis=0)
    result = []
    for i, value in enumerate(prediction):
        if i == 0:
            continue
        if value > 0.05:
            result.append(CLASSES[i]) 
    return {"result" : result if len(result) != 0 else [CLASSES[0]]}

def text_analysis(text):
    cleaned_text_data = clean_text(text)
    preprocessed_data = remove_stopwords(cleaned_text_data)
    return output_prediction(preprocessed_data)

In [21]:
test = "bitch"

In [23]:
print(text_analysis(test))

{'result': ['toxic']}


In [32]:
import json

toxicity = text_analysis(test)
output = json.dumps(toxicity)
output2 = json.loads(output)

print(output2['result'])

['toxic']
