# Chat Toxicity

### Libraries

In [None]:
# basic libraries
import numpy as np
import pandas as pd
import os
# For plots
from matplotlib import pyplot as plt
from tensorflow.data import Dataset,AUTOTUNE
from tensorflow.keras.layers import TextVectorization

### Config

In [None]:
# Text vectorization: vocabulary cap and fixed number of tokens kept per comment
TokenLimit, SentenceLimit = 100_000, 1_700

# tf.data pipeline: shuffle buffer size (in elements) and batch size
ShuffleParameter, BatchSize = 100_000, 16

# Layer widths of the Sequential model
LSTM_NEURONS = 32
Dense1Neurons, Dense2Neurons, Dense3Neurons = 128, 256, 128
# Units in the final sigmoid layer (presumably one per label column - confirm
# against the number of columns selected into yData)
OutputLayer = 6

### Importing dataset

In [None]:
# Load the training CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### Looking into the data

In [None]:
# df[df['toxic']==1]

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels; everything from the third column onward is later used as targets.
df.columns

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

### Preprocessing The Data

#### Splitting Data and renaming a column

In [None]:
# Relabel the 'obscene' column as 'sexually_explicit'; df is modified in place.
df.rename(
    columns={'obscene': 'sexually_explicit'},
    inplace=True,
)

In [None]:
# Model input: the raw comment strings.
xData = df.loc[:, 'comment_text']
# Targets: every column from the third one onward
# (presumably the first two are an id and the comment text - confirm with df.columns).
yData = df.loc[:, df.columns[2:]]

In [None]:
# Preview the first rows of the target columns.
yData.head()

#### Text Tokenization

In [None]:
# Layer that turns each comment into a sequence of integer token ids:
# the vocabulary is capped at TokenLimit entries and every output sequence
# has exactly SentenceLimit positions.
vectorize = TextVectorization(
    max_tokens=TokenLimit,
    output_sequence_length=SentenceLimit,
    output_mode='int',
)

In [None]:
# Fit the vectorizer's vocabulary on the comment text.
vectorize.adapt(xData)

In [None]:
# Vectorize every comment in one call; each row is a SentenceLimit-long
# sequence of integer token ids.
# NOTE(review): this materializes the whole corpus in memory at once -
# confirm that is acceptable for the dataset size.
vectorizedData=vectorize(xData)

#### Creating Dataset

In [None]:
# Build the input pipeline from (token ids, labels) pairs.
# No map() step is required because the text is already vectorized.
dataset = Dataset.from_tensor_slices((vectorizedData, yData))

# Cache the elements so later passes over the data do not recompute them.
dataset = dataset.cache()

# Shuffle after caching so the cache holds the unshuffled elements.
# reshuffle_each_iteration=False pins the shuffled order: the train/test/val
# subsets are carved out of this dataset with take()/skip(), and with the
# default (reshuffle on every iteration) each epoch would pick different
# elements for each subset, leaking training samples into validation/test.
dataset = dataset.shuffle(ShuffleParameter, reshuffle_each_iteration=False)

# Group elements into batches of BatchSize for training.
dataset = dataset.batch(BatchSize)

# Prepare upcoming batches while the current one is being consumed.
dataset = dataset.prefetch(AUTOTUNE)

#### Creating test set, training set and validation set

In [None]:
# Subset sizes as 70% / 20% / 10% of the dataset length, truncated to whole
# units. The dataset is batched, so these counts are in batches, not samples.
trainSize, testSize, valSize = (
    int(len(dataset) * percent / 100) for percent in (70, 20, 10)
)

In [None]:
# Carve consecutive, non-overlapping ranges of batches out of the pipeline:
# first trainSize batches, then testSize, then valSize.
# NOTE: the ranges are only disjoint if `dataset` yields its elements in the
# same order on every iteration.
trainData = dataset.take(count=trainSize)
testData = dataset.skip(count=trainSize).take(count=testSize)
valData = dataset.skip(count=trainSize + testSize).take(count=valSize)

### Creating Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
# Assemble the classifier layer by layer.
model = Sequential()

# Token ids -> 32-dimensional embeddings (input dimension is TokenLimit + 1).
model.add(Embedding(TokenLimit + 1, 32))

# Bidirectional LSTM over the embedded sequence.
model.add(Bidirectional(LSTM(LSTM_NEURONS, activation='tanh')))

# Fully connected feature extractor.
model.add(Dense(Dense1Neurons, activation='relu'))
model.add(Dense(Dense2Neurons, activation='relu'))
model.add(Dense(Dense3Neurons, activation='relu'))

# Independent sigmoid output per unit (OutputLayer units in total).
model.add(Dense(OutputLayer, activation='sigmoid'))

#### Training the model

In [None]:
# Configure training: Adam optimizer with binary cross-entropy loss.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Train for three epochs, evaluating on the validation subset after each one;
# the returned History object is kept for plotting.
history = model.fit(
    x=trainData,
    validation_data=valData,
    epochs=3,
)

In [None]:
# Write the trained model to coolModel.h5 in the working directory.
model.save("coolModel.h5")

In [None]:
import joblib
# Also dump the model object to coolModel.pkl with maximum compression (9).
# NOTE(review): this pickles the Keras model; whether it can be loaded back
# presumably depends on the installed Keras/TensorFlow version - confirm, or
# rely on the coolModel.h5 file written by model.save(). Pickle files must
# only ever be loaded from trusted sources.
joblib.dump(model,"coolModel.pkl",compress=9)

In [None]:
# Plot every series recorded in the training history, one line per key.
# DataFrame.plot() creates its own figure unless it is given an `ax`, so the
# size is passed to it directly; a separate plt.figure(figsize=...) call
# beforehand would only leave an empty extra figure and not affect this plot.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()