In [None]:
##importing the dependencies
import re
import pandas as pd
import numpy as np
import nltk

In [None]:
#configuring the path of kaggle.json file
# Create the per-user .kaggle directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the API token file from the current working directory into it.
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI (-c names the competition).
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
 97% 51.0M/52.6M [00:03<00:00, 24.6MB/s]
100% 52.6M/52.6M [00:03<00:00, 16.9MB/s]


In [None]:
##extracting all the files from dataset
from zipfile import ZipFile
dataset = '/content/jigsaw-toxic-comment-classification-challenge.zip'
# Bind the open archive to a name that does not shadow the built-in zip();
# a top-level `as zip` would replace the built-in for every later cell.
with ZipFile(dataset, 'r') as archive:
  # Extract every member into the current working directory.
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [None]:
# List the contents of the current working directory.
!ls

jigsaw-toxic-comment-classification-challenge.zip  test.csv.zip
kaggle.json					   test_labels.csv.zip
sample_data					   train.csv.zip
sample_submission.csv.zip


In [None]:
##extracting the train zipfile
from zipfile import ZipFile
dataset = '/content/train.csv.zip'

# Bind the open archive to a name that does not shadow the built-in zip();
# a top-level `as zip` would replace the built-in for every later cell.
with ZipFile(dataset, 'r') as archive:
  # Extract every member into the current working directory.
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [None]:
# List the contents of the current working directory.
!ls

jigsaw-toxic-comment-classification-challenge.zip  test.csv.zip
kaggle.json					   test_labels.csv.zip
sample_data					   train.csv
sample_submission.csv.zip			   train.csv.zip


In [None]:
# Load the extracted training CSV into a DataFrame.
data = pd.read_csv('/content/train.csv')

In [None]:
# Preview the first five rows.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Select the raw comment text; indexing with a single column label yields a pandas Series.
X = data['comment_text']

In [None]:
# Label matrix: every column from position 2 onward, as a NumPy array.
y = data.iloc[:, 2:].values

Preprocessing the comment text

In [None]:
# Convert every comment to lowercase.
X = X.apply(str.lower)

In [None]:
##removing the URLs from the text data
import re
def remove_url(text):
  """Return `text` with http(s):// links and www.-prefixed links deleted."""
  url_pattern = re.compile(r"https?://\S+|www\.\S+")
  return url_pattern.sub("", text)

In [None]:
# Strip URLs from every comment.
X = X.apply(remove_url)

In [None]:
##function for removing the html tags
def remove_html(text):
  """Return `text` without <...> tags and without character entities such as &amp; or &#39;."""
  # Entity names and hex digits are matched in lowercase only.
  tag_or_entity = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
  return re.sub(tag_or_entity, '', text)

In [None]:
# Strip HTML tags and character entities from every comment.
X = X.apply(remove_html)

In [None]:
##function for removing non-ASCII characters
def remove_nonasci(text):
  """Return `text` keeping only characters whose code point is in the ASCII range (0-127)."""
  return ''.join(ch for ch in text if ord(ch) <= 0x7f)

In [None]:
# Drop non-ASCII characters from every comment.
X = X.apply(remove_nonasci)

In [None]:
##function for removing the special characters
def remove_special_characters(text):
    """
        Strip emoji and other symbol/pictograph characters from `text`.

        Every run of characters that falls inside one of the code-point
        ranges listed below is deleted; all other characters are kept.
    """
    code_point_ranges = (
        u'\U0001F600-\U0001F64F',  # emoticons
        u'\U0001F300-\U0001F5FF',  # symbols & pictographs
        u'\U0001F680-\U0001F6FF',  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF',  # flags (iOS)
        u'\U00002702-\U000027B0',
        # U+24C2 through U+1F251: a very wide span that also takes in e.g. CJK ideographs
        u'\U000024C2-\U0001F251',
    )
    symbol_re = re.compile('[' + ''.join(code_point_ranges) + ']+', flags=re.UNICODE)
    return symbol_re.sub(r'', text)

In [None]:
# Strip emoji / symbol characters from every comment.
X = X.apply(remove_special_characters)

In [None]:
from tqdm import tqdm
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus read by the stop-word filtering loop further down.
nltk.download('stopwords')
# Fetch the 'punkt' resource.
# NOTE(review): nothing visible in this notebook uses it — confirm it is needed.
nltk.download('punkt')

In [None]:
##removing all the non alphabet characters including the numbers
# X is created as a Series, but this cell and the ones after it index it by
# column ('text', 'clean_text'). Promote it to a one-column DataFrame first;
# on a fresh kernel X['text'] on the Series raises KeyError.
# The isinstance guard keeps the cell safe to re-run.
if isinstance(X, pd.Series):
  X = X.to_frame(name='text')

corpus = []
for comment in tqdm(X['text']):
  # Replace every non-letter with a space, then collapse whitespace runs.
  letters_only = re.sub('[^a-zA-Z]' , ' ' , comment)
  corpus.append(' '.join(letters_only.split()))

100%|██████████| 159571/159571 [00:04<00:00, 38354.26it/s]


In [None]:
##removing all the stopwords from sentences
# Build the stop-word collection once. The previous version evaluated
# stopwords.words('english') for every single word and searched a list each
# time; a set built up front gives the same result with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

new_corpus = []
for sentence in tqdm(corpus):
  kept_words = [word for word in sentence.split() if word not in english_stopwords]
  new_corpus.append(' '.join(kept_words))

100%|██████████| 159571/159571 [18:58<00:00, 140.18it/s]


In [None]:
# Show full cell contents instead of truncating long strings when frames are displayed.
pd.set_option('display.max_colwidth', None)
#X.rename(columns={0:'text'} , inplace=True)
# Store the stop-word-filtered text under the 'clean_text' key.
# NOTE(review): this adds a column only if X is a DataFrame here, while X is first created as a Series — confirm.
X['clean_text'] = new_corpus

**Model Building**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Embedding , Input
from tensorflow.keras.layers import LSTM , Bidirectional , Dropout
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Preview the first ten rows.
X.head(10)

Unnamed: 0,text,clean_text
0,explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalisms just closure on some gas after i voted at new york dolls fac and please don t remove the template from the talk page since i m retired now,explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired
1,d aww he matches this background colour i m seemingly stuck with thanks talk january utc,aww matches background colour seemingly stuck thanks talk january utc
2,hey man i m really not trying to edit war it s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info
3,more i can t make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no one else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it s listed in the relevant form eg wikipedia good article nominations transport,make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport
4,you sir are my hero any chance you remember what page that s on,sir hero chance remember page
5,congratulations from me as well use the tools well talk,congratulations well use tools well talk
6,cocksucker before you piss around on my work,cocksucker piss around work
7,your vandalism to the matt shirvington article has been reverted please don t do it again or you will be banned,vandalism matt shirvington article reverted please banned
8,sorry if the word nonsense was offensive to you anyway i m not intending to write anything in the article wow they would jump on me for vandalism i m merely requesting that it be more encyclopedic so one can use it for school as a reference i have been to the selective breeding page but it s almost a stub it points to animal breeding which is a short messy article that gives you no info there must be someone around with expertise in eugenics,sorry word nonsense offensive anyway intending write anything article wow would jump vandalism merely requesting encyclopedic one use school reference selective breeding page almost stub points animal breeding short messy article gives info must someone around expertise eugenics
9,alignment on this subject and which are contrary to those of dulithgow,alignment subject contrary dulithgow


In [None]:
MAX_WORDS = 100000   ##number of words in the vocab
from tensorflow.keras.layers import TextVectorization
# Text -> integer-id layer: vocabulary capped at MAX_WORDS tokens, integer
# output mode, and an output sequence length of 2000 ids per comment.
vectorizer = TextVectorization(max_tokens=MAX_WORDS,
                               output_sequence_length=2000,
                               output_mode='int')

In [None]:
# Learn the vocabulary from the cleaned comments.
vectorizer.adapt(X['clean_text'].values)

In [None]:
# Number of entries in the learned vocabulary.
len(vectorizer.get_vocabulary())

100000

In [None]:
# Turn every cleaned comment into a fixed-length sequence of token ids.
vectorized_text = vectorizer(X['clean_text'].values)
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text , y))
dataset = dataset.cache()
# Shuffle into ONE fixed order. With the default reshuffle_each_iteration=True
# every pass over `dataset` is a different permutation, so the take()/skip()
# partitions below would hold different examples each time they are iterated
# and validation/test batches would overlap with batches used for training.
# (If training for several epochs, re-shuffle `train` itself after the split.)
dataset = dataset.shuffle(160000 , seed=2 , reshuffle_each_iteration=False)
dataset = dataset.batch(16)  ##batch of 16 data point
dataset = dataset.prefetch(8)   #it helps prevent the bottle neck
# Pull one batch as NumPy arrays for a quick sanity check.
batch_X , batch_y = next(dataset.as_numpy_iterator())

# Split by batch count: 70% train / 20% validation / 10% test.
n_batches = len(dataset)
n_train = int(n_batches*.7)
train = dataset.take(n_train)  ##70 % of data for training
val = dataset.skip(n_train).take(int(n_batches*.2))  # skipping the 70% and assigning the next 20% for val
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1)) ## skipping the 90% and assigning the last 10% for test
train_generator = train.as_numpy_iterator()
# Built-in next() works on any iterator, unlike the iterator's own .next() method.
next(train_generator)
# Embedding -> bidirectional LSTM -> dense stack -> six sigmoid outputs.
model = Sequential([
    # token ids -> 32-dimensional vectors
    Embedding(MAX_WORDS +1 , 32),
    # read each sequence from both directions
    Bidirectional(LSTM(32 , activation='tanh')),
    # fully connected feature extractor
    Dense(128 , activation='relu'),
    Dense(256 , activation='relu'),
    Dense(256 , activation='relu'),
    # output layer: six independent sigmoid units
    Dense(6 , activation='sigmoid'),
])
model.compile(optimizer='adam' , loss='BinaryCrossentropy')

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          3200032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 6)                 1542      
                                                        

In [None]:
# Train for a single epoch on the train split, with the val split passed as validation data.
history = model.fit(train , epochs=1 , validation_data=val)



In [None]:
# Model outputs (six sigmoid values per comment) for the batches in the test split.
y_pred = model.predict(test)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.model_selection import train_test_split

In [None]:
# TF-IDF vectorizer with default settings.
# NOTE: this rebinds `vectorizer`, which until now held the Keras TextVectorization layer.
vectorizer = TfidfVectorizer()

In [None]:
# Fit TF-IDF on the cleaned comments and transform them in one step.
# NOTE(review): the vectorizer is fitted on all rows, before the train/test split further down — confirm this is intended.
x = vectorizer.fit_transform(X['clean_text'])

In [None]:
# Display the matrix's one-line repr (type, shape, stored-element count)
# instead of printing every stored (row, column) value.
x

  (0, 122163)	0.2457492371378986
  (0, 132327)	0.13081076925883425
  (0, 105571)	0.08850551739932903
  (0, 142266)	0.08945784817396199
  (0, 143687)	0.16711781068666715
  (0, 120904)	0.14684509584821537
  (0, 110746)	0.09567049668511367
  (0, 49568)	0.2389972791776176
  (0, 40960)	0.32310321376858325
  (0, 163310)	0.207450547104356
  (0, 98595)	0.1285967365433928
  (0, 156613)	0.23753914200553589
  (0, 56181)	0.24544406872926616
  (0, 27046)	0.2768200417360127
  (0, 154229)	0.3073421958968296
  (0, 122407)	0.15842071837350788
  (0, 50156)	0.20515475109908857
  (0, 91109)	0.30948932643679034
  (0, 62337)	0.2706165642650714
  (0, 153458)	0.18850082861893697
  (0, 86251)	0.12792286719627344
  (0, 43796)	0.13090586741784432
  (0, 49048)	0.18421473509236752
  (1, 153634)	0.20168769931346445
  (1, 73821)	0.28947464145034674
  :	:
  (159569, 84841)	0.3999722770472836
  (159569, 116035)	0.32689678015425655
  (159569, 1440)	0.3279384267716939
  (159569, 155250)	0.38360295720466253
  (159569, 83

In [None]:
# Hold out 10% of the rows for testing; random_state fixes the split.
X_train ,X_test , Y_train , Y_test = train_test_split(x , y , test_size=0.10 , random_state=2)

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

(143613, 164770)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# NOTE: despite its name, `log_model` is a random forest; the name is kept
# because the prediction cell refers to it.
# random_state pins the forest's internal randomness so reruns build the same model.
log_model = RandomForestClassifier(random_state=2)
# Fit on the first 100,000 training rows only.
log_model.fit(X_train[:100000] , Y_train[:100000])

In [None]:
# Predict labels for the held-out test rows.
log_y_pred = log_model.predict(X_test)

In [None]:
# accuracy_score's signature is (y_true, y_pred). With 2-D multilabel targets
# it reports subset accuracy: a row counts as correct only if every label matches.
accuracy_score(Y_test , log_y_pred)

0.9098884572001504