# Main Library

In [1]:
# Reading Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, nltk, re, string

# Data Preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize

# Featuer Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Building Model
import tensorflow as tf
import tensorflow.keras as k
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, GlobalAveragePooling1D
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

## https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

# Reading Data

In [2]:
data_train = pd.read_csv(r'D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Projects For NLP\Data\toxic-comment-classification-challenge\train.csv')
data_test = pd.read_csv(r'D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Projects For NLP\Data\toxic-comment-classification-challenge\test.csv')
data_train.head(4)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0


In [3]:
data_test.head(4)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."


In [4]:
data_train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
data_test.isnull().sum()

id              0
comment_text    0
dtype: int64

# Text Preprocessing

In [6]:
punc = string.punctuation
stopword = stopwords.words('english')
ps = PorterStemmer()
lema = WordNetLemmatizer()

In [7]:
def preprocess(data):
    data = data.lower()
    data = re.sub('[^a-zA-Z]', ' ', data)
    data = data.split()
    data = ' '.join([lema.lemmatize(word) for word in data if (word not in punc) or (word not in stopword)])
    return data

In [8]:
data_train['text'] = data_train['comment_text'].apply(preprocess)
data_test['text'] = data_test['comment_text'].apply(preprocess)

data_train.head(4)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d aww he match this background colour i m seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i m really not trying to edit war it s...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can t make any real suggestion on impro...


In [9]:
data_test.head(4)

Unnamed: 0,id,comment_text,text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule is more succesful then you ll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,from rfc the title is fine a it is imo
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",source zawe ashton on lapland
3,00017563c3f7919a,":If you have a look back at the source, the in...",if you have a look back at the source the info...


# Drop Unusefull Columns

In [10]:
def drop(data):
    data = data.drop(columns=['id', 'comment_text'], axis=1)
    return data

data_train = drop(data_train)
data_test = drop(data_test)

data_train.head(2)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,text
0,0,0,0,0,0,0,explanation why the edits made under my userna...
1,0,0,0,0,0,0,d aww he match this background colour i m seem...


In [11]:
data_test.head(2)

Unnamed: 0,text
0,yo bitch ja rule is more succesful then you ll...
1,from rfc the title is fine a it is imo


# Splitting Data

In [12]:
X = data_train['text']
Y = data_train.drop(columns='text', axis=1)

Y[:2], X[:2]

(   toxic  severe_toxic  obscene  threat  insult  identity_hate
 0      0             0        0       0       0              0
 1      0             0        0       0       0              0,
 0    explanation why the edits made under my userna...
 1    d aww he match this background colour i m seem...
 Name: text, dtype: object)

# Feature Extranction

## Using TF-IDF

In [14]:
tfidf = TfidfVectorizer()
new_x = tfidf.fit_transform(X)

new_x

<159571x158781 sparse matrix of type '<class 'numpy.float64'>'
	with 6689980 stored elements in Compressed Sparse Row format>

In [15]:
data_test = tfidf.transform(data_test)

## Using Tokenizer

In [16]:
tokenize = Tokenizer()
tokenize.fit_on_texts(X)
x_seq = tokenize.texts_to_sequences(X)
maxlen = max([len(seq) for seq in x_seq])

x_pad = pad_sequences(x_seq, maxlen=maxlen, padding='pre')
word_voc_length = len(tokenize.word_index) + 1

print("The Shape Is Padding Data is --> ", x_pad.shape)
print("The Max Length of Word is --> ", maxlen)
print("Length The Word Vocab is --> " ,word_voc_length)

The Shape Is Padding Data is -->  (159571, 1403)
The Max Length of Word is -->  1403
Length The Word Vocab is -->  158808


# Splitting Data into Training & Testing

In [17]:
x_train_TFIDF, x_test_TFIDF, y_train_TFIDF, y_test_TFIDF = train_test_split(new_x, Y, train_size=0.7)

x_train, x_test, y_train, y_test = train_test_split(x_pad, Y, train_size=0.7, shuffle=True, random_state=42)

# Building Deep Learning Model

In [28]:
model = k.models.Sequential([
    Embedding(word_voc_length, 100, input_length=maxlen),
    GlobalAveragePooling1D(),
    Dense(128, activation="relu"),
    Dense(6, activation="softmax")
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1403, 100)         15880800  
                                                                 
 global_average_pooling1d_1  (None, 100)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_2 (Dense)             (None, 128)               12928     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                                 
Total params: 15894502 (60.63 MB)
Trainable params: 15894502 (60.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
model.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10

# Building Machine Learning Model

## RandomForestClassifier

In [36]:
model_tfidf_rf = RandomForestClassifier()
model_tfidf_rf.fit(x_train_TFIDF, y_train_TFIDF)

print('THe Training Score is --> ', model_tfidf_rf.score(x_train_TFIDF, y_train_TFIDF))
print('THe Testing Score is --> ', model_tfidf_rf.score(x_test_TFIDF, y_test_TFIDF))

THe Training Score is -->  0.9991136894690195
THe Testing Score is -->  0.9104695855614974


In [37]:
model_tokenize = RandomForestClassifier()
model_tokenize.fit(x_train, y_train)

print('THe Training Score is --> ', model_tokenize.score(x_train, y_train))
print('THe Testing Score is --> ', model_tokenize.score(x_test, y_test))

THe Training Score is -->  0.999185310522028
THe Testing Score is -->  0.8984375


# Prediction