# Downloading the Dataset

In [1]:
!gdown --id 18l6IwSqavnqtLQpVnrRqOugZf9XkhEAN

Downloading...
From: https://drive.google.com/uc?id=18l6IwSqavnqtLQpVnrRqOugZf9XkhEAN
To: /content/jigsaw-toxic-comment-classification-challenge.zip
55.2MB [00:00, 82.8MB/s]


# Extracting the Dataset

In [2]:
!unzip jigsaw-toxic-comment-classification-challenge.zip

Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv.zip            
  inflating: test_labels.csv.zip     
  inflating: train.csv.zip           


In [3]:
!unzip sample_submission.csv.zip

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   


In [4]:
!unzip test.csv.zip

Archive:  test.csv.zip
  inflating: test.csv                


In [5]:
!unzip test_labels.csv.zip

Archive:  test_labels.csv.zip
  inflating: test_labels.csv         


In [6]:
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


# Importing and Downloading necessary packages and libraries

In [7]:
import os
import glob
from bs4 import BeautifulSoup
import lxml.html
import re
import numpy as np    
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import bisect
import math
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from gensim.models import Word2Vec
import csv
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Data from csv files
In this section, data is extracted from train.csv and test.csv file using csv.reader and stores in a list where each row contains all the fields of each csv row. 

In [8]:
fields1 = []
train_data = []
  
with open("train.csv", 'r') as csvfile:
  csvreader = csv.reader(csvfile)
  fields1 = next(csvreader)
  for row in csvreader:
    train_data.append(row)

fields2 = []
test_data = []
  
with open("test.csv", 'r') as csvfile:
  csvreader = csv.reader(csvfile)
  fields2 = next(csvreader)
  for row in csvreader:
    test_data.append(row)

# Data Preprocessing
In this section, data preprocessing is done on training data.

In [9]:
files = []
corpus = []

punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer() 
postag = nltk.corpus.wordnet
stop_words = set(stopwords.words('english'))

for row in train_data :
  raw_text = row[1]
  for sym in raw_text : 
    # Removing punctuation
    if sym in punc : 
      raw_text = raw_text.replace(sym, "")
  # Removing non-alphabetic text and Converting text to lower-case
  words = [word.lower() for word in raw_text.split() if word.isalpha()]
  # Removing stopwords
  words = [w for w in words if not w in stop_words]
  # Performing lemmatization
  lemmatized = [lemmatizer.lemmatize(word) for word in words]
  temp_files = []
  temp_files.append(row[0])
  temp_files.append(lemmatized)
  # Storing file data to list
  files.append(temp_files)
  # Appending data to 'files' list
  corpus.append(lemmatized)
  
print("Total Documents: ", len(files))

Total Documents:  159571


# Word2Vec model training using CBOW

In [10]:
vect_size = 100

print("Training with CBOW started...")
# Training Word2Vec model using CBOW 
cbow_model = Word2Vec(corpus, size = vect_size, min_count = 2,window = 10, sg = 0, hs = 1, iter = 5, workers = 10)

print("Training with CBOW done.")

Training with CBOW started...
Training with CBOW done.


# Document Vectors
In this section, document vectors are created in which vector for each word is added in the row and then it is normalised with the length of that document.

In [11]:
print("Creating vectors for Documents using CBOW word2vec ...")
# Calculating weighted average and forming document vectors
count = 0
train_arr = []
for i in corpus:
    demo = np.zeros((vect_size,),dtype = float) # initializing a zero matrix
    for j in i:
        try:
            demo = demo + cbow_model.wv.__getitem__(j) # adding values of each token in doc to vector
        except:
          continue
    count = count + 1
    try:
      train_arr.append(demo/len(i))
    except:
      train_arr.append(demo)
    if count%10000 == 0:
      print("Progress:", count, "/", len(corpus))

print("Vectors created.")

Creating vectors for Documents using CBOW word2vec ...


  


Progress: 10000 / 159571
Progress: 20000 / 159571
Progress: 30000 / 159571
Progress: 40000 / 159571
Progress: 50000 / 159571
Progress: 60000 / 159571
Progress: 70000 / 159571
Progress: 80000 / 159571
Progress: 90000 / 159571
Progress: 100000 / 159571
Progress: 110000 / 159571
Progress: 120000 / 159571
Progress: 130000 / 159571
Progress: 140000 / 159571
Progress: 150000 / 159571
Vectors created.


# Storing training labels
In this section, all the six labels from training data are loaded in separate lists.

In [12]:
train_labels1 = []
train_labels2 = []
train_labels3 = []
train_labels4 = []
train_labels5 = []
train_labels6 = []

for row in train_data : 
  train_labels1.append(row[2])
  train_labels2.append(row[3])
  train_labels3.append(row[4])
  train_labels4.append(row[5])
  train_labels5.append(row[6])
  train_labels6.append(row[7])


y1 = np.array(train_labels1)
y2 = np.array(train_labels2)
y3 = np.array(train_labels3)
y4 = np.array(train_labels4)
y5 = np.array(train_labels5)
y6 = np.array(train_labels6)

# Scaling of training data for proper execution

In [13]:
from sklearn.svm import LinearSVC

X = np.array(train_arr)
X = np.nan_to_num(X)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Implementing SVM for classification using LinearSVC


In [14]:
clf1 = make_pipeline(LinearSVC(random_state=0, tol=1e-5, verbose=True, max_iter=100))
clf1.fit(X_scaled, y1)

clf2 = make_pipeline(LinearSVC(random_state=0, tol=1e-5, verbose=True, max_iter=100))
clf2.fit(X_scaled, y2)

clf3 = make_pipeline(LinearSVC(random_state=0, tol=1e-5, verbose=True, max_iter=100))
clf3.fit(X_scaled, y3)

clf4 = make_pipeline(LinearSVC(random_state=0, tol=1e-5, verbose=True, max_iter=100))
clf4.fit(X_scaled, y4)

clf5 = make_pipeline(LinearSVC(random_state=0, tol=1e-5, verbose=True, max_iter=100))
clf5.fit(X_scaled, y5)

clf6 = make_pipeline(LinearSVC(random_state=0, tol=1e-5, verbose=True, max_iter=100))
clf6.fit(X_scaled, y6)

[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



Pipeline(memory=None,
         steps=[('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=100,
                           multi_class='ovr', penalty='l2', random_state=0,
                           tol=1e-05, verbose=True))],
         verbose=False)

# Accuracy of trained data

In [15]:
from sklearn.metrics import accuracy_score

#Optional training accuracy

predicted = clf1.predict(X_scaled)
print("Training accuracy using sklearn LinearSVC:",round(accuracy_score(y1, predicted)*100,2))

Training accuracy using sklearn LinearSVC: 94.27


# Text preprocessing on test data

In [16]:
files2 = []
corpus2 = []

for row in test_data :
  raw_text = row[1]
  for sym in raw_text : 
    # Removing punctuation
    if sym in punc : 
      raw_text = raw_text.replace(sym, "")
  # Removing non-alphabetic text and Converting text to lower-case
  words = [word.lower() for word in raw_text.split() if word.isalpha()]
  # Removing stopwords
  words = [w for w in words if not w in stop_words]
  # Performing lemmatization
  lemmatized = [lemmatizer.lemmatize(word) for word in words]
  temp_files = []
  temp_files.append(row[0])
  temp_files.append(lemmatized)
  # Storing file data to list
  files2.append(temp_files)
  # Appending data to 'files' list
  corpus2.append(lemmatized)
  
print("Total Documents: ", len(files))

Total Documents:  159571


# Creating vectors of test data

In [17]:
print("Creating vectors for test data using CBOW word2vec ...")
# Calculating weighted average and forming document vectors
count = 0
test_arr = []
for i in corpus2:
    demo = np.zeros((vect_size,),dtype = float) # initializing a zero matrix
    for j in i:
        try:
            demo = demo + cbow_model.wv.__getitem__(j) # adding values of each token in doc to vector
        except:
          continue
    count = count + 1
    try:
      test_arr.append(demo/len(i))
    except:
      test_arr.append(demo)
    if count%10000 == 0:
      print("Progress:", count, "/", len(corpus2))

print("Vectors created.")

Creating vectors for test data using CBOW word2vec ...


  


Progress: 10000 / 153164
Progress: 20000 / 153164
Progress: 30000 / 153164
Progress: 40000 / 153164
Progress: 50000 / 153164
Progress: 60000 / 153164
Progress: 70000 / 153164
Progress: 80000 / 153164
Progress: 90000 / 153164
Progress: 100000 / 153164
Progress: 110000 / 153164
Progress: 120000 / 153164
Progress: 130000 / 153164
Progress: 140000 / 153164
Progress: 150000 / 153164
Vectors created.


# Placing the predicted labels into output file

In [18]:
X = np.array(test_arr)
X = np.nan_to_num(X)
X_scaled = scaler.transform(X)

predicted1 = clf1.predict(X_scaled)
predicted2 = clf2.predict(X_scaled)
predicted3 = clf3.predict(X_scaled)
predicted4 = clf4.predict(X_scaled)
predicted5 = clf5.predict(X_scaled)
predicted6 = clf6.predict(X_scaled)


final_csv = []
header = ["id", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

for i in range(len(predicted1)):
    final_csv.append([files2[i][0], predicted1[i], predicted2[i], predicted3[i], predicted4[i], predicted5[i], predicted6[i]])


import csv

with open('output_w2v.csv','w') as f:
    writer = csv.writer(f)
    writer.writerows(final_csv)


# Loading the target labels from test_labels.csv

In [19]:
test_labels = []

with open("test_labels.csv", 'r') as csvfile:
  csvreader = csv.reader(csvfile)
  fields2 = next(csvreader)
  for row in csvreader:
    test_labels.append(row)

# Removing the rows with -1 value from target and predicted test labels



In [20]:
y_pred1 = []
y_pred2 = []
y_pred3 = []
y_pred4 = []
y_pred5 = []
y_pred6 = []

y_actual1 = []
y_actual2 = []
y_actual3 = []
y_actual4 = []
y_actual5 = []
y_actual6 = []

for i in range(len(predicted1)):
    if(test_labels[i][1] != -1):
        y_pred1.append(predicted1[i])
        y_pred2.append(predicted2[i])
        y_pred3.append(predicted3[i])
        y_pred4.append(predicted4[i])
        y_pred5.append(predicted5[i])
        y_pred6.append(predicted6[i])

        y_actual1.append(test_labels[i][1])
        y_actual2.append(test_labels[i][2])
        y_actual3.append(test_labels[i][3])
        y_actual4.append(test_labels[i][4])
        y_actual5.append(test_labels[i][5])
        y_actual6.append(test_labels[i][6])

# Finding out the Micro and Macro - Precision, Recall and F1Score

In [21]:
m1 = precision_recall_fscore_support(y_actual1, y_pred1, average='micro')
m2 = precision_recall_fscore_support(y_actual2, y_pred2, average='micro')
m3 = precision_recall_fscore_support(y_actual3, y_pred3, average='micro')
m4 = precision_recall_fscore_support(y_actual4, y_pred4, average='micro')
m5 = precision_recall_fscore_support(y_actual5, y_pred5, average='micro')
m6 = precision_recall_fscore_support(y_actual6, y_pred6, average='micro')

print("Label 1 -> Micro-precision:", round(m1[0], 2), "\tRecall:", round(m1[1], 2), "\tFScore:", round(m1[2], 2))
print("Label 2 -> Micro-precision:", round(m2[0], 2), "\tRecall:", round(m2[1], 2), "\tFScore:", round(m2[2], 2))
print("Label 3 -> Micro-precision:", round(m3[0], 2), "\tRecall:", round(m3[1], 2), "\tFScore:", round(m3[2], 2))
print("Label 4 -> Micro-precision:", round(m4[0], 2), "\tRecall:", round(m4[1], 2), "\tFScore:", round(m4[2], 2))
print("Label 5 -> Micro-precision:", round(m5[0], 2), "\tRecall:", round(m5[1], 2), "\tFScore:", round(m5[2], 2))
print("Label 6 -> Micro-precision:", round(m6[0], 2), "\tRecall:", round(m6[1], 2), "\tFScore:", round(m6[2], 2))

print("")

ma1 = precision_recall_fscore_support(y_actual1, y_pred1, average='macro')
ma2 = precision_recall_fscore_support(y_actual2, y_pred2, average='macro')
ma3 = precision_recall_fscore_support(y_actual3, y_pred3, average='macro')
ma4 = precision_recall_fscore_support(y_actual4, y_pred4, average='macro')
ma5 = precision_recall_fscore_support(y_actual5, y_pred5, average='macro')
ma6 = precision_recall_fscore_support(y_actual6, y_pred6, average='macro')

print("Label 1 -> Macro-precision:", round(ma1[0], 2), "\tRecall:", round(ma1[1], 2), "\tFScore:", round(ma1[2], 2))
print("Label 2 -> Macro-precision:", round(ma2[0], 2), "\tRecall:", round(ma2[1], 2), "\tFScore:", round(ma2[2], 2))
print("Label 3 -> Macro-precision:", round(ma3[0], 2), "\tRecall:", round(ma3[1], 2), "\tFScore:", round(ma3[2], 2))
print("Label 4 -> Macro-precision:", round(ma4[0], 2), "\tRecall:", round(ma4[1], 2), "\tFScore:", round(ma4[2], 2))
print("Label 5 -> Macro-precision:", round(ma5[0], 2), "\tRecall:", round(ma5[1], 2), "\tFScore:", round(ma5[2], 2))
print("Label 6 -> Macro-precision:", round(ma6[0], 2), "\tRecall:", round(ma6[1], 2), "\tFScore:", round(ma6[2], 2))

aMiP = (m1[0] + m2[0] + m3[0] + m4[0] + m5[0] + m6[0])/6
aMaP = (ma1[0] + ma2[0] + ma3[0] + ma4[0] + ma5[0] + ma6[0])/6
aMiR = (m1[1] + m2[1] + m3[1] + m4[1] + m5[1] + m6[1])/6
aMaR = (ma1[1] + ma2[1] + ma3[1] + ma4[1] + ma5[1] + ma6[1])/6
aMiF = (m1[2] + m2[2] + m3[2] + m4[2] + m5[2] + m6[2])/6
aMaF = (ma1[2] + ma2[2] + ma3[2] + ma4[2] + ma5[2] + ma6[2])/6

print("")

print("Average Micro Precision:",round(aMiP,2))
print("Average Macro Precision:",round(aMaP,2))

print("Average Micro Recall:",round(aMiR,2))
print("Average Micro Recall:",round(aMiR,2))
print("Average Macro Recall:",round(aMaR,2))

print("Average Micro FScore:",round(aMiF,2))
print("Average Macro FScore:",round(aMaF,2))

Label 1 -> Micro-precision: 0.38 	Recall: 0.38 	FScore: 0.38
Label 2 -> Micro-precision: 0.41 	Recall: 0.41 	FScore: 0.41
Label 3 -> Micro-precision: 0.4 	Recall: 0.4 	FScore: 0.4
Label 4 -> Micro-precision: 0.42 	Recall: 0.42 	FScore: 0.42
Label 5 -> Micro-precision: 0.4 	Recall: 0.4 	FScore: 0.4
Label 6 -> Micro-precision: 0.41 	Recall: 0.41 	FScore: 0.41



  _warn_prf(average, modifier, msg_start, len(result))


Label 1 -> Macro-precision: 0.2 	Recall: 0.55 	FScore: 0.28
Label 2 -> Macro-precision: 0.15 	Recall: 0.36 	FScore: 0.21
Label 3 -> Macro-precision: 0.19 	Recall: 0.51 	FScore: 0.28
Label 4 -> Macro-precision: 0.15 	Recall: 0.39 	FScore: 0.22
Label 5 -> Macro-precision: 0.18 	Recall: 0.47 	FScore: 0.26
Label 6 -> Macro-precision: 0.18 	Recall: 0.34 	FScore: 0.21

Average Micro Precision: 0.4
Average Macro Precision: 0.18
Average Micro Recall: 0.4
Average Micro Recall: 0.4
Average Macro Recall: 0.44
Average Micro FScore: 0.4
Average Macro FScore: 0.24
