# Import the dataset

In [95]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

import re
import gc

In [72]:
tqdm.pandas()

In [34]:
! python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [35]:
! python -m spacy link en_core_web_lg en --force

[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [36]:
! pip install kaggle



In [37]:
from google.colab import files
files.upload()

{}

In [38]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [39]:
! chmod 600 ~/.kaggle/kaggle.json

In [40]:
! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 46.7MB/s]
Downloading train.csv.zip to /content
 84% 22.0M/26.3M [00:00<00:00, 65.9MB/s]
100% 26.3M/26.3M [00:00<00:00, 104MB/s] 
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 144MB/s]
Downloading test.csv.zip to /content
 64% 15.0M/23.4M [00:00<00:00, 48.2MB/s]
100% 23.4M/23.4M [00:00<00:00, 77.8MB/s]


In [41]:
! mkdir dataset

mkdir: cannot create directory ‘dataset’: File exists


In [42]:
! unzip test.csv.zip -d dataset

Archive:  test.csv.zip
replace dataset/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset/test.csv        


In [43]:
! unzip train.csv.zip -d dataset

Archive:  train.csv.zip
replace dataset/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset/train.csv       


# Preprocessing

In [44]:
data = pd.read_csv('dataset/train.csv', dtype={'comment_text':'string'})
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [45]:
test_data = pd.read_csv('dataset/test.csv', dtype={'comment_text':'string'})
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,"== From RfC == The title is fine as it is, ..."
2,00013b17ad220c46,""" == Sources == * Zawe Ashton on Lapland..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [46]:
data = data.drop(columns='id')
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [47]:
ids = test_data.iloc[:,0]
ids

0         00001cee341fdb12
1         0000247867823ef7
2         00013b17ad220c46
3         00017563c3f7919a
4         00017695ad8997eb
                ...       
153159    fffcd0960ee309b5
153160    fffd7a9a6eb32c16
153161    fffda9e8d6fafa9e
153162    fffe8f1340a79fc2
153163    ffffce3fb183ee80
Name: id, Length: 153164, dtype: object

In [48]:
test_data = test_data.drop(columns='id')
test_data.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,"== From RfC == The title is fine as it is, ..."
2,""" == Sources == * Zawe Ashton on Lapland..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


In [49]:
def to_lower(text):
  return text.lower()

In [50]:
def remove_abbreviation(text):
    text = re.sub("^ *","", text)
    text = re.sub("\n"," ",text)
    text = re.sub(' {2,}', ' ', text)
    text = re.sub("\[.*\]"," ",text)
    text = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ",text)
    text = re.sub(r"\?"," ",text)
    text = re.sub("don'?t","do not",text)
    text = re.sub("doesn'?t", "does not",text)
    text = re.sub("didn'?t", "did not",text)
    text = re.sub("hasn'?t", "has not",text)
    text = re.sub("haven'?t", "have not",text)
    text = re.sub("hadn'?t", "had not",text)
    text = re.sub("won'?t", "will not",text)
    text = re.sub("wouldn'?t", "would not",text)
    text = re.sub("can'?t", "can not",text)
    text = re.sub("cannot", "can not",text)
    text = re.sub("i'?m", "i am",text)
    text = re.sub("i'?ll", "i will",text)
    text = re.sub("it'?s", "it is",text)
    text = re.sub("that'?s", "that is",text)
    text = re.sub("weren'?t", "were not",text)
    text = re.sub("i'?d","i would",text)
    text = re.sub("i'?ve","i have",text)
    text = re.sub("she'?d","she would",text)
    text = re.sub("they'?ll","they will",text)
    text = re.sub("they'?re","they are",text)
    text = re.sub("we'?d","we would",text)
    text = re.sub("we'?ll","we will",text)
    text = re.sub("we'?ve","we have",text)
    text = re.sub("it'?ll","it will",text)
    text = re.sub("there'?s","there is",text)
    text = re.sub("where'?s","where is",text)
    text = re.sub("they'?re","they are",text)
    text = re.sub("let'?s","let us",text)
    text = re.sub("couldn'?t","could not",text)
    text = re.sub("shouldn'?t","should not",text)
    text = re.sub("wasn'?t","was not",text)
    text = re.sub("could'?ve","could have",text)
    text = re.sub("might'?ve","might have",text)
    text = re.sub("must'?ve","must have",text)
    text = re.sub("should'?ve","should have",text)
    text = re.sub("would'?ve","would have",text)
    text = re.sub("who'?s","who is",text)
    text = re.sub("you'?re", "you are", text)
    text = re.sub("y'?all", "you all", text)
    text = re.sub("'d've"," would have", text)
    text = re.sub("'d"," would", text)
    text = re.sub("'re"," are", text)
    text = re.sub("'ve"," have", text)
    text = re.sub("\bim\b", "i am",text)
    text = re.sub("[^a-zA-Z .,]+", "", text)
    return text

In [51]:
def remove_url(text):
  text = re.sub(r"\b(?:(?:https|ftp|http|www)://)?\w[\w-]*(?:\.[\w-]+)+\S*", '', text, flags=re.MULTILINE)
  return text

In [52]:
def preprocessing_pipeline(text):
  text = to_lower(text)
  text = remove_url(text)
  text = remove_abbreviation(text)
  return text

data['comment_text'] = data.loc[:,'comment_text'].apply(lambda text : preprocessing_pipeline(text))

In [53]:
test_data['comment_text'] = test_data.loc[:,'comment_text'].apply(lambda text : preprocessing_pipeline(text))

In [54]:
data.shape

(159571, 7)

In [55]:
test_data.shape

(153164, 1)

In [56]:
del preprocessing_pipeline
del to_lower
del remove_url
del remove_abbreviation

gc.collect()

282

# Word Embedding

In [57]:
nlp = spacy.load('en')
doc = nlp(u"Chicken and Mutton")
doc.vector

array([-3.49166662e-01, -1.01247333e-01,  3.53286654e-01,  6.50443360e-02,
        2.39446655e-01,  5.97852647e-01, -1.07333565e-03, -7.28713349e-02,
        3.91490340e-01,  1.25222635e+00, -2.64506668e-01,  3.16064000e-01,
       -3.62883329e-01, -1.88919976e-01,  2.13089958e-02, -1.69803336e-01,
       -3.91406566e-03,  5.98036706e-01, -1.75683334e-01,  3.39501023e-01,
       -1.22946002e-01,  6.92796707e-02, -2.08379015e-01, -9.66816768e-02,
        7.42769986e-02, -4.27128673e-01, -3.50886673e-01,  8.73756707e-02,
       -4.52162437e-02, -4.01996702e-01, -9.16823372e-02,  1.54026002e-01,
        2.10853338e-01, -3.14760000e-01,  2.60806680e-01, -1.28430009e-01,
       -3.83560061e-02,  2.03157321e-01, -1.96409002e-02,  6.63482368e-01,
       -2.13727236e-01, -9.52803269e-02,  1.26594007e-01, -1.53995201e-01,
        8.36036578e-02,  2.16826662e-01, -1.70766618e-02,  1.13676667e-01,
        7.00843334e-02,  3.70599985e-01,  2.42653012e-01,  5.42867184e-03,
        3.38386707e-02, -

In [70]:
X = np.zeros((data.shape[0], 300), dtype=np.float32)
counter = 0

In [73]:
def word_embedding(text):
  global counter
  X[counter,:] = nlp(text).vector
  counter = counter + 1

data.loc[:,'comment_text'].progress_apply(lambda text : word_embedding(text))

100%|██████████| 159571/159571 [55:40<00:00, 47.76it/s]


0         None
1         None
2         None
3         None
4         None
          ... 
159566    None
159567    None
159568    None
159569    None
159570    None
Name: comment_text, Length: 159571, dtype: object

In [74]:
X

array([[ 0.03448813,  0.09872174, -0.17731757, ..., -0.01966973,
        -0.00583382,  0.12877746],
       [-0.00617565,  0.2648892 , -0.0425774 , ...,  0.1009116 ,
        -0.1026927 ,  0.1888336 ],
       [-0.06958876,  0.17466746, -0.21078135, ..., -0.00699711,
        -0.02979095,  0.15146053],
       ...,
       [-0.03069575,  0.24811323, -0.11504405, ..., -0.10391007,
        -0.13967338,  0.09451273],
       [-0.00345617,  0.154048  , -0.20982797, ..., -0.05986919,
         0.06772245,  0.19867161],
       [-0.03142541,  0.21293637, -0.2588035 , ..., -0.00225536,
         0.04896529,  0.12037323]], dtype=float32)

In [75]:
X.shape

(159571, 300)

In [76]:
Test = np.zeros((test_data.shape[0], 300), dtype=np.float32)
counter = 0

In [77]:
def word_embedding_test(text):
  global counter
  Test[counter,:] = nlp(text).vector
  counter = counter + 1

test_data.loc[:,'comment_text'].progress_apply(lambda text : word_embedding_test(text))

100%|██████████| 153164/153164 [50:18<00:00, 50.75it/s]


0         None
1         None
2         None
3         None
4         None
          ... 
153159    None
153160    None
153161    None
153162    None
153163    None
Name: comment_text, Length: 153164, dtype: object

In [84]:
Test.shape

(153164, 300)

In [85]:
Test

array([[-0.12088359,  0.07093609, -0.17736606, ..., -0.01424188,
         0.04577019,  0.11794006],
       [ 0.01906666,  0.15359667, -0.14850616, ..., -0.1201122 ,
        -0.1764278 ,  0.0737977 ],
       [-0.0118145 , -0.03195562, -0.05409718, ..., -0.0575075 ,
        -0.02722887,  0.15392739],
       ...,
       [-0.01106421,  0.11376979, -0.0664166 , ..., -0.05974143,
        -0.00114315,  0.05005007],
       [ 0.00324723,  0.11186893, -0.12036144, ..., -0.05733033,
        -0.01482491,  0.10719597],
       [-0.09983765,  0.24917597, -0.22282346, ..., -0.01183287,
        -0.03641475,  0.12520471]], dtype=float32)

In [86]:
Y = data.iloc[:,1:].values
Y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [87]:
categories = data.iloc[:,1:].columns
categories

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [88]:
del data
del test_data

gc.collect()

576

# Decision Trees

In [97]:
scores = 0

# alpha_values = [0.9999999999999999, 0.25, 0.85, 0.1, 0.7, 0.25]

for index,category in enumerate(categories):
    decision_tree = DecisionTreeClassifier(max_depth=300)
    score = np.mean(cross_val_score(decision_tree, X, Y[:,index], cv=3, scoring='roc_auc'))
    scores += score
    print(f"{category} : {score}")

print("\nAverage Score : {:.5f}".format(scores/6))

toxic : 0.7304181645729786
severe_toxic : 0.6251566305644697
obscene : 0.7444485529372136
threat : 0.5586508555752782
insult : 0.7175446262217874
identity_hate : 0.5936014313934518

Average Score : 0.66164
