# Data Import

In [10]:
! pip install -q kaggle

In [11]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sid200026","key":"2be1f3190f0c258ddbf019aa2783f167"}'}

In [12]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [13]:
! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading train.csv.zip to /content
 34% 9.00M/26.3M [00:00<00:00, 58.8MB/s]
100% 26.3M/26.3M [00:00<00:00, 104MB/s] 
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 98.5MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 93.6MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:00<00:00, 31.7MB/s]
100% 23.4M/23.4M [00:00<00:00, 59.1MB/s]


In [14]:
! mkdir dataset

In [15]:
! unzip train.csv.zip -d dataset

Archive:  train.csv.zip
  inflating: dataset/train.csv       


# Data Analysis

In [16]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
data = pd.read_csv('dataset/train.csv')
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [18]:
categories = data.columns[2:]
categories

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [19]:
for category in categories:
  print(f'{category}\n{data.loc[:,category].value_counts()}')

toxic
0    144277
1     15294
Name: toxic, dtype: int64
severe_toxic
0    157976
1      1595
Name: severe_toxic, dtype: int64
obscene
0    151122
1      8449
Name: obscene, dtype: int64
threat
0    159093
1       478
Name: threat, dtype: int64
insult
0    151694
1      7877
Name: insult, dtype: int64
identity_hate
0    158166
1      1405
Name: identity_hate, dtype: int64


In [20]:
data.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

# Preprocessing

In [21]:
def to_lower(text):
  return text.lower()

to_lower("TeSt Mssg")

'test mssg'

In [22]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

remove_urls('Sid www.github.com/Sid200026')

'Sid '

In [23]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

remove_html("""
<h1>Hi</h1>
<p>Message</p>
""")

'\nHi\nMessage\n'

In [24]:
import re

def remove_abbreviation(text):
    text = re.sub("\n"," ",text)
    text = re.sub("\[.*\]"," ",text)
    text = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ",text)
    text = re.sub(r"\?"," ",text)
    text = re.sub("don't","do not",text)
    text = re.sub("doesn't", "does not",text)
    text = re.sub("didn't", "did not",text)
    text = re.sub("hasn't", "has not",text)
    text = re.sub("haven't", "have not",text)
    text = re.sub("hadn't", "had not",text)
    text = re.sub("won't", "will not",text)
    text = re.sub("wouldn't", "would not",text)
    text = re.sub("can't", "can not",text)
    text = re.sub("cannot", "can not",text)
    text = re.sub("i'm", "i am",text)
    text = re.sub("i'll", "i will",text)
    text = re.sub("its", "it is",text)
    text = re.sub("it's", "it is",text)
    text = re.sub("that's", "that is",text)
    text = re.sub("weren't", "were not",text)
    text = re.sub("i'd","i would",text)
    text = re.sub("i've","i have",text)
    text = re.sub("she'd","she would",text)
    text = re.sub("they'll","they will",text)
    text = re.sub("they're","they are",text)
    text = re.sub("we'd","we would",text)
    text = re.sub("we'll","we will",text)
    text = re.sub("we've","we have",text)
    text = re.sub("it'll","it will",text)
    text = re.sub("there's","there is",text)
    text = re.sub("where's","where is",text)
    text = re.sub("they're","they are",text)
    text = re.sub("let's","let us",text)
    text = re.sub("couldn't","could not",text)
    text = re.sub("shouldn't","should not",text)
    text = re.sub("wasn't","was not",text)
    text = re.sub("could've","could have",text)
    text = re.sub("might've","might have",text)
    text = re.sub("must've","must have",text)
    text = re.sub("should've","should have",text)
    text = re.sub("would've","would have",text)
    text = re.sub("who's","who is",text)
    text = re.sub("\bim\b", "i am",text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub("\d+", "", text)
    return text

remove_abbreviation("I'm a great guy weren't")

'Im a great guy were not'

In [25]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

remove_punctuation('This is a, map.')

'This is a map'

In [26]:
from  spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords])

remove_stopwords('Nice is a picture')

'Nice picture'

In [27]:
data = data.drop(columns=['id'])
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [29]:
preprocessing_pipeline = [
                          to_lower,
                          remove_urls,
                          remove_html,
                          remove_abbreviation,
                          remove_punctuation,
                          remove_stopwords
]

for preprocessor in preprocessing_pipeline:
  data['comment_text'] = data.loc[:,'comment_text'].apply(lambda text : preprocessor(text))
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edit username hardcore metallica f...,0,0,0,0,0,0
1,daww matches background colour seemingly stuck...,0,0,0,0,0,0
2,hey man trying edit war guy constantly removin...,0,0,0,0,0,0
3,real suggestions improvement wondered section ...,0,0,0,0,0,0
4,sir hero chance remember page,0,0,0,0,0,0


In [34]:
data.to_csv('processed_train.csv')