In [1]:
import os

import numpy as np      # Importing Packages and libraries...
import pandas as pd
import spacy
import nltk
from textblob import TextBlob
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\VISHU\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Load the spaCy "en_core_web_lg" pipeline and bind it to `nlp`.
# NOTE(review): `nlp` is not referenced by any of the cells visible in this notebook section —
# confirm it is used further down before keeping this load.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Location of the training CSV. The original absolute path is kept as the default so the
# notebook behaves as before on the author's machine; set the TRAIN_CSV_PATH environment
# variable to run it elsewhere without editing the cell.
train_csv_path = os.environ.get("TRAIN_CSV_PATH", "E:/downloads/train.csv")
dataset = pd.read_csv(train_csv_path)  # Retrieving the dataset...

In [4]:
# Preview 5 random rows of the raw dataset. A fixed random_state makes the preview
# reproducible across "Restart Kernel -> Run All" instead of changing on every run.
dataset.sample(5, random_state=42)

Unnamed: 0,id,comment_text,malignant,highly_malignant,rude,threat,abuse,loathe
14835,2736156922c31170,"""You are trying to make something legal by say...",0,0,0,0,0,0
74822,c82705e62d175fa2,"""\nSure. Definitely get some of what content ...",0,0,0,0,0,0
57278,9927e9a13519714a,blessed holidays and new year 2011. Yes.,0,0,0,0,0,0
10679,1c3c7f22e23a92d2,If I am not mistaken the edit consisted of del...,0,0,0,0,0,0
53192,8e328798f6397fcc,"""\n\nDon't worry about the above too much - bu...",0,0,0,0,0,0


In [5]:
# Dimensions of the raw frame as a (rows, columns) tuple.
dataset.shape

(159571, 8)

In [6]:
# Build the working frame with the columns required by the project guidelines.
# The corpus-name column is sized from the loaded dataset rather than a hard-coded row
# count, so the cell keeps working if the CSV (or a sampled subset of it) changes length.
name = ['Malignant_Comment_Classification'] * len(dataset)
data = pd.DataFrame(name, columns=['corpus_name'])
data['raw_sentence'] = dataset['comment_text']   # original, untouched comment text
data['clean_sentence_training'] = ""             # filled by the lemmatisation cell
data['tokenized'] = ""                           # filled by the stop-word removal cell
data['clean_sentence_EDA'] = ""                  # filled by the emoji removal cell

In [7]:
# Collect the names of all columns that contain at least one null value.
has_null = data.isnull().any()          # boolean Series indexed by column name
lst = has_null[has_null].index.tolist()
print(lst)     # An empty list means no column holds a null entry...

[]


In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words("english"))    # Set of English stop words, for O(1) membership tests...


def _strip_stopwords(sentence):
    """Tokenize ``sentence`` and return the tokens not found in STOPWORDS.

    Every kept token is followed by a single space (so a non-empty result ends with a
    trailing space), which is exactly the string the original row-by-row loop built.

    NOTE(review): the membership test is case-sensitive, so capitalised tokens such as
    "He" or "It" are kept (visible in the rendered preview) — confirm whether that is
    intended before lower-casing here.
    """
    return "".join(
        word + " " for word in word_tokenize(sentence) if word not in STOPWORDS
    )


# Assign the whole column at once. The previous `data['tokenized'][i] = ...` form is chained
# assignment: it triggers SettingWithCopyWarning and does not update `data` at all when
# pandas Copy-on-Write is enabled.
data['tokenized'] = data['raw_sentence'].map(_strip_stopwords)
data.head(5)

Unnamed: 0,corpus_name,raw_sentence,clean_sentence_training,tokenized,clean_sentence_EDA
0,Malignant_Comment_Classification,Explanation\nWhy the edits made under my usern...,,Explanation Why edits made username Hardcore M...,
1,Malignant_Comment_Classification,D'aww! He matches this background colour I'm s...,,D'aww ! He matches background colour I 'm seem...,
2,Malignant_Comment_Classification,"Hey man, I'm really not trying to edit war. It...",,"Hey man , I 'm really trying edit war . It 's ...",
3,Malignant_Comment_Classification,"""\nMore\nI can't make any real suggestions on ...",,`` More I ca n't make real suggestions improve...,
4,Malignant_Comment_Classification,"You, sir, are my hero. Any chance you remember...",,"You , sir , hero . Any chance remember page 's ?",


In [9]:
def _lemmatize(sentence):
    """Return the words of ``sentence`` lemmatized with TextBlob and joined by single spaces.

    ``TextBlob(...).words`` is used as the word list, so the result contains only what
    TextBlob reports as words (the rendered preview shows punctuation tokens dropped).
    """
    return " ".join(word.lemmatize() for word in TextBlob(sentence).words)


# Input is the stop-word-free text; the column is assigned in one step instead of the
# chained `data[col][i] = ...` write, which warns under classic pandas and silently does
# nothing under Copy-on-Write.
data['clean_sentence_training'] = data['tokenized'].map(_lemmatize)
data.head(5)

Unnamed: 0,corpus_name,raw_sentence,clean_sentence_training,tokenized,clean_sentence_EDA
0,Malignant_Comment_Classification,Explanation\nWhy the edits made under my usern...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...,
1,Malignant_Comment_Classification,D'aww! He matches this background colour I'm s...,D'aww He match background colour I 'm seemingl...,D'aww ! He matches background colour I 'm seem...,
2,Malignant_Comment_Classification,"Hey man, I'm really not trying to edit war. It...",Hey man I 'm really trying edit war It 's guy ...,"Hey man , I 'm really trying edit war . It 's ...",
3,Malignant_Comment_Classification,"""\nMore\nI can't make any real suggestions on ...",More I ca n't make real suggestion improvement...,`` More I ca n't make real suggestions improve...,
4,Malignant_Comment_Classification,"You, sir, are my hero. Any chance you remember...",You sir hero Any chance remember page 's,"You , sir , hero . Any chance remember page 's ?",


In [10]:
import re
import pickle
import emot.emo_unicode       # class to remove the emoji...
# Code points treated as emoji. The former single range U+24C2..U+1F251 spanned most of
# the Basic Multilingual Plane (CJK ideographs, Hangul, kana, ...) and therefore deleted
# ordinary non-Latin text; it is split here into the blocks it was meant to cover.
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with every run of emoji code points removed.

    Only characters matched by ``_EMOJI_PATTERN`` are deleted; all other text,
    including non-Latin scripts and surrounding whitespace, is returned unchanged.
    """
    return _EMOJI_PATTERN.sub("", string)

# Fill the EDA column with the lemmatized text stripped of emoji. The column is assigned in
# one step: the earlier per-row `data[col][i] = ...` write is chained assignment, which
# warns under classic pandas and leaves `data` unchanged under Copy-on-Write.
data['clean_sentence_EDA'] = data['clean_sentence_training'].map(remove_emoji)
data.head(5)

Unnamed: 0,corpus_name,raw_sentence,clean_sentence_training,tokenized,clean_sentence_EDA
0,Malignant_Comment_Classification,Explanation\nWhy the edits made under my usern...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...
1,Malignant_Comment_Classification,D'aww! He matches this background colour I'm s...,D'aww He match background colour I 'm seemingl...,D'aww ! He matches background colour I 'm seem...,D'aww He match background colour I 'm seemingl...
2,Malignant_Comment_Classification,"Hey man, I'm really not trying to edit war. It...",Hey man I 'm really trying edit war It 's guy ...,"Hey man , I 'm really trying edit war . It 's ...",Hey man I 'm really trying edit war It 's guy ...
3,Malignant_Comment_Classification,"""\nMore\nI can't make any real suggestions on ...",More I ca n't make real suggestion improvement...,`` More I ca n't make real suggestions improve...,More I ca n't make real suggestion improvement...
4,Malignant_Comment_Classification,"You, sir, are my hero. Any chance you remember...",You sir hero Any chance remember page 's,"You , sir , hero . Any chance remember page 's ?",You sir hero Any chance remember page 's


In [19]:
import textblob     # Using textblob for sentiment analysis and thus, extracting the labels...


def _polarity_to_label(score):
    """Map a TextBlob polarity score to a label: 0 if >= 0, 1 if in [-0.5, 0), 2 if < -0.5."""
    if score >= 0:
        return 0
    return 1 if score >= -0.5 else 2


# One label per row, in row order; the list is attached to the frame in a later cell.
lst = [
    _polarity_to_label(textblob.TextBlob(text).polarity)
    for text in data['clean_sentence_training']
]
data.head(5)

Unnamed: 0,corpus_name,raw_sentence,clean_sentence_training,tokenized,clean_sentence_EDA
0,Malignant_Comment_Classification,Explanation\nWhy the edits made under my usern...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...
1,Malignant_Comment_Classification,D'aww! He matches this background colour I'm s...,D'aww He match background colour I 'm seemingl...,D'aww ! He matches background colour I 'm seem...,D'aww He match background colour I 'm seemingl...
2,Malignant_Comment_Classification,"Hey man, I'm really not trying to edit war. It...",Hey man I 'm really trying edit war It 's guy ...,"Hey man , I 'm really trying edit war . It 's ...",Hey man I 'm really trying edit war It 's guy ...
3,Malignant_Comment_Classification,"""\nMore\nI can't make any real suggestions on ...",More I ca n't make real suggestion improvement...,`` More I ca n't make real suggestions improve...,More I ca n't make real suggestion improvement...
4,Malignant_Comment_Classification,"You, sir, are my hero. Any chance you remember...",You sir hero Any chance remember page 's,"You , sir , hero . Any chance remember page 's ?",You sir hero Any chance remember page 's


In [21]:
# Attach the sentiment-derived labels (one per row, built in the previous cell) to the frame.
data['label'] = lst
# The recorded output of this cell is a preview of the frame, which an assignment alone
# cannot produce; display it explicitly so a fresh run reproduces that output.
data.head(5)

Unnamed: 0,corpus_name,raw_sentence,clean_sentence_training,tokenized,clean_sentence_EDA,label
0,Malignant_Comment_Classification,Explanation\nWhy the edits made under my usern...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...,Explanation Why edits made username Hardcore M...,0
1,Malignant_Comment_Classification,D'aww! He matches this background colour I'm s...,D'aww He match background colour I 'm seemingl...,D'aww ! He matches background colour I 'm seem...,D'aww He match background colour I 'm seemingl...,0
2,Malignant_Comment_Classification,"Hey man, I'm really not trying to edit war. It...",Hey man I 'm really trying edit war It 's guy ...,"Hey man , I 'm really trying edit war . It 's ...",Hey man I 'm really trying edit war It 's guy ...,0
3,Malignant_Comment_Classification,"""\nMore\nI can't make any real suggestions on ...",More I ca n't make real suggestion improvement...,`` More I ca n't make real suggestions improve...,More I ca n't make real suggestion improvement...,0
4,Malignant_Comment_Classification,"You, sir, are my hero. Any chance you remember...",You sir hero Any chance remember page 's,"You , sir , hero . Any chance remember page 's ?",You sir hero Any chance remember page 's,0


In [28]:
# Show the distinct label values. As a bare expression in the middle of the cell the result
# was computed and then discarded, because only a cell's last expression is displayed.
print(data['label'].unique())
# Drop the intermediate tokenized column. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
data = data.drop(columns="tokenized", errors="ignore")
data.columns

Index(['corpus_name', 'raw_sentence', 'clean_sentence_training',
       'clean_sentence_EDA', 'label'],
      dtype='object')

In [25]:
# Export step (currently disabled): uncomment the next line to write the processed frame to disk.
# data.to_csv("ProcessedI.csv", index=False)