# Toxic Comment Classification

## Importing Libraries

In [None]:
# Install the `contractions` package, which decontraction() uses during
# pre-processing.
# NOTE(review): the version is not pinned; the recorded output below shows
# contractions 0.0.49 being installed — consider pinning it so that re-runs
# resolve the same release.
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/11/4d/378ab91284c2c3a06ab475b287721c09b7951d5ecb3edf4ffb0e1e7a568a/contractions-0.0.49-py2.py3-none-any.whl
Collecting textsearch>=0.0.21
  Downloading https://files.pythonhosted.org/packages/d3/fe/021d7d76961b5ceb9f8d022c4138461d83beff36c3938dc424586085e559/textsearch-0.0.21-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/7f/c2/eae730037ae1cbbfaa229d27030d1d5e34a1e41114b21447d1202ae9c220/pyahocorasick-1.4.2.tar.gz (321kB)
[K     |████████████████████████████████| 327kB 6.7MB/s 
[?25hCollecting anyascii
[?25l  Downloading https://files.pythonhosted.org/packages/a3/14/666cd44bf53f36a961544af592cb5c5c800013f9c51a4745af8d7c17362a/anyascii-0.2.0-py3-none-any.whl (283kB)
[K     |████████████████████████████████| 286kB 37.8MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone


In [None]:
#mandatory libraries
import numpy as np
import pandas as pd
import scipy

#nltk-preprocessing
import string
import nltk
import contractions
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.stem.wordnet import WordNetLemmatizer


#misc
import re
import pickle
import joblib
import warnings
warnings.filterwarnings("ignore")
from collections.abc import Iterable

#metrics
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc,roc_auc_score

#model loading
from tensorflow.keras.models import load_model

In [None]:
# Supporting downloads for the NLTK library, needed for
# tokenizing / tagging / chunking / stop words / lemmatizing.
# Each call fetches one resource into the local nltk_data directory; the
# value of the last call is what the cell displays as its output.

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/gdrive (Colab-specific); the artifact and
# model paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Pre-Processing Functions

In [None]:
def convert_to_lower_case(text):
    """Return a lower-cased copy of ``text``.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with all cased characters converted to lower case.
    """
    lowered = text.lower()
    return lowered

In [None]:
def remove_escape_char(text):
    r"""Replace newlines (\n), tabs (\t) and slashes (/ and \) with a space.

    The docstring is a raw string so that the escape sequences above are shown
    literally instead of being interpreted (a non-raw ``\)`` is an invalid
    escape sequence and triggers a warning on recent Python versions).

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every newline, tab, forward slash and backslash
        replaced by a single space (one space per removed character).
    """
    # No anchors are used, so no regex flags are needed.
    return re.sub(r"[\n\t\\/]", " ", text)

In [None]:
def remove_html_tags(text):
    """Replace html tags (``< >``) and what they enclose with a space.

    On each line, the span running from the first ``<`` to the last ``>`` is
    replaced by one space; lines without such a pair are left untouched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with the matched spans blanked out.
    """
    # NOTE(review): ``.*`` is greedy, so ordinary text sitting between two
    # separate tags on the same line is removed as well. Kept as-is so the
    # output stays identical to what the downstream model was fed so far.
    tag_span = re.compile(r"<.*>", flags=re.MULTILINE)
    return tag_span.sub(" ", text)

In [None]:
def remove_links(text):
    """Replace bare links (not wrapped in html tags) with a space.

    Two passes are made, in this order: first every run of non-whitespace
    characters starting with ``http``, then every run starting with ``www``.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with each matched link replaced by one space.
    """
    # The order matters: the second pass sees the result of the first one.
    for link_pattern in (r"http\S+", r"www\S+"):
        text = re.sub(link_pattern, " ", text, flags=re.MULTILINE)
    return text

In [None]:
def remove_digits(text):
    """Replace every digit in ``text`` with a space.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with one space substituted for each digit character.
    """
    digit = re.compile(r'\d', flags=re.MULTILINE)
    return digit.sub(" ", text)

In [None]:
def remove_punctuation(text):
    """Replace every ASCII punctuation mark in ``text`` with a space.

    The set of punctuation marks is ``string.punctuation``.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with one space substituted for each punctuation character.
    """
    # One translation table mapping each punctuation character to a space.
    to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    return text.translate(to_space)

In [None]:
def chuncking(text):
    """Shallow-parse ``text`` with NLTK and rewrite its named entities.

    The text is tokenised, POS-tagged and passed to ``ne_chunk``. The chunks
    it returns are then handled as follows:

    * ``GPE`` chunks (geographic names) with more than one token: the tokens
      are joined with underscores (e.g. ``New_York``); every occurrence of the
      second token is replaced by the joined form and stand-alone occurrences
      of the first token are replaced by a space.
    * ``PERSON`` chunks: every occurrence of each token is replaced by a
      space.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The rewritten text.
    """
    for chunk in ne_chunk(pos_tag(word_tokenize(text))):
        if not isinstance(chunk, Tree):
            # Tokens outside any named entity are not Tree nodes.
            continue

        entity_label = chunk.label()
        leaves = chunk.leaves()

        if entity_label == "GPE":
            if len(leaves) > 1:
                gpe = "_".join(term for term, _ in leaves)
                # Tokens are literal text, so they are escaped before being
                # used as patterns, and the replacement is supplied through a
                # function so it is never parsed as a regex template.
                # NOTE(review): only the first two tokens are rewritten; a GPE
                # with three or more tokens keeps its remaining tokens in the
                # text. Left unchanged to keep the produced features stable.
                text = re.sub(re.escape(leaves[1][0]), lambda _match: gpe, text)
                text = re.sub(rf'\b{re.escape(leaves[0][0])}\b', " ", text)
        elif entity_label == "PERSON":
            for term, _ in leaves:
                # No word boundaries: the token is removed wherever it occurs,
                # including inside longer words.
                text = re.sub(re.escape(term), " ", text)
    return text

In [None]:
def keep_alpha_and_underscore(text):
    """Keep only ASCII letters and underscores; blank out everything else.

    Underscores are preserved because the chunking step joins the words of
    geographic names with ``_``.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with every other character (including existing whitespace)
        replaced by a single space.
    """
    disallowed = re.compile(r"[^a-zA-Z_]", flags=re.MULTILINE)
    return disallowed.sub(" ", text)

In [None]:
def remove_extra_spaces_if_any(text):
    """Collapse every run of two or more spaces into a single space.

    Only the space character is considered; tabs and newlines are untouched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with runs of spaces squeezed to one space.
    """
    space_run = re.compile(r" {2,}", flags=re.MULTILINE)
    return space_run.sub(" ", text)

In [None]:
def remove_repeated_characters(text):
    """Squeeze runs of three or more identical word characters to one.

    For example ``CAAAAASSSSSSEEEEE SSSSTTTTTUUUUUUDDDDYYYYYY`` gives
    ``CASE STUDY``. Runs of exactly two characters (as in ``book``) are left
    alone.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with each long run reduced to a single character.
    """
    long_run = re.compile(r"(\w)(\1{2,})", flags=re.MULTILINE)
    return long_run.sub(r"\1", text)


In [None]:
def remove_words_lesth2(text):
    """Replace every word of one or two characters with a space.

    A "word" here is a run of word characters delimited by word boundaries,
    so both single-character and two-character words are blanked out.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with each short word replaced by a single space.
    """
    short_word = re.compile(r'\b\w{1,2}\b')
    return short_word.sub(" ", text)

In [None]:
def decontraction(text):
    """Expand contractions in ``text`` word by word.

    Each whitespace-separated word is passed to ``contractions.fix``. If that
    call raises, the lower-cased word is tried instead. When the result is the
    same as the original word apart from letter case, the original word is
    kept; otherwise the result replaces it.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The processed words joined by single spaces, with surrounding
        whitespace stripped.
    """

    def _expand(word):
        """Run ``contractions.fix`` on ``word``, retrying in lower case."""
        try:
            return contractions.fix(word)
        except Exception:
            # Retry on the lower-cased word; unlike a bare ``except`` this
            # does not swallow KeyboardInterrupt / SystemExit.
            return contractions.fix(word.lower())

    pieces = []
    for word in text.split():
        expanded = _expand(word)
        # Compare by value: ``is`` tests object identity, which is almost
        # never true for two freshly built strings.
        if expanded.lower() == word.lower():
            pieces.append(word)
        else:
            pieces.append(expanded)
    return " ".join(pieces).strip()

In [None]:
# Stop words are collected from three sources: the NLTK English list, the
# WordCloud STOPWORDS set, and a hand-written list of extra words.

stop_words = stopwords.words('english')
word_cloud_stp_wrds = list(STOPWORDS)

# Library stop words first (de-duplicated through a set union), followed by
# the custom additions in their original order.
final_stop_words = [
    *STOPWORDS.union(set(stop_words)),
    # titles
    "mr", "mrs", "miss",
    # number words
    "one", "two", "three", "four", "five",
    "six", "seven", "eight", "nine", "ten",
    # frequent filler words
    "us", "also", "dont", "cant", "any", "can", "along",
    "among", "during", "anyone",
    # single letters
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
    # greetings and chat shorthand
    "hi", "hello", "hey", "ok",
    "okay", "lol", "rofl", "hola", "let", "may", "etc",
]

In [None]:
# Shared WordNet lemmatizer instance; preprocess() calls its lemmatize()
# method with pos="v" on every word that survives the cleaning steps.
lemmatiser = WordNetLemmatizer()

In [None]:
# one-step pre-processing function

def preprocess(text):
    """Run the full cleaning pipeline on every comment in ``text``.

    For each comment the steps are applied in this order: remove links, html
    tags, escape characters and digits; expand contractions; remove
    punctuation; rewrite named entities (``chuncking``); lower-case; drop stop
    words; keep only letters and underscores; collapse spaces; squeeze
    repeated characters; drop words of one or two characters; lemmatize each
    remaining word as a verb.

    Parameters
    ----------
    text : iterable of str
        The raw comments. A bare string must be wrapped in a list by the
        caller, otherwise it is iterated character by character.

    Returns
    -------
    list of str
        One cleaned, stripped string per input comment, in input order.
    """
    # Built once per call so each token is checked against a set instead of
    # being searched for in the (long) stop-word list.
    stop_word_set = set(final_stop_words)

    preprocessed_text = []

    for each_text in text:

        result = remove_links(each_text)
        result = remove_html_tags(result)
        result = remove_escape_char(result)
        result = remove_digits(result)
        result = decontraction(result)
        result = remove_punctuation(result)
        # Chunking runs before lower-casing, i.e. on the original letter case.
        result = chuncking(result)
        result = convert_to_lower_case(result)
        result = ' '.join(word for word in result.split()
                          if word not in stop_word_set)
        result = keep_alpha_and_underscore(result)
        result = remove_extra_spaces_if_any(result)
        result = remove_repeated_characters(result)
        result = remove_words_lesth2(result)
        result = ' '.join(lemmatiser.lemmatize(word, pos="v")
                          for word in result.split())
        preprocessed_text.append(result.strip())

    return preprocessed_text

## Featurization Functions

In [None]:
# Load the featurization artifacts from Google Drive (requires the Drive
# mount above).
# NOTE(review): joblib.load unpickles these files, and unpickling can execute
# arbitrary code — only load artifacts that come from a trusted source.
# NOTE(review): presumably tfidf_dict maps word -> idf weight and w2v_dict
# maps word -> embedding vector, with tfidf_words / w2v_words being the
# corresponding vocabularies; that is how comp_tfidf_weighted_w2v uses them.
# Confirm against the training notebook.

tfidf_dict = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Case_Study_1/tfidf_dict.pkl')
tfidf_words = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Case_Study_1/tfidf_words.pkl')
w2v_dict = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Case_Study_1/w2v_dict.pkl')
w2v_words = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Case_Study_1/w2v_words.pkl')

In [None]:
# computing tf-idf weighted word2vec for each comment.

def comp_tfidf_weighted_w2v(data, w2v_words, tfidf_words, w2v_dict, tfidf_dict,
                            vector_size=300):
    """Compute one tf-idf weighted average word vector per sentence.

    For every word of a sentence that is present in both ``w2v_words`` and
    ``tfidf_words``, its vector is scaled by ``tfidf_dict[word] * tf`` and
    added to the sentence vector; the sum is finally divided by the sum of the
    weights. Sentences without any usable word yield an all-zero vector.

    Parameters
    ----------
    data : iterable of str
        Pre-processed sentences; words are separated by whitespace.
    w2v_words, tfidf_words : container of str
        Vocabularies used for the membership checks.
    w2v_dict : mapping
        Word -> vector of length ``vector_size``.
    tfidf_dict : mapping
        Word -> weight that is multiplied by the term frequency.
    vector_size : int, optional
        Length of the word vectors (default 300).

    Returns
    -------
    numpy.ndarray
        Array with one row per sentence.
    """
    tfidf_w2v = []
    for sentence in data:
        # The token list and its length are the same for every word of the
        # sentence, so they are computed once.
        tokens = sentence.split()
        n_tokens = len(tokens)

        vector = np.zeros(vector_size)  # running weighted sum of word vectors
        tf_idf_weight = 0               # running sum of the applied weights

        for word in tokens:
            if (word in w2v_words) and (word in tfidf_words):
                # NOTE(review): str.count counts substring occurrences, so a
                # word that also appears inside a longer word gets a higher
                # term frequency. Left unchanged so the features match what
                # the model has been fed so far.
                tf_idf = tfidf_dict[word] * (sentence.count(word) / n_tokens)
                vector += w2v_dict[word] * tf_idf
                tf_idf_weight += tf_idf

        if tf_idf_weight != 0:
            vector /= tf_idf_weight
        tfidf_w2v.append(vector)
    return np.array(tfidf_w2v)

## Loading Model & Function for Metrics

In [None]:
# Load the saved Keras model (mlp_model.h5) from Google Drive. The resulting
# module-level `model` is what function_1 and function_2 use for prediction.
model=load_model("/content/gdrive/MyDrive/Colab Notebooks/Case_Study_1/mlp_model.h5")

In [None]:
# Write the model that was just loaded back to Drive under a second file name
# (best_model.hdf5). This cell writes to Drive every time it is run.
model.save("/content/gdrive/MyDrive/Colab Notebooks/Case_Study_1/best_model.hdf5")

In [None]:
def cal_metrics(y_true, y_pred):
    """Compute accuracy and Hamming loss for a set of predictions.

    Sparse inputs (of any SciPy sparse format) are converted to dense arrays
    before the scikit-learn metrics are called.

    Parameters
    ----------
    y_true : array-like or scipy sparse matrix
        Ground-truth labels.
    y_pred : array-like or scipy sparse matrix
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    dict
        ``{"Accuracy": <accuracy_score>, "Hamming Loss": <hamming_loss>}``.
    """
    # Imported here so the sparse submodule is guaranteed to be loaded; a bare
    # ``import scipy`` does not promise that ``scipy.sparse`` is available.
    from scipy import sparse

    # ``issparse`` / ``toarray`` are the public API; the ``scipy.sparse.lil``
    # namespace and the ``.A`` shortcut are deprecated.
    if sparse.issparse(y_true):
        y_true = y_true.toarray()

    if sparse.issparse(y_pred):
        y_pred = y_pred.toarray()

    acc = accuracy_score(y_true, y_pred)
    ham_loss = hamming_loss(y_true, y_pred)

    return {"Accuracy": acc, "Hamming Loss": ham_loss}

## Function-1: Predict Labels for Raw Comments

In [None]:
def function_1(X):
    """Predict the label vector for one or more raw comments.

    The comments are pre-processed, converted to tf-idf weighted word vectors
    and passed to the loaded ``model``; the model output is rounded and cast
    to ``int``.

    Parameters
    ----------
    X : str or iterable of str
        A single comment, or an iterable of comments.

    Returns
    -------
    numpy.ndarray
        Integer array with one row of predicted labels per comment.

    Raises
    ------
    TypeError
        If ``X`` is neither a string nor an iterable.
    """
    # A bare string is one comment, not a sequence of characters.
    if isinstance(X, str):
        X = [X]
    else:
        try:
            iter(X)
        except TypeError:
            # Fail here with a clear message instead of deep inside
            # preprocess().
            raise TypeError(
                "X must be a string or an iterable of strings, got "
                + type(X).__name__
            ) from None

    # pre-processing
    pp_text = preprocess(X)

    # vectorizing
    vect_data = comp_tfidf_weighted_w2v(pp_text, w2v_words,
                                        tfidf_words,
                                        w2v_dict,
                                        tfidf_dict)

    # round() turns the model outputs into 0/1 decisions.
    return model.predict(vect_data).round().astype(int)

## Function-2: Per-Comment Metrics Against Ground-Truth Labels

In [None]:
def function_2(X, y):
    """Score the model's predictions for each comment against its labels.

    Predictions are obtained through ``function_1``. Every (ground truth,
    prediction) pair is then scored on its own, as a one-sample multi-label
    problem, so that "Exact Match Ratio" is 1.0 only when *all* labels of the
    comment are predicted correctly and 0.0 otherwise.

    Parameters
    ----------
    X : str or iterable of str
        A single comment, or an iterable of comments.
    y : iterable
        One row of ground-truth labels per comment, in the same order as
        ``X``. Rows may be lists, arrays or scipy sparse rows.

    Returns
    -------
    pandas.DataFrame
        One row per comment with the columns ``"Exact Match Ratio"`` and
        ``"Hamming Loss"``.
    """
    from scipy import sparse

    def _as_row(labels):
        """Return ``labels`` as a dense 2-D array holding a single sample."""
        if sparse.issparse(labels):
            labels = labels.toarray()
        return np.atleast_2d(np.asarray(labels))

    # Same pre-processing / vectorizing / prediction path as function_1.
    pred = function_1(X)

    metrics = []
    for ground, predct in zip(y, pred):
        # Passing 2-D (1, n_labels) arrays makes accuracy_score compute subset
        # accuracy; with 1-D rows it would treat each label as a separate
        # sample and report the fraction of matching labels instead.
        scores = cal_metrics(_as_row(ground), _as_row(predct))
        metrics.append([scores["Accuracy"], scores["Hamming Loss"]])

    return pd.DataFrame(data=metrics, columns=["Exact Match Ratio", "Hamming Loss"])

In [None]:
# Demo call: predict labels for four sample comments. The result has one row
# per comment; the comment texts are model input and are left exactly as
# written.
function_1(["this is a final submisssion for prection function. please give proper output",
            "keep mask follow social distance",
            "stay home stay safe",
            "are you mad ? i will kill you if you ask again"])

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [None]:
# Demo call: score four sample comments against ground-truth rows. Every
# ground-truth row passed here is all zeros, so any positive prediction shows
# up as a non-zero Hamming loss for that comment.
function_2(["this is a final submisssion for prection function. please give proper output",
            "keep mask follow social distance",
            "stay home stay safe",
            "are you mad or what ? i will kill you if you ask again"],
           [[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0]])

Unnamed: 0,Exact Match Ratio,Hamming Loss
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,0.833333,0.166667
