1- Noise cleaning

2- Contraction mapping

3- Spell checking

4- Stemming/Lemmatization

5- Stopwords removal

6- Case conversion

In [66]:
import pandas as pd
import numpy as np
import spacy
import nltk
import re
import unicodedata
from textblob import TextBlob
from contractions import CONTRACTION_MAP
from nltk.tokenize.toktok import ToktokTokenizer

In [3]:
# Load data
df = pd.read_csv('Library/input_df.csv')

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Noise Cleaning

In [5]:
# Removing accented characters
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [6]:
# Removing special characters
def special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

### Contraction Mapping

In [48]:
# Contraction mapping
def expand_contractions(text, contraction_mapping = CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expand_contraction = contraction_mapping.get(match)\
                            if contraction_mapping.get(match)\
                            else contraction_mapping.get(match.lower())
        expand_contraction = first_char + expand_contraction[1:]
        return expand_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)

    return expanded_text

### Spelling Correction

In [15]:
# Correcting the mispelled words
def correct_word_spelling(text):
    text = TextBlob(text)
    text = text.correct()
    return text

In [55]:
# Removing meaningless words
nltk.download('words')
def remove_meaningless_words(text):
    words = set(nltk.corpus.words.words())
    text = ' '.join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
    return text

[nltk_data] Downloading package words to /Users/zori/nltk_data...
[nltk_data]   Package words is already up-to-date!


### Lemmatization

In [19]:
nlp = spacy.load('en_core_web_sm')
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ for word in text])
    return text

### Stop Words Removal

In [20]:
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [78]:
# Removing stopwords
def remove_stopwords(text, is_lower_case=False):
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_text = [token for token in tokens if token not in stopword_list]
    else:
        filtered_text = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_text)
    
    return filtered_text

### Case Conversion

In [22]:
def text_conversion(text, case = 'lower'):
    if case == 'lower':
        return text.lower()
    elif case == 'upper':
        return text.upper()
    elif case == 'title':
        return text.title()
    else:
        return text

### Corpus Normalizer Pipeline

In [59]:
def normalized_corpus(corpus, accented_chars=True, special_character=True, 
                    contraction_expansion=True, word_spelling=True, meaningless_word=True,
                    lemmatization=True, stopwords=True, lower_case_conversion=True):
    
    normalized_corpus = []
    for doc in corpus:
        if accented_chars:
            doc = remove_accented_chars(doc)
        if special_characters:
            doc = special_characters(doc)
        if contraction_expansion:
            doc = expand_contractions(doc)
        if word_spelling:
            doc = correct_word_spelling(doc)
#         if meaningless_word:
#             doc = remove_meaningless_words(doc)
        if lemmatization:
            doc = lemmatize_text(doc)
        if stopwords:
            doc = remove_stopwords(doc)
        if lower_case_conversion:
            doc = text_conversion(doc)
        normalized_corpus.append(doc)
    
    return normalized_corpus

In [80]:
normalized_corpus(df.comment_text, accented_chars=True, special_character=True, 
                    contraction_expansion=True, word_spelling=True, meaningless_word=True,
                    lemmatization=True, stopwords=True, lower_case_conversion=True)

ValueError: [E866] Expected a string or 'Doc' as input, but got: <class 'textblob.blob.TextBlob'>.

In [79]:
remove_stopwords(df.comment_text[0])

"Explanation edits made username Hardcore Metallica Fan reverted ? ' vandalisms , closure GAs voted New York Dolls FAC. please ' remove template talk page since ' retired now.89.205.38.27"

In [58]:
df.comment_text[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"