# Text Preprocessing for Word/Sentence Embedding Generators

## Preprocessing for BERT

## Set-up

### Install necessary libraries

In [1]:
!pip install symspellpy
!pip install pycontractions
!pip install keras-bert

Collecting symspellpy
  Downloading symspellpy-6.5.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 2.8 MB/s 
Installing collected packages: symspellpy
Successfully installed symspellpy-6.5.2
Collecting pycontractions
  Downloading pycontractions-2.0.1-py3-none-any.whl (9.6 kB)
Collecting language-check>=1.0
  Downloading language-check-1.1.tar.gz (33 kB)
Building wheels for collected packages: language-check
  Building wheel for language-check (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25h  Created wheel for language-check: filename=language_check-1.1-py3-none-any.whl size=90190895 sha256=5ce9a1088c3980e8e7140ea1d1f0f267380608d58e6fe21f97168d5635da3cc5
  Stored in directory: /root/.cache/pip/wheels/ce/fe/32/3b83a67c4f1

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv


### Import necessary libraries

In [3]:
pd.set_option('display.max_colwidth', -1)
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.stem import PorterStemmer


import pkg_resources
from symspellpy.symspellpy import SymSpell
from symspellpy import SymSpell, Verbosity

#Contraction Import
from pycontractions import Contractions

### Set up Spell checker for Segmentation and Spell Check tasks

In [4]:
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

symspell_segmenter = SymSpell(max_dictionary_edit_distance=2, prefix_length=8)
symspell_segmenter.load_dictionary(dictionary_path, term_index=0, count_index=1)

sym_spell_misspelled = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
sym_spell_misspelled.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell_misspelled.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

True

### Set up model Contractions Expansion

In [5]:
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

[=====---------------------------------------------] 11.8% 45.7/387.1MB downloaded

### Load the Data

In [6]:
"""Let's load the data files"""
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


## Imports to access Vocabulary file for BERT Tokenization

In [7]:
from keras_bert import load_vocabulary, Tokenizer, get_checkpoint_paths
from keras_bert.datasets import get_pretrained, PretrainedList
model_path = get_pretrained(PretrainedList.wwm_uncased_large)
paths = get_checkpoint_paths(model_path)
token_dict = load_vocabulary(paths.vocab)
tokenizer = Tokenizer(token_dict)

Using TensorFlow backend.


Downloading data from https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip


## Preprocessing set-up

### Define functions for preprocessing

In [8]:
def to_lower(text):
    text = text.lower()
    return text


def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+|pic.twitter.com\S+')
    return url.sub('[url]',text)


def remove_punct(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text


def remove_special_ucchar(text):
    text = re.sub('&.*?;', ' ', text)
    return text


def remove_numbers(text):
    text = re.sub(r'\d+', ' ', text)
    return text


def remove_mentions(text):
    text = re.sub(r'@\w*', ' ', text)
    return text


def handle_unicode(text):
    text = text.encode('ascii', 'replace').decode('utf-8')
    return text


def remove_punctuations(text):
    text = re.sub(r'([^A-Za-z \t])|(\w+:\/\/\S+)', ' ', text)
    return text


def remove_square_bracket(text):
    text = re.sub('\[.*?\]', ' ', text)
    return text


def remove_angular_bracket(text):
    text = re.sub('\<.*?\>+', ' ', text)
    return text


def remove_newline(text):
    text = re.sub('\n', ' ', text)
    return text


def remove_words_with_numbers(text):
    text = re.sub('\w*\d\w*', ' ', text)
    return text
    

def hashtag_to_words(text):
    hashtag_list = re.findall(r"#\w+",text)
    for hashtag in hashtag_list:
        hashtag = re.sub(r'#', '', hashtag)
        text = re.sub(hashtag, symspell_segmenter.word_segmentation(hashtag).segmented_string, text)
    text = re.sub(r'#', ' ', text)
    return text


def extra_spaces(text):
    text = text.strip()
    text = re.sub('\s+|\t+', ' ', text)
    return text

def remove_stopwords(text):
    text_tokens=word_tokenize(text)
    textop = ''
    for token in text_tokens:
        if token not in stopwords.words('english'):
            textop = textop + token + ' '
    return textop


def correct_misspelled_with_context(text):
    suggestions = sym_spell_misspelled.lookup_compound(text, max_edit_distance=2)
    text = str(suggestions[0])
    text = re.sub(r', \d', ' ', text)
    return text


def stemming_text(text):
    stemmer= PorterStemmer()
    text_tokens=word_tokenize(text)
    textop = ''
    for token in text_tokens:
        textop = textop + stemmer.stem(token) + ' '
    return textop


def lemmatization(text):
    lemmatizer=WordNetLemmatizer()
    text_tokens=word_tokenize(text)
    textop = ''
    for token in text_tokens:
        textop = textop + lemmatizer.lemmatize(token) + ' '
    return textop


def removeRepeated(tweet):
    prev = ''
    tweet_new = ''
    for c in tweet:
        caps = False
        if c.isdigit():
            tweet_new += c
            continue
        if c.isalpha() == True:
            if ord(c) >= 65 and ord(c)<=90:
                caps = True
            c = c.lower()
            if c == prev:
                count += 1
            else:
                count = 1
                prev = c
            if count >= 3:
                continue
            if caps == True:
                tweet_new += c.upper()
            else:
                tweet_new += c
        else:
            tweet_new += c
    return tweet_new


def Expand_Contractions(text):
    return list(cont.expand_texts([text]))[0]


### Define function to extract vocabulary from a given file

In [9]:
def build_vocab(text, tokenizer=word_tokenize):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = text.apply(lambda x: tokenizer(x)).explode().value_counts().to_dict()
    return vocab

### Define functions to check for coverage

In [10]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in vocab:
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

### Define functions to process the pre-processing pipeline

In [11]:
def count_chars(text):
    new_text = text.apply(lambda x : list(x)).explode()
    return new_text.unique().shape[0]

def count_words(text):
    new_text = text.apply(lambda x : x.split(' ')).explode()
    return new_text.unique().shape[0]

def preprocess_pipeline(steps, col, df):
    new_col = df[col]
    char_count_before = 0
    word_count_before = 0
    char_count_after = 0
    word_count_after = 0
    for each_step in steps:
        char_count_before = count_chars(new_col)
        word_count_before = count_words(new_col)
        new_col = new_col.apply(each_step)
        char_count_after = count_chars(new_col)
        word_count_after = count_words(new_col)
        print("Preprocessing step: ",each_step.__name__)
        print("Unique Char Count ---> Before: %d | After: %d"%(char_count_before, char_count_after))
        print("Unique Word Count ---> Before: %d | After: %d"%(word_count_before, word_count_after))
        vocab = build_vocab(new_col,word_tokenize)
        check_coverage(vocab,token_dict)
        print()
    
    return new_col

## Preprocessing

### Define the preprocessing pipeline

In [12]:
### Define pipeline
pipeline = []

pipeline.append(handle_unicode)
pipeline.append(to_lower)
pipeline.append(remove_newline)
pipeline.append(remove_url)
pipeline.append(remove_special_ucchar)
pipeline.append(hashtag_to_words)
pipeline.append(remove_mentions)
# pipeline.append(remove_square_bracket)
# pipeline.append(remove_angular_bracket)
pipeline.append(Expand_Contractions)
# pipeline.append(remove_words_with_numbers)
# pipeline.append(remove_punctuations)
# pipeline.append(remove_punct)
pipeline.append(extra_spaces)
# pipeline.append(remove_numbers)
# pipeline.append(removeRepeated)
pipeline.append(correct_misspelled_with_context)
# pipeline.append(remove_stopwords)
# pipeline.append(stemming_text)
# pipeline.append(lemmatization)



### Check the coverage of unprocessed data

In [13]:
# sentences = train["text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(train["text"],word_tokenize)
oov = check_coverage(vocab,token_dict)

Found embeddings for 23.46% of vocab
Found embeddings for  66.31% of all text


### Check the coverage of keywords from our data

In [14]:
keywords = train.keyword.dropna().apply(lambda x: re.sub('%20',' ',x))

In [15]:
vocab = build_vocab(keywords,word_tokenize)
oov = check_coverage(vocab,token_dict)

Found embeddings for 74.24% of vocab
Found embeddings for  77.27% of all text


### Preprocessing of Train data

In [16]:
%%time
train = pd.read_csv('../input/nlp-getting-started/train.csv')


print("For Training data:")
train['processed_text'] = preprocess_pipeline(pipeline, 'text', train)
train.head()

For Training data:
Preprocessing step:  handle_unicode
Unique Char Count ---> Before: 122 | After: 94
Unique Word Count ---> Before: 32017 | After: 32000
Found embeddings for 23.85% of vocab
Found embeddings for  67.23% of all text

Preprocessing step:  to_lower
Unique Char Count ---> Before: 94 | After: 68
Unique Word Count ---> Before: 32000 | After: 28104
Found embeddings for 37.41% of vocab
Found embeddings for  84.56% of all text

Preprocessing step:  remove_newline
Unique Char Count ---> Before: 68 | After: 67
Unique Word Count ---> Before: 28104 | After: 27967
Found embeddings for 37.43% of vocab
Found embeddings for  84.57% of all text

Preprocessing step:  remove_url
Unique Char Count ---> Before: 67 | After: 67
Unique Word Count ---> Before: 27967 | After: 23383
Found embeddings for 46.89% of vocab
Found embeddings for  84.57% of all text

Preprocessing step:  remove_special_ucchar
Unique Char Count ---> Before: 67 | After: 66
Unique Word Count ---> Before: 23383 | After: 233

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this earthquake may allah forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la range sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,of of a people receive wildfires evacuation orders in california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,just got sent this photo from ruby alaska as smoke from wildfires pours into a school


#### Example outputs

In [17]:
print(train.loc[12])

id                18                                                                          
keyword           NaN                                                                         
location          NaN                                                                         
text              #raining #flooding #Florida #TampaBay #Tampa 18 or 19 days. I've lost count 
target            1                                                                           
processed_text    raining flooding florida tampa bay tampa of or of days i have lost count    
Name: 12, dtype: object


### Preprocessing of Test data

In [18]:
%%time
test = pd.read_csv('../input/nlp-getting-started/test.csv')

print("For Testing data:")
test['processed_text'] = preprocess_pipeline(pipeline, 'text', test)
test.head()

For Testing data:
Preprocessing step:  handle_unicode
Unique Char Count ---> Before: 118 | After: 92
Unique Word Count ---> Before: 17426 | After: 17416
Found embeddings for 28.58% of vocab
Found embeddings for  66.75% of all text

Preprocessing step:  to_lower
Unique Char Count ---> Before: 92 | After: 66
Unique Word Count ---> Before: 17416 | After: 15323
Found embeddings for 46.26% of vocab
Found embeddings for  84.40% of all text

Preprocessing step:  remove_newline
Unique Char Count ---> Before: 66 | After: 65
Unique Word Count ---> Before: 15323 | After: 15306
Found embeddings for 46.28% of vocab
Found embeddings for  84.41% of all text

Preprocessing step:  remove_url
Unique Char Count ---> Before: 65 | After: 65
Unique Word Count ---> Before: 15306 | After: 13280
Found embeddings for 55.00% of vocab
Found embeddings for  84.41% of all text

Preprocessing step:  remove_special_ucchar
Unique Char Count ---> Before: 65 | After: 64
Unique Word Count ---> Before: 13280 | After: 1326

Unnamed: 0,id,keyword,location,text,processed_text
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",heard about earthquake is different cities stay safe everyone
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",there is a forest fire at spot pond geese are fleeing across the street i cannot save them all
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon so decor kills of in china and taiwan


In [19]:
chars = train['processed_text'].apply(lambda x : list(x)).explode()
chars.unique()

array(['o', 'u', 'r', ' ', 'd', 'e', 's', 'a', 't', 'h', 'n', 'f', 'i',
       'q', 'k', 'm', 'y', 'l', 'g', 'v', 'c', 'p', 'b', 'x', 'w', 'j',
       '0', '2', '4', '3', '6', '7', '1', '8', '9', '5', 'z', "'"],
      dtype=object)

In [20]:
print(train['text'].iloc[1031])
print(train['processed_text'].iloc[1031])

Attention all RCHS football players there will be coffins and body bags by the locker rooms grab one tommorow because were gonna die
attention all chs football players there will be coffins and body bags by the locker rooms grab one tomorrow because were going to die  


From the kernel - https://www.kaggle.com/wrrosa/keras-bert-using-tfhub-modified-train-data

The author of the above kernel, manually read the tweets from training data and figured out that some of them were misclassified.

In [21]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [22]:
u, idx = np.unique(train['processed_text'], return_index=True)
train = train.iloc[idx]

In [23]:
tweet_len = train['processed_text'].apply(len)
print(tweet_len.max())

149


In [24]:
tweet_len = test['processed_text'].apply(len)
print(tweet_len.max())

147


In [25]:
train.to_csv('processed train.csv', index=False)
test.to_csv('processed test.csv', index=False)