Text Preprocessing

tokenization (spaCy) \
normalization
- remove HTML tags   v
- remove extra whitespace  v
- remove punctuation v
- lowercasing
- convert accented characters to ASCII characters
- expand contractions
- remove special characters
- convert number words to numeric form, then remove all numbers
- stop-word removal
- stemming
- lemmatization

In [1]:
from bs4 import BeautifulSoup
import string
import unidecode
import spacy
from word2number import w2n
import gensim.downloader as api
import re

import pandas as pd

In [2]:
import en_core_web_sm
# Load the spaCy `en_core_web_sm` pipeline once; `nlp` is the notebook-wide
# object used below to tokenise text and to edit the stop-word flags.
nlp = en_core_web_sm.load()

In [3]:
def strip_html_tags(text):
    """Remove HTML tags and URLs from ``text``.

    The previous implementation only deleted URLs even though its name and
    docstring promised HTML-tag removal. URL removal is kept unchanged
    (callers rely on it) and tag removal is added.

    Parameters
    ----------
    text : str
        Raw text, possibly containing markup such as ``<b>..</b>`` and links.

    Returns
    -------
    str
        ``text`` with each tag replaced by a single space and every
        ``http...`` run of non-whitespace characters deleted. Whitespace is
        not normalised here.
    """
    # Tags are removed before URLs: otherwise the greedy URL pattern would
    # swallow the rest of an `<a href="http://...">label</a>` element.
    # A tag must start with a letter (optionally after "/"), so emoticons
    # such as "<3" and comparisons such as "a < b" are left alone.
    without_tags = re.sub(r'</?[A-Za-z][^<>]*>', ' ', text)
    return re.sub(r'http\S+', '', without_tags)

def strip_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # A translation table mapping a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def strip_whitespace(text):
    '''Collapse each whitespace run to one space and trim both ends.'''
    # str.split() with no separator already ignores leading/trailing whitespace.
    words = text.split()
    return " ".join(words)

def remove_accented_chars(text):
    """Transliterate ``text`` with ``unidecode``, e.g. for accented letters as in "café".

    Original author's note: this step matters because most quotation marks
    found in the data were not ASCII characters.
    """
    return unidecode.unidecode(text)

# Ordered (compiled pattern, replacement) pairs used by `decontracted`.
# Matching is case-insensitive because the function runs BEFORE lowercasing
# in `text_preprocessing`; with case-sensitive patterns "Won't" fell through
# to the generic "n't" rule and became "Wo not".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # specific: irregular stems, must run before the generic "n't" rule
        (r"\bwon't", "will not"),
        (r"\bcan't", "can not"),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),      # heuristic: also rewrites possessives ("Ford's")
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def decontracted(phrase):
    '''Expand English contractions in ``phrase``.

    Rules are applied in a fixed order (irregular forms first, then generic
    suffixes) and match regardless of letter case. Only the ASCII apostrophe
    (') is recognised. Replacements are always lowercase.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions such as "won't" or "they're".

    Returns
    -------
    str
        The text with every matched contraction expanded.
    '''
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

# exclude words from spacy stopwords list
# These four words are negations; clearing their `is_stop` flag means the
# stop-word filter in `text_preprocessing` no longer drops them
# (presumably because negation matters for the downstream analysis).
deselect_stop_words = ['no', 'not','never','without']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

In [4]:
def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True,
                       stop_words=True):
    """Preprocess ``text``; every step is enabled by default.

    String-level steps run first, in this order: accent removal, HTML/URL
    stripping, whitespace normalisation, contraction expansion, lowercasing,
    punctuation stripping. The result is then tokenised with the module-level
    spaCy pipeline ``nlp`` and filtered or edited token by token.

    Parameters
    ----------
    text : str
        Raw input text.
    accented_chars, remove_html, extra_whitespace, contractions, lowercase : bool
        Toggle the corresponding string-level step.
    punctuations : bool
        Strip punctuation characters from the string and drop tokens tagged
        ``PUNCT``.
    special_chars : bool
        Drop tokens tagged ``SYM``.
    stop_words : bool
        Drop spaCy stop words, except tokens tagged ``NUM``.
    remove_num : bool
        Drop tokens tagged ``NUM`` or whose text is numeric.
    convert_num : bool
        Convert surviving ``NUM`` tokens from number words to digits. Only
        has an effect when ``remove_num`` is false.
    lemmatization : bool
        Replace surviving non-``NUM`` tokens by their lemma, unless the lemma
        is the ``"-PRON-"`` placeholder.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``""`` when nothing is kept.
    """
    if accented_chars:  # remove accented characters
        text = remove_accented_chars(text)
    if remove_html:  # remove html tags
        text = strip_html_tags(text)
    if extra_whitespace:  # remove extra whitespaces
        text = strip_whitespace(text)
    if contractions:  # expand contractions
        text = decontracted(text)
    if lowercase:  # convert all characters to lowercase
        text = text.lower()
    if punctuations:  # remove punctuations
        text = strip_punctuations(text)

    doc = nlp(text)  # tokenise text

    clean_text = []
    for token in doc:
        pos = token.pos_
        # remove stop words (number words are handled by the NUM rules below)
        if stop_words and token.is_stop and pos != 'NUM':
            continue
        # remove punctuations
        if punctuations and pos == 'PUNCT':
            continue
        # remove special characters
        if special_chars and pos == 'SYM':
            continue
        # remove numbers
        if remove_num and (pos == 'NUM' or token.text.isnumeric()):
            continue

        edit = token.text
        if convert_num and pos == 'NUM':
            # convert number words to numeric numbers; word_to_num returns an
            # int, so cast to str to keep the final join from raising TypeError
            try:
                edit = str(w2n.word_to_num(token.text))
            except ValueError:
                # Not a parseable number word: keep the surface form instead
                # of aborting the whole preprocessing run.
                pass
        elif lemmatization and token.lemma_ != "-PRON-":
            # convert tokens to base form
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)

    # Join once, after the loop: the result is defined even when the doc has
    # no tokens (previously an UnboundLocalError). Per the original author's
    # note, the joined string is tokenised again downstream (in R).
    return ' '.join(clean_text)

In [23]:
# Read the raw GM tweets CSV and take its `text` column as the preprocessing input.
gm_tweets = pd.read_csv('gm_tweets.csv')
gm_text = gm_tweets.text

In [25]:
# gm text preprocessing
# Iterate over the tweet texts directly instead of looking each one up by
# integer label (`gm_text[i]`), which raises KeyError when the index is not
# the default 0..n-1 range.
new_text = [text_preprocessing(tweet) for tweet in gm_text]

In [26]:
# Wrap the cleaned GM texts in a Series (default integer index).
pre_gm = pd.Series(new_text)

In [27]:
# Put each status_id next to its cleaned text; ignore_index=True relabels the
# two columns 0 and 1.
gm = pd.concat([gm_tweets.status_id, pre_gm], axis=1, ignore_index=True)
# DataFrame.rename returns a new frame. Assign it back, otherwise `gm` (and
# any CSV written from it) keeps the column names 0 and 1.
gm = gm.rename(columns={0: "status_id", 1: "edited_text"})
gm

Unnamed: 0,status_id,edited_text
0,x1277316716595470337,cheslie40670812 frankdangelo23 realdonaldtrump...
1,x1277314327457017857,general motors tease future cadillac lyriq ev ...
2,x1277311004398018561,general motors co gm surprise market q3 result...
3,x1274771791576252417,general motors co gm surprise market q4 result...
4,x1274593345986273281,general motors co gm surprise market q3 result...
...,...,...
1798,x1274286545160286209,gm shit greedyarse leave flint add death climb...
1799,x1274286545160286209,gm shit greedyarse leave flint add death climb...
1800,x1274285185882828801,ohio official lordstown shuttering violate sta...
1801,x1274264648607584256,ok gmc gm go order new gmc acadia at4 configur...


In [28]:
# Write the GM frame to disk without the index column.
gm.to_csv('gm_text.csv', index=False)

In [29]:
# ford text preprocessing

# Read the raw Ford tweets CSV and take its `text` column as the preprocessing input.
ford_tweets = pd.read_csv('ford_tweets.csv')
ford_text = ford_tweets.text

In [30]:
# Sanity check: (rows, columns) of the raw Ford frame.
ford_tweets.shape

(7038, 90)

In [31]:
# Iterate over the tweet texts directly instead of looking each one up by
# integer label (`ford_text[i]`), which raises KeyError when the index is not
# the default 0..n-1 range.
ford_new_text = [text_preprocessing(tweet) for tweet in ford_text]

In [32]:
# Wrap the cleaned Ford texts in a Series (default integer index).
pre_ford = pd.Series(ford_new_text)

In [33]:
# Put each status_id next to its cleaned text; ignore_index=True relabels the
# two columns 0 and 1.
ford = pd.concat([ford_tweets.status_id, pre_ford], axis=1, ignore_index=True)
# DataFrame.rename returns a new frame. Assign it back, otherwise `ford` (and
# any CSV written from it) keeps the column names 0 and 1.
ford = ford.rename(columns={0: "status_id", 1: "edited_text"})
ford

Unnamed: 0,status_id,edited_text
0,x1277319431388114945,hellllllllllllllll yeahhhhhhhhhhhhhhhhh chaseb...
1,x1275215156004388865,someeeeeee blaney teampenske ford fordperforma...
2,x1277318749150777344,tear ford probably rebrand raging antisemite
3,x1277318473757032449,caranddriver ford want truck photo beefy look ...
4,x1277318078506958848,love god bless america ford flag
...,...,...
7033,x1274259767377051653,coyote 480hp mach1 ford mustang ford mustangma...
7034,x1274258026606837765,caranddriver ford leave fine tune car
7035,x1274257423168126976,rockabilly rave seagull model t ntrr2020 segul...
7036,x1274257423168126976,rockabilly rave seagull model t ntrr2020 segul...


In [34]:
# Write the Ford frame to disk without the index column.
ford.to_csv('ford_text.csv', index=False)

In [35]:
# fca text preprocessing

# Read the raw FCA tweets CSV and take its `text` column as the preprocessing input.
fca_tweets = pd.read_csv('fca_tweets.csv')
fca_text = fca_tweets.text

In [36]:
# Sanity check: (rows, columns) of the raw FCA frame.
fca_tweets.shape

(9667, 90)

In [None]:
# Iterate over the tweet texts directly instead of looking each one up by
# integer label (`fca_text[i]`), which raises KeyError when the index is not
# the default 0..n-1 range.
fca_new_text = [text_preprocessing(tweet) for tweet in fca_text]

In [None]:
# Wrap the cleaned FCA texts in a Series and put each status_id next to its
# cleaned text; ignore_index=True relabels the two columns 0 and 1.
pre_fca = pd.Series(fca_new_text)
fca = pd.concat([fca_tweets.status_id, pre_fca], axis=1, ignore_index=True)
# DataFrame.rename returns a new frame. Assign it back, otherwise `fca` (and
# any CSV written from it) keeps the column names 0 and 1.
fca = fca.rename(columns={0: "status_id", 1: "edited_text"})
fca

In [164]:
# Write the FCA frame to disk without the index column.
fca.to_csv('fca_text.csv', index=False)