In [1]:
import pandas as pd
import numpy as np
import re
import string 
from stop_words import get_stop_words
from nltk.corpus import stopwords
# Combined English stopword list: entries from the `stop_words` package
# followed by NLTK's English stopwords (duplicates are kept).
nltk_words = list(stopwords.words('english'))
stop_words = [*get_stop_words('en'), *nltk_words]
from contraction_map import CONTRACTION_MAP

In [2]:
# Load the tweet export and keep only the selected feature columns.
data = pd.read_csv("2021-07-12-tweets-twint.csv")
cols = [
    'tweet_id', 'date', 'clean_tweet', 'hashtags',
    'search_keyword', 'year', 'month',
]
data = data[cols]
data.shape

(82149, 7)

In [None]:
# Clean thoroughly 
def remove_hyperlinks(text):
    """Strip URLs from ``text``.

    Three kinds of link are removed:

    * ``http://`` and ``https://`` URLs (up to the next whitespace, so query
      strings such as ``?a=1&b=2`` go with them),
    * ``www.`` links (up to the next whitespace),
    * bare ``*.com`` domains, together with a path that directly follows.

    The dots in ``www.`` and ``.com`` are matched literally and ``.com`` must
    end at a word boundary, so ordinary words that merely contain "com"
    (e.g. "welcome", "become") are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched link replaced by the empty string.
        Surrounding whitespace is not collapsed.
    """
    ptn = r'https?://\S+|www\.\S+|[\w./-]+\.com\b(?:/\S*)?'
    return re.sub(ptn, '', text, flags=re.IGNORECASE)

def remove_mentions(text):
    """Strip ``@`` mentions from ``text``.

    Removes ``@name`` (``name`` being word characters), ``@`` followed by a
    run of dots, and any remaining bare ``@``.

    The pattern contains no literal spaces: a mention is removed wherever it
    occurs (start, middle or end of the string) and the neighbouring words
    are never merged. Surrounding whitespace is left as is.

    Args:
        text: Input string.

    Returns:
        ``text`` with every mention replaced by the empty string.
    """
    ptn = r'@\w+|@\.+|@'
    return re.sub(ptn, '', text)

def remove_stopwords(text, stopword_list=None):
    """Drop stopword tokens from ``text``.

    ``text`` is split on whitespace, tokens found in the stopword collection
    are discarded, and the remaining tokens are re-joined with single spaces
    (so runs of whitespace are collapsed as a side effect). Matching is exact
    and case-sensitive.

    Args:
        text: Input string.
        stopword_list: Optional iterable of stopwords. Defaults to the
            module-level ``stop_words`` list.

    Returns:
        The filtered, single-space-joined string.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Set membership is O(1) per token; scanning the list was O(len(list)).
    blocked = frozenset(stopword_list)
    return ' '.join(token for token in text.split() if token not in blocked)

def remove_punctuations(text):
    """Replace every character that is not an ASCII letter with a space.

    Besides punctuation this also blanks out digits, whitespace other than
    the space character, and non-ASCII letters (e.g. accented characters).
    Each removed character becomes exactly one space; spaces are not
    collapsed.

    Args:
        text: Input string.

    Returns:
        A string of the same length containing only ``a-z``, ``A-Z`` and
        spaces.
    """
    # After this substitution no punctuation can remain, so no further
    # punctuation-stripping pass is needed.
    return re.sub(r"[^a-zA-Z]", " ", text)

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and drop any remaining apostrophes.

    Every key of ``contraction_mapping`` found in ``text`` (case-insensitive)
    is replaced by its mapped expansion. The first character of the matched
    text is kept, so the original capitalisation of the word survives.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction to its expansion.

    Returns:
        The expanded string with all ``'`` characters removed.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string at every position.
        return text.replace("'", "")

    # Fallback table for matches whose casing differs from every key.
    lowered = {key.lower(): value for key, value in contraction_mapping.items()}

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key must not pre-empt a longer one sharing its prefix.
    # Keys are escaped so they are always matched literally.
    alternatives = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowered.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched.
            return match
        # Keep the matched text's first character to preserve its case.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text.replace("'", "")
    
def remove_alphabets(text):
    """Delete every ASCII letter that stands alone between two word boundaries."""
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

def remove_digits(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    kept = [ch for ch in text if not ch.isdigit()]
    return ''.join(kept)

def clean_text(text, hyperlink=True, mention=True, stopwords=True,
               punctuations=True, contractions=True, digits=True,
               lowercase=True, alphabets=True):
    """Run the tweet-cleaning pipeline over a single text value.

    Each step can be switched off through its flag. Steps run in this order:
    lowercase, hyperlinks, mentions, contractions, punctuation, stopwords,
    single letters, digits.

    Contractions are expanded *before* punctuation is stripped, because
    ``remove_punctuations`` replaces apostrophes with spaces; in the opposite
    order "can't" would already be "can t" and could never be expanded.

    Args:
        text: Value to clean. ``None`` and float NaN are treated as missing;
            any other non-string value is converted with ``str``.
        hyperlink: Remove URLs.
        mention: Remove ``@`` mentions.
        stopwords: Remove English stopwords. (Inside this function the name
            shadows the module-level ``stopwords`` import; it is only used
            as a flag here.)
        punctuations: Replace non-letter characters with spaces.
        contractions: Expand contractions, e.g. can't -> cannot.
        digits: Remove digit characters.
        lowercase: Lowercase the text first.
        alphabets: Remove stand-alone single letters.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Missing cells (None / NaN) have no text to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if lowercase:  # Transform to lowercase
        text = text.lower()
    if hyperlink:  # Remove hyperlinks
        text = remove_hyperlinks(text)
    if mention:  # Remove mentions
        text = remove_mentions(text)
    if contractions:  # Expand contractions while apostrophes still exist
        text = expand_contractions(text)
    if punctuations:  # Remove punctuation
        text = remove_punctuations(text)
    if stopwords:  # Remove English stopwords
        text = remove_stopwords(text)
    if alphabets:  # Remove single letters
        text = remove_alphabets(text)
    if digits:  # Remove all numbers
        text = remove_digits(text)
    return text

# Run the cleaning pipeline over every tweet and store the result in a new column.
data['clean_tweet3'] = data['clean_tweet'].map(clean_text)


### Text lemmatization

<strong>Lemmatization</strong> usually refers to doing things properly with the use of a vocabulary and morphological
analysis of words, normally aiming to remove inflectional endings only and to return the base or
dictionary form of a word, which is known as the <em>lemma</em>. <br><br>

Sources: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html and
https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/ <br><br>
Examples: <br>
am, are, is $\Rightarrow$ be <br>
car, cars, car's, cars' $\Rightarrow$ car

In [None]:
# Install spaCy and download its small English model (one-time setup)
# !pip3 install -U spacy
# !python -m spacy download en_core_web_sm

In [None]:
import spacy 
from nltk.tokenize.toktok import ToktokTokenizer
import nltk
# spaCy's small English pipeline; lemmatize_text calls it on each tweet.
nlp = spacy.load('en_core_web_sm')

# NLTK Toktok tokenizer instance.
tokenizer = ToktokTokenizer()


In [None]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    Tokens are joined with single spaces. A token whose lemma is the
    ``'-PRON-'`` placeholder (the pronoun lemma used by spaCy 2.x) keeps
    its original text instead.

    Args:
        text: Value to lemmatize. ``None`` and float NaN are treated as
            missing; anything else is converted with ``str`` because the
            column may contain non-string cells.

    Returns:
        The lemmatized string; ``''`` for missing input, so that the
        literal words "nan" / "None" never enter the corpus.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ''
    doc = nlp(str(text))
    return ' '.join(
        token.text if token.lemma_ == '-PRON-' else token.lemma_
        for token in doc
    )

# Lemmatize the cleaned tweets. The source column lives on `data`; the name
# `dt` is never defined in this notebook and raised NameError on a fresh kernel.
data["parse_tweet"] = data["clean_tweet3"].map(lemmatize_text)