### Imports and global variables

In [7]:
from string import punctuation
import re
from pprint import pprint
import warnings
import logging
import os, csv
import datetime, time
from langdetect import detect

#nltk
import nltk, gensim
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize

# matplotlib
import matplotlib.pyplot as plt

# pandas
import pandas as pd

#spacy
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# gensim
from gensim.utils import tokenize
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import  models

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis # don't skip this

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [8]:
%matplotlib inline
# Emit only ERROR-level (and higher) log records, formatted as "time : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Narrower alternative, currently disabled: silence DeprecationWarning only.
# warnings.filterwarnings("ignore",category=DeprecationWarning)
# NOTE(review): this installs an "ignore" filter for every warning category, not just deprecations.
warnings.filterwarnings('ignore')

In [9]:
# NOTE(review): nltk, re, spacy and pandas are already imported in the first cell;
# repeating the imports here is harmless but redundant.
import nltk, re, spacy
import pandas as pd

# Load spaCy's "en_core_web_sm" pipeline. The name `nlp` is assigned again in the
# "Language detection" section further down.
nlp = spacy.load("en_core_web_sm")

# Request the NLTK data packages by name (the last call's return value is what the cell displays).
# Presumably these back the tagger, stop-word list, lemmatizer and tokenizer used below — confirm
# which ones are actually required.
nltk.download('averaged_perceptron_tagger')
nltk.download("stopwords")
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is

True

In [10]:
# Module-level objects shared by the helper functions defined in this notebook.
# WARNING: when this notebook is imported/run from another notebook these names
# land in that notebook's namespace, so identically named variables on either
# side can silently override each other.

# Mutable list of English stop words (extended in place by later helpers).
custom_stop_words = nltk.corpus.stopwords.words('english')
# Stemmer and lemmatizer instances reused across calls.
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [11]:
# Pre-compiled regular expressions used by the noise-removal helpers.
# Patterns containing regex escapes (\S, \w, \s, ...) are written as raw strings
# so Python does not treat them as (invalid) string escape sequences.

# Shortest run of characters enclosed in '<' ... '>' (does not span newlines).
html_cleaner = re.compile(r'<.*?>')

# One or more consecutive characters from the ranges below.
# NOTE(review): U+24C2-U+1F251 and U+10000-U+10FFFF are very wide ranges; together
# they also cover CJK, Hangul and every supplementary-plane character, not only
# emoji. Narrowing them would change what gets stripped, so they are kept as-is.
emoji_cleaner = re.compile("["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # enclosed characters and everything up to U+1F251
        "\U0001f926-\U0001f937"  # gesture / people emoji
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"          # gender signs
        "\u2600-\u2B55"          # miscellaneous symbols
        "\u200d"                 # zero-width joiner
        "\u23cf"                 # eject symbol
        "\u23e9"                 # fast-forward symbol
        "\u231a"                 # watch
        "\ufe0f"                 # variation selector-16
        "\u3030"                 # wavy dash
        "]+", re.UNICODE)

# Any single character outside the 7-bit ASCII range.
non_ascii = re.compile(r"[^\x00-\x7f]")
# A whitespace-delimited token containing '@', plus at most one trailing whitespace character.
email_cleaner = re.compile(r'\S*@\S*\s?')
# Any single character that is neither a word character nor whitespace.
punctuation_cleaner = re.compile(r'[^\w\s]')
# Any run of characters other than ASCII letters.
only_alphabetic = re.compile(r'[^a-zA-Z]+')

### Noise removal functions

In [12]:
def remove_num(review):
    """Return ``review`` with every ASCII digit (0-9) deleted."""
    digit_pattern = r'[0-9]'
    return re.sub(digit_pattern, "", review)
def remove_nan(reviews_arr, is_text=False):
    """Remove the stand-alone token ``nan`` from a string or from each string in a list.

    :param reviews_arr: a single string when ``is_text`` is true, otherwise an
        iterable of strings.
    :param is_text: treat ``reviews_arr`` as one string instead of a list.
        This is a real default now; it was previously written as the
        annotation ``is_text: False``, which left the argument mandatory.
    :return: the cleaned string, or a list of cleaned strings.
    """
    # Word boundaries keep words that merely contain "nan" (e.g. "finance",
    # "banana") intact while still removing a stringified NaN value.
    nan_token = r'\bnan\b'
    if is_text:
        return re.sub(nan_token, '', reviews_arr)
    return [re.sub(nan_token, '', review) for review in reviews_arr]
def remove_html(reviews_arr, is_text=False):
    """Delete every match of the module-level ``html_cleaner`` pattern.

    :param reviews_arr: a single string when ``is_text`` is true, otherwise an
        iterable of strings.
    :param is_text: treat ``reviews_arr`` as one string instead of a list.
        This is a real default now; it was previously written as the
        annotation ``is_text: False``, which left the argument mandatory.
    :return: the cleaned string, or a list of cleaned strings.
    """
    if is_text:
        return re.sub(html_cleaner, '', reviews_arr)
    return [re.sub(html_cleaner, '', review) for review in reviews_arr]
def remove_emoji(reviews_arr, is_text=False):
    """Delete every match of the module-level ``emoji_cleaner`` pattern.

    :param reviews_arr: one string when ``is_text`` is true, otherwise an
        iterable of strings.
    :param is_text: treat ``reviews_arr`` as a single string.
    :return: the cleaned string, or a list of cleaned strings.
    """
    if not is_text:
        cleaned = []
        for review in reviews_arr:
            cleaned.append(re.sub(emoji_cleaner, '', review))
        return cleaned
    return re.sub(emoji_cleaner, '', reviews_arr)
def remove_email(reviews_arr, is_text=False):
    """Delete every match of the module-level ``email_cleaner`` pattern.

    :param reviews_arr: a single string when ``is_text`` is true, otherwise an
        iterable of strings.
    :param is_text: treat ``reviews_arr`` as one string instead of a list.
        This is a real default now; it was previously written as the
        annotation ``is_text: False``, which left the argument mandatory.
    :return: the cleaned string, or a list of cleaned strings.
    """
    if is_text:
        return re.sub(email_cleaner, '', reviews_arr)
    return [re.sub(email_cleaner, '', review) for review in reviews_arr]

def remove_non_alphabetic(words, return_arr=False):
    """Strip everything matched by ``only_alphabetic`` from each word.

    :param words: iterable of strings.
    :param return_arr: accepted for interface compatibility; not used.
    :return: list with one cleaned string per input word (may contain
        empty strings).
    """
    stripped = []
    for word in words:
        stripped.append(re.sub(only_alphabetic, '', word))
    return stripped
def remove_punctuation(words):
    """Strip everything matched by ``punctuation_cleaner`` from each word.

    :param words: iterable of strings.
    :return: list with one cleaned string per input word.
    """
    cleaned = []
    for word in words:
        cleaned.append(re.sub(punctuation_cleaner, "", word))
    return cleaned
def remove_non_ascii(text):
    """Return ``text`` with every match of the ``non_ascii`` pattern deleted."""
    ascii_only = re.sub(non_ascii, '', text)
    return ascii_only
def remove_new_lines(reviews_arr):
    """Collapse every run of whitespace in each review into a single space.

    Despite the name this affects all whitespace (spaces, tabs, newlines),
    not just line breaks.

    :param reviews_arr: iterable of strings.
    :return: list of strings with whitespace runs replaced by ``' '``.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return [re.sub(r'\s+', ' ', review) for review in reviews_arr]
def remove_single_quotes(reviews_arr):
    """Delete every single-quote character from each review.

    :param reviews_arr: iterable of strings.
    :return: list of strings without ``'`` characters.
    """
    unquoted = []
    for review in reviews_arr:
        unquoted.append(re.sub("\'", "", review))
    return unquoted
def remove_double_quotes(reviews_arr):
    """Delete every double-quote character from each review.

    :param reviews_arr: iterable of strings.
    :return: list of strings without ``"`` characters.
    """
    unquoted = []
    for review in reviews_arr:
        unquoted.append(re.sub("\"", "", review))
    return unquoted


# @param: list of sentences
def clean_data(review_arr):
    """Run the list-oriented noise-removal helpers over ``review_arr``.

    Steps, in order: nan tokens, single quotes, double quotes, emoji, HTML
    tags, e-mail addresses, then whitespace collapsing.

    :param review_arr: iterable of review strings.
    :return: list of cleaned strings, one per input review.
    """
    # is_text is passed explicitly so the list branch of each helper is
    # selected regardless of how that helper declares its default; omitting it
    # raised TypeError for helpers where the argument is mandatory.
    rev_sentences = remove_nan(review_arr, False)
    rev_sentences = remove_single_quotes(rev_sentences)
    rev_sentences = remove_double_quotes(rev_sentences)
    rev_sentences = remove_emoji(rev_sentences, False)
    rev_sentences = remove_html(rev_sentences, False)
    rev_sentences = remove_email(rev_sentences, False)
    rev_sentences = remove_new_lines(rev_sentences)
    return rev_sentences

def clean_text(review):
    """Clean a single review string.

    Applies, in order: digit removal, nan removal, HTML removal, e-mail
    removal and emoji removal, each in single-string mode.

    :param review: the review text.
    :return: the cleaned text.
    """
    # digits and nan first
    text = remove_nan(remove_num(review), True)
    # then html, email, emoji — all called with is_text=True
    for scrub in (remove_html, remove_email, remove_emoji):
        text = scrub(text, True)
    return text

# sent_words: the words of one sentence in a review
def clean_sent_words(sent_words, min_word_len = 0):
    """Strip punctuation and non-alphabetic characters from each word.

    :param sent_words: list of words belonging to a single sentence.
    :param min_word_len: only words strictly longer than this are kept; the
        default of 0 drops words that became empty after cleaning.
    :return: list of cleaned, sufficiently long words.
    """
    alphabetic_only = remove_non_alphabetic(remove_punctuation(sent_words))
    return [word for word in alphabetic_only if len(word) > min_word_len]

### Sentence tokenize functions

In [13]:
# string -> list of word tokens
def nltk_tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the tokens."""
    tokens = nltk.word_tokenize(text)
    return tokens
# generator: one token list per review sentence
# (simple_preprocess comes from gensim.utils)
def gensim_sent_to_words(reviews_arr):
    """Lazily tokenize each item of ``reviews_arr`` with gensim.

    Every item is converted with ``str()`` and passed to
    ``simple_preprocess(..., deacc=True)``; per the gensim documentation
    ``deacc=True`` strips accent marks from the tokens.

    :param reviews_arr: iterable of review sentences.
    :return: generator yielding one list of tokens per item.
    """
    yield from (
        simple_preprocess(str(review_sent), deacc=True)
        for review_sent in reviews_arr
    )
def gensim_tokenize(text):
    """Return the tokens produced by ``gensim.utils.tokenize`` as a list.

    Punctuation does not appear in the resulting tokens.
    """
    return [token for token in tokenize(text)]

### Stopwords removal functions

In [14]:
def nltk_remove_stopwords(words, stopwords = custom_stop_words):
    """Return the words that are not stop words, preserving order.

    :param words: iterable of word strings.
    :param stopwords: collection of stop words. The default is bound to the
        ``custom_stop_words`` list object when this function is defined, so
        in-place changes to that list are picked up, but rebinding the global
        name to a new list is not.
    :return: list of the remaining words.
    """
    # A set makes each membership test O(1) instead of scanning the whole list.
    stop_set = set(stopwords)
    return [word for word in words if word not in stop_set]
def gensim_remove_stopwords(texts, stopwords = custom_stop_words):
    """Tokenize each document with gensim and drop stop words.

    :param texts: iterable of documents; each is converted with ``str()`` and
        tokenized by ``simple_preprocess``.
    :param stopwords: collection of stop words. The default is bound to the
        ``custom_stop_words`` list object when this function is defined, so
        in-place changes to that list are picked up, but rebinding the global
        name to a new list is not.
    :return: list of token lists, one per document.
    """
    # Built once per call so every token lookup across all documents is O(1).
    stop_set = set(stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

# App and brand names (plus the generic "app"/"application") to be treated as stop words.
# Deliberately not added: meditation, "empower your ride your way", mindfulness,
# "simple habit sleep meditation", grab.
# NOTE(review): the stop-word filters in this file compare one token at a time, so
# multi-word entries (e.g. "m1 finance", "think or swim") can only match a token that
# itself contains spaces — confirm this is intended.
# NOTE(review): "mooodfit", "stockwits" and "acorm" look like misspellings — confirm
# whether they are deliberate variants.
appnames = ["app", "application", "acorns", "acorn", "acorm", "atom", "betterment", "bitmart", "blockfi", "e-trade", "etrade", "e trade", "fidelity", "fidelity spire", "spire", "gemini", "ibkr",
               "m1", "m1 finance", "marketsim", "kucoin", "personal capital", "power e trade", "public com", "rally rd", "robinhood", "rb", "schwab", "stash",
                "stockpile", "stockwits", "td", "ameritrade", "thinkorswim", "think or swim", "titan", "vanguard", "wealthfront", "wealthsimple", "wealth simple",
                "webull", "aura", "betterhelp", "dare", "calm", "happify", "happy not perfect mind", "headspace", "ibreathe", "insight timer", "lumosity", "meditation studio",
                "mooodfit", "moodmission", "mood mission", "nocd", "regain", "rootd", "sanvello", "shine", "slumber", "talkspace", "smiling mind", "tenpercenthappier", "ten percent happier",
                "wakingup", "whatsup", "woebot", "wysa", "youper", "zen", "99 private", "ado boletos", "beat", "blabla", "blablacar", "bolt", "curb", "didi",
                "cabify", "flywheel", "gett", "hopskipdrive", "lyft", "kakao", "moovit", "blue bird", "ola", "rapido", "revel", "uber", "veyo", "via", "waze", "wingz", "ztrip",
               "tj"]
# Informal spellings / word fragments.
spells = ["u", "ish"]
# Auxiliary and modal verbs.
aux_verbs = ["be", "do", "have", "will", "shall", "should", "would", "could", 
             "may", "might", "must", "can", "ought"]
# Filler words, interjections and currency terms. "lot" is listed twice.
non_useful = ["let", "maybe", "finally", "yeah", "oh", "man", "else", "elsewhere", 
              "definitely", "actually", "else where", "lot", "still", "even",
              "dollars", "pounds", "euros", "dollar", "euro", "pound",
              "really", "hey", "lol", "lot", "xoxo", "already"]

def extend_stopwords():
    """Add the custom word groups to ``custom_stop_words`` in place.

    Appends the entries of ``appnames``, ``spells``, ``aux_verbs`` and
    ``non_useful`` that are not already in the list. The list object is
    mutated rather than replaced, so anything holding a reference to it sees
    the additions. Calling this more than once (for example by re-running the
    cell) is safe: words already present are skipped, so the list no longer
    grows with duplicates.

    :return: None
    """
    print("----------adding new keywords to custom_stop_words-----------")
    already_present = set(custom_stop_words)
    for group in (appnames, spells, aux_verbs, non_useful):
        for word in group:
            if word not in already_present:
                custom_stop_words.append(word)
                already_present.add(word)

### Token lemmatize functions

In [15]:
def nltk_lemmatize(word_arr, return_arr = True):
    """Lemmatize each word with the shared ``wordnet_lemmatizer``.

    No part-of-speech argument is passed to the lemmatizer.

    :param word_arr: iterable of words.
    :param return_arr: when true return the list of lemmas, otherwise return
        them joined by single spaces.
    :return: list of lemmas or a space-joined string.
    """
    lemmas = []
    for word in word_arr:
        lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return lemmas if return_arr else " ".join(lemmas)

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the tag is mapped: J -> adjective, V -> verb,
    R -> adverb; N and anything else -> noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag fall back to noun.
    return wordnet.NOUN

def nltk_lemmatize_post_tag(word_arr, return_arr = True):
    """Lemmatize each word using the POS returned by ``get_wordnet_pos``.

    :param word_arr: iterable of words.
    :param return_arr: when true return the list of lemmas, otherwise return
        them joined by single spaces.
    :return: list of lemmas or a space-joined string.
    """
    lemmas = []
    for word in word_arr:
        lemmas.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)))
    return lemmas if return_arr else " ".join(lemmas)

# review_sent_word_arr is 2D: one row per sentence of the review, one entry per word
def nltk_lemmatize_post_tag_sent_arr(review_sent_word_arr, return_arr = False):
    """Lemmatize a review given as a list of word lists and serialize it.

    Words within a sentence are joined with ``,`` and sentences are joined
    with ``;``.

    :param review_sent_word_arr: iterable of sentences, each an iterable of words.
    :param return_arr: accepted for interface compatibility; not used.
    :return: a single string.
    """
    return ";".join(
        ",".join(
            wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word))
            for word in sent_words
        )
        for sent_words in review_sent_word_arr
    )


def nltk_lemmatize_post_tag_rev_words(review_words, return_arr = False):
    """Lemmatize the words of a review using the POS from ``get_wordnet_pos``.

    :param review_words: iterable of words.
    :param return_arr: when true return the list of lemmas, otherwise return
        them joined by commas.
    :return: list of lemmas or a comma-joined string.
    """
    lemmas = []
    for word in review_words:
        lemmas.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)))
    return lemmas if return_arr else ",".join(lemmas)


# @param return_arr forwarded to nltk_lemmatize_post_tag for every sentence:
# True  -> each sentence becomes a list of lemmas
# False -> each sentence becomes one space-joined string of lemmas
def sentences_words_lemmatization(sentence_words_arr, return_arr = True):
    """Apply ``nltk_lemmatize_post_tag`` to every sentence.

    :param sentence_words_arr: iterable of sentences, each a list of words.
    :param return_arr: passed through unchanged to ``nltk_lemmatize_post_tag``.
    :return: list with one lemmatized result per sentence.
    """
    return [
        nltk_lemmatize_post_tag(sentence_words, return_arr)
        for sentence_words in sentence_words_arr
    ]

### Language detection

In [17]:
# Register a spaCy component factory under the name 'en_language_detector_8'.
# NOTE(review): the "_8" suffix presumably works around re-registering the same
# factory name when the cell is re-run — confirm.
@Language.factory('en_language_detector_8')
def get_lang_detector(nlp, name):
    """Factory callback: ignore ``nlp`` and ``name`` and return a new ``LanguageDetector``."""
    return LanguageDetector()
# Load the English pipeline again with "ner" and "tagger" disabled; this rebinds the global `nlp`.
nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger"])
# Alternative registration call, currently disabled.
# Language.factory("en_language_detector", func=get_lang_detector)
# Add the registered detector as the last component of the pipeline.
nlp.add_pipe('en_language_detector_8', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1a162ddcee0>

In [18]:
# Language detection helpers: detect a text's language and test it against an expected one
def detect_lang(text):
    """Run ``text`` through the ``nlp`` pipeline and return ``doc._.language``.

    Example of the returned value:
    ``{'language': 'de', 'score': 0.9999958526911192}``
    """
    return nlp(text)._.language
def check_lang(text, lng = 'en', check_score = False, min_score = 0.8):
    """Tell whether ``text`` is detected as language ``lng``.

    :param text: text to analyse.
    :param lng: expected language code, compared with the ``'language'``
        entry returned by ``detect_lang``.
    :param check_score: when truthy, additionally require the detection
        ``'score'`` to be at least ``min_score``. Any falsy value (``False``,
        ``0``, ``None``, ``numpy.False_``) disables the score check; the
        previous ``is False`` identity test only recognised the literal
        ``False``.
    :param min_score: minimum acceptable score when ``check_score`` is set.
    :return: truthy when the language matches (and the score is high enough,
        if requested), falsy otherwise.
    """
    detected = detect_lang(text)
    if detected['language'] != lng:
        return False
    if not check_score:
        return True
    return detected['score'] >= min_score