In [1]:
# All functions other than lemmatize_verbs are optional helpers, kept in case we need them later.
import re, string, unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer

def remove_non_ascii(words):
    """Strip non-ASCII characters from each word in a list of tokenized words.

    Every word is NFKD-normalized first, then encoded to ASCII with
    unencodable characters ignored, so anything without an ASCII form is
    dropped. A word may become an empty string; it is still kept in the
    returned list.
    """
    def _ascii_only(text):
        decomposed = unicodedata.normalize('NFKD', text)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every tokenized word converted to lowercase."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each word in a list of tokenized words.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted. Words that end up empty are
    left out of the returned list.
    """
    cleaned = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in cleaned if token != '']

def replace_numbers(words):
    """Replace all-digit words with their textual representation.

    A word for which ``str.isdigit()`` is true is passed to the inflect
    engine's ``number_to_words``; every other word is kept unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list holding, in their original order, the tokens that are
        not in NLTK's English stop-word list. The comparison is an exact
        string match.
    """
    # Fetch the stop-word list once and keep it in a set: the lookup is
    # loop-invariant, and set membership avoids rescanning the whole list
    # for every token.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Porter stem of every word in a list of tokenized words."""
    porter = PorterStemmer()
    return [porter.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize, as verbs, every word in a list of tokenized words.

    Each word is split on underscores, every piece is lemmatized with
    ``pos='v'``, and the pieces are joined back together with underscores.
    """
    wordnet = WordNetLemmatizer()
    result = []
    for token in words:
        pieces = [wordnet.lemmatize(piece, pos='v') for piece in token.split("_")]
        result.append("_".join(pieces))
    return result

def processing(words):
    """Run the full normalization pipeline over a list of tokenized words.

    The steps are applied in this order, each one receiving the output of
    the previous one: non-ASCII removal, lowercasing, punctuation removal,
    number replacement, stop-word removal, stemming, verb lemmatization.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
        stem_words,
        lemmatize_verbs,
    )
    for step in pipeline:
        words = step(words)
    return words
    
# Example usage
l = [
    "python",
    "deep_learning",
    "artificial_intelligence",
    "verbal_communication",
    "accounting",
    "c++",
    "c#",
    "went",
    "go",
]

# Stemming on its own; underscores in the tokens are not removed first.
print(stem_words(l))
# Verb lemmatization on its own.
print(lemmatize_verbs(l))
# The whole pipeline.
print(processing(l))

['python', 'deep_learn', 'artificial_intellig', 'verbal_commun', 'account', 'c++', 'c#', 'went', 'go']
['python', 'deep_learn', 'artificial_intelligence', 'verbal_communication', 'account', 'c++', 'c#', 'go', 'go']
['python', 'deep_learn', 'artificial_intellig', 'verbal_commun', 'account', 'c', 'c', 'go', 'go']
