In [1]:
import re
import string

import gensim
import numpy as np
import pandas as pd


In [2]:
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [3]:
# Load the sample spreadsheet into a DataFrame; preprocess() reads its
# 'ITEM_NAME' column and later cells add new columns to it.
train_df=pd.read_excel('bert_sample.xlsx')
# Bare last expression so the notebook renders the frame.
train_df

Unnamed: 0,ITEM_NAME,CATEGORY_ID
0,CALIBRACION TRANSDUCER 75 nm,CAPITAL ASSEMBLY
1,for pusher whskey,CAPITAL ASSEMBLY
2,Stat 40B Press Head Cup to Carrier from Stati...,CAPITAL ASSEMBLY
3,TRANSD. Cable (4145097103) scrw,CAPITAL ASSEMBLY
4,"ZT200 7,5BAR,13BAR60HZ NUMERO DE SERIE: AIF09...",CAPITAL ASSEMBLY
...,...,...
9995,Export Freight charges By Road,LOGISTICS SERVICE
9996,EXPORT PACKING,LOGISTICS SERVICE
9997,Express Delivery Charges,LOGISTICS SERVICE
9998,Express Delivery Charges for PMF to CHMF Despa...,LOGISTICS SERVICE


In [4]:
# English stop-word set from NLTK; consulted by remove_stopwords().
STOPWORDS = set(stopwords.words('english'))
# Characters deleted by remove_punctuation().
PUNCT_TO_REMOVE = string.punctuation
# Single WordNet lemmatizer instance reused by lemmatize_words().
lemmatizer = WordNetLemmatizer()

def tokenization(text):
    '''Split ``text`` on runs of whitespace and return the list of tokens.

    Parameters
    ----------
    text : str
        The string to tokenize.

    Returns
    -------
    list of str
        The non-empty pieces found between whitespace runs.
    '''
    # Raw string: a backslash-s inside a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on current Python versions.
    # gaps=True tells NLTK the pattern describes the separators, not the tokens.
    # Tokenizer benchmark that motivated this choice:
    # https://towardsdatascience.com/benchmarking-python-nlp-tokenizers-3ac4735100c5
    return regexp_tokenize(text, pattern=r'\s+', gaps=True)
def remove_punctuation(text):
    '''Return ``text`` with every character listed in PUNCT_TO_REMOVE deleted.'''
    # Third argument of str.maketrans lists the characters to drop.
    deletion_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    return text.translate(deletion_table)
def remove_numbers(text):
    '''Replace each ASCII digit (0-9) in ``text`` with a single space.'''
    digit_pattern = re.compile("[0-9]")
    return digit_pattern.sub(" ", text)
def remove_stopwords(text):
    '''Drop every whitespace-separated word found in STOPWORDS.

    ``text`` is converted with ``str()`` first; the surviving words are
    rejoined with single spaces.
    '''
    kept_words = []
    for word in str(text).split():
        if word in STOPWORDS:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

def lemmatize_words(text):
    '''Lemmatize each whitespace-separated word of ``text``.

    Every word is passed through the shared ``lemmatizer`` and the results
    are rejoined with single spaces.
    '''
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

def remove_unwanted(text):
    '''Return a list of the tokens whose length is greater than two.

    Despite the parameter name, ``text`` is iterated item by item, so callers
    pass a sequence of tokens rather than a single string.
    '''
    return list(filter(lambda item: len(item) > 2, text))

In [42]:
def preprocess(df=None, column='ITEM_NAME'):
    '''Clean a text column and split it into tokens.

    Steps, in order: lower-case, delete punctuation, blank out digits, drop
    English stop words, lemmatize, split on whitespace, and discard tokens of
    one or two characters.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame holding the text column. Defaults to the notebook-level
        ``train_df`` so existing ``preprocess()`` calls keep working.
    column : str, optional
        Name of the text column to clean (default ``'ITEM_NAME'``).

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``text``: the cleaned string for each row (before short tokens are
        discarded); ``tokens``: the list of kept tokens for each row.
    '''
    if df is None:
        df = train_df
    # Missing cells would otherwise reach remove_punctuation() as NaN (a float)
    # and raise AttributeError: make them empty strings, and stringify any
    # other non-string cells so every row is processed the same way.
    lower = df[column].fillna('').astype(str).str.lower()
    text = (
        lower
        .apply(remove_punctuation)
        .apply(remove_numbers)
        .apply(remove_stopwords)
        .apply(lemmatize_words)
    )
    tokens = text.apply(tokenization).apply(remove_unwanted)
    return text, tokens


text,token=preprocess()

In [44]:
# Display the per-row token lists returned by preprocess().
token

0                               [calibracion, transducer]
1                                        [pusher, whskey]
2       [stat, press, head, cup, carrier, station, mem...
3                                   [transd, cable, scrw]
4       [bar, bar, numero, serie, aif, fan, radial, pa...
                              ...                        
9995                      [export, freight, charge, road]
9996                                    [export, packing]
9997                          [express, delivery, charge]
9998    [express, delivery, charge, pmf, chmf, despatc...
9999          [express, delivery, charge, dubble, driver]
Name: ITEM_NAME, Length: 10000, dtype: object

In [7]:
from gensim import corpora
import pickle

# Build the gensim vocabulary and the bag-of-words corpus from the token lists.
# `token` is the Series produced by preprocess(); the name `tokens` does not
# hold token lists in this notebook (it is later defined as a function).
dictionary = corpora.Dictionary(token)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in token]

# Persist both artefacts so the LDA cell can reload them; the context manager
# makes sure the pickle file is flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [8]:
# import gensim
# NUM_TOPICS = 5
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
# ldamodel.save('model5.gensim')
# topics = ldamodel.print_topics(num_words=4)
# for topic in topics:
#     print(topic)

(0, '0.068*"insert" + 0.034*"diam" + 0.020*"sandvik" + 0.019*"mfg"')
(1, '0.023*"print" + 0.022*"serial" + 0.020*"new" + 0.020*"para"')
(2, '0.016*"charge" + 0.015*"bol" + 0.015*"seco" + 0.012*"air"')
(3, '0.042*"drill" + 0.020*"type" + 0.016*"desc" + 0.012*"set"')
(4, '0.068*"tool" + 0.036*"repair" + 0.028*"maher" + 0.024*"bar"')


In [17]:
import pyLDAvis
from pyLDAvis import gensim_models

In [20]:
# Reload the vocabulary and corpus saved by the dictionary/corpus cell.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: pickle.load can execute arbitrary code; acceptable here only because
# corpus.pkl is written by this notebook. Do not point it at untrusted files.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# Train a 20-topic LDA model; random_state is fixed so reruns are repeatable.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Pass the model trained above (`lda_model`); no variable named `lda` exists.
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [75]:
# Save the prepared pyLDAvis visualisation to 'lda.html'.
pyLDAvis.save_html(lda_display, 'lda.html')

In [32]:
import re, collections

def tokens(text): 
    """
    Lower-case the text and return every maximal run of the letters a-z,
    in order of appearance.
    """
    lowered = text.lower()
    return [hit.group() for hit in re.finditer('[a-z]+', lowered)]

# Read the reference text once; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open('Big.txt') as reference_file:
    WORDS = tokens(reference_file.read())
# Word -> number of occurrences in the reference text; used by known() for
# membership checks and by correct() to rank candidates.
WORD_COUNTS = collections.Counter(WORDS)

# top 10 words in corpus
print(WORD_COUNTS.most_common(10)) 


def known(words):
    """
    Filter ``words`` down to the ones present in WORD_COUNTS
    and return them as a set.
    """
    recognised = set()
    for candidate in words:
        if candidate in WORD_COUNTS:
            recognised.add(candidate)
    return recognised


def edits0(word): 
    """
    Return the set of strings at edit distance zero from ``word``,
    which is just the word itself.
    """
    return set((word,))



def edits1(word):
    """
    Build the set of strings reachable from ``word`` with a single edit:
    one deletion, one adjacent transposition, one replacement or one
    insertion of a lowercase letter a-z.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    # Visit every cut position, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut position.
        for letter in letters:
            results.add(head + letter + tail)
        if not tail:
            continue
        # Delete the first character of the tail.
        results.add(head + tail[1:])
        # Replace the first character of the tail.
        for letter in letters:
            results.add(head + letter + tail[1:])
        # Swap the first two characters of the tail.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results


def edits2(word):
    """Build the set of strings obtained by applying edits1 twice
    to the input word.
    """
    two_away = set()
    for one_away in edits1(word):
        two_away.update(edits1(one_away))
    return two_away
    
    
def correct(word):
    """
    Pick the most frequent known spelling for ``word``.
    """
    # Try edit distance 0, then 1, then 2; stop at the first distance that
    # yields at least one known word. Later generators are never run once an
    # earlier one succeeds.
    for generate in (edits0, edits1, edits2):
        candidates = known(generate(word))
        if candidates:
            break
    else:
        # Nothing known within two edits: fall back to the input itself.
        candidates = [word]
    return max(candidates, key=WORD_COUNTS.get)


def correct_match(match):
    """
    Spell-correct the word captured by a regex match and re-apply the
    original word's casing (upper, lower or title) to the correction.
    """
    original = match.group()
    fixed = correct(original.lower())
    # Casing checks run in the same priority order: upper, lower, title.
    if original.isupper():
        return str.upper(fixed)
    if original.islower():
        return str.lower(fixed)
    if original.istitle():
        return str.title(fixed)
    # Mixed casing: return the correction as-is.
    return str(fixed)

    
def correct_text_generic(text):
    """
    Run correct_match over every run of ASCII letters in ``text``
    and return the text with those runs substituted.
    """
    word_pattern = re.compile('[a-zA-Z]+')
    return word_pattern.sub(correct_match, text)



# Sanity-check the hand-written corrector on a misspelling of "finally".
print(correct_text_generic('fianlly'))
# !pip install pattern

# Compare against `suggest` from the third-party `pattern` package.
from pattern.en import suggest

print(suggest('fianlly'))
print(suggest('flaot'))

[('the', 28849), ('of', 15661), ('and', 10642), ('to', 8910), ('in', 8332), ('a', 7221), ('was', 3986), ('it', 3513), ('that', 3497), ('i', 3150)]
finally
[('finally', 1.0)]
[('flat', 0.85), ('float', 0.15)]


In [54]:
# suggest(token[0][1])
# Attach the token lists from preprocess() to the frame as a 'tokens' column
# (this mutates train_df in place), then display the frame.
train_df['tokens']=token
train_df

Unnamed: 0,ITEM_NAME,CATEGORY_ID,tokens
0,CALIBRACION TRANSDUCER 75 nm,CAPITAL ASSEMBLY,"[calibracion, transducer]"
1,for pusher whskey,CAPITAL ASSEMBLY,"[pusher, whskey]"
2,Stat 40B Press Head Cup to Carrier from Stati...,CAPITAL ASSEMBLY,"[stat, press, head, cup, carrier, station, mem..."
3,TRANSD. Cable (4145097103) scrw,CAPITAL ASSEMBLY,"[transd, cable, scrw]"
4,"ZT200 7,5BAR,13BAR60HZ NUMERO DE SERIE: AIF09...",CAPITAL ASSEMBLY,"[bar, bar, numero, serie, aif, fan, radial, pa..."
...,...,...,...
9995,Export Freight charges By Road,LOGISTICS SERVICE,"[export, freight, charge, road]"
9996,EXPORT PACKING,LOGISTICS SERVICE,"[export, packing]"
9997,Express Delivery Charges,LOGISTICS SERVICE,"[express, delivery, charge]"
9998,Express Delivery Charges for PMF to CHMF Despa...,LOGISTICS SERVICE,"[express, delivery, charge, pmf, chmf, despatc..."


In [72]:
# Keep only the tokens for which str.isalpha() is true, stored per row in a
# new 'Correct_tokens' column.
# NOTE(review): no spell correction (correct() / suggest) is applied here
# despite the column name - confirm whether that was intended.
train_df['Correct_tokens'] = train_df['tokens'].apply(lambda x: [y for y in x if y.isalpha()])
    

In [76]:
# Export the frame, including the added 'tokens' and 'Correct_tokens' columns.
# NOTE(review): the index is written as well (to_csv default), and the
# list-valued columns presumably end up as their string form - verify that
# whatever reads this file expects both.
train_df.to_csv('corrected_tokens.csv')