https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html



Uncommend and run the following pip & python commands when running a new compute for the <b> first</b> time! 

In [1]:
#!python -m spacy download el_core_news_sm
#!pip install pyarrow --upgrade
#!pip install openpyxl
#!pip install xlrd

In [2]:
import spacy
import el_core_news_sm
import string
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from azureml.core import Experiment
from azureml.core import Workspace, Dataset

In [3]:
from spacy.lang.el import GreekLemmatizer

In [4]:
from spacy.lang.el import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

In [5]:
nlp =spacy.load('el_core_news_sm', disable=['tagger', 'parser', 'ner'])


In [6]:
lemmatizer = GreekLemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

In [7]:
p1 = re.compile('δεν απαντ.{1,3}\s{0,1}',re.IGNORECASE)
p2 = re.compile('\sδα\s',re.IGNORECASE)
p3 = re.compile('δε.{0,1}\s.{0,3}\s{0,1}βρ.{1,2}κ.\s{0,1}',re.IGNORECASE)
p4 = re.compile('[^\d]?\d{10}')
p5 = re.compile('[^\d]?\d{18}|[^\d]\d{20}')
p6 = re.compile('δε[ ν]{0,1} (επιθυμ[α-ω]{2,4}?|ηθελ[α-ω]{1,3}?|θελ[α-ω]{1,4}|.{0,10}ενδιαφερ[α-ω]{2,4})',re.IGNORECASE)
p7 = re.compile('δε[ ν]{0,1} (μπορ[α-ω]{2,5}|.εχει)',re.IGNORECASE)
p8 = re.compile('(δεν|μη).*διαθεσιμ[οη]ς{0,1}?',re.IGNORECASE)
p9 = re.compile('(δεν|μη)+.*εφικτη?',re.IGNORECASE)


In [8]:
def loadStopWords():
    sWords = open('stopWords.txt','r',encoding='utf-8')
    sw = set(sWords.read().split('\n'))
    #sw = sw.remove('μη')
    sWords.close()
    return sw

In [9]:
def replaceTerm(text):
    text = p5.sub(' λογαριασμός ',text)
    text = p4.sub(' τηλεφωνο ',text)
    text = p6.sub(' δενθελειδενενδιαφερεται ',text)
    text = p7.sub(' δενεχειδενμπορει ',text)
    text = p8.sub(' δενειναιδιαθεσιμος ',text)
    text = p9.sub(' ανεφικτη ',text)
    text = text.replace('-banking','banking')
    text = text.replace('v banking','vbanking')
    text = text.replace('e banking','ebanking')
    text = text.replace('follow up','followup')
    text = text.replace('fup','followup')
    text = text.replace('f/up','followup')
    text = text.replace('πυρ/ριο','πυρασφαλιστηριο')
    text = text.replace('safe drive','safedrive')
    text = text.replace('safe pocket','safepocket')
    text = text.replace('alphabank','alpha')
    text = text.replace('sweet home smart','sweethomesmart')
    text = text.replace('sweet home','sweethome')
    text = text.replace('eξασφαλιζω','εξασφαλιζω')
    text = text.replace('credit card','creditcard')
    text = text.replace('debit card','debitcard')
    text = text.replace('life cycle','lifecycle')
    text = text.replace('π/κ','πκ')
    text = text.replace('td','πκ')
    text = text.replace('α/κ','ακ')
    text = text.replace('δ/α','δεναπαντα ')
    text = text.replace('εκτος αττικης','εκτοςαττικης ')
    #τδ
    text = p1.sub(' δεναπαντα ',text)
    text = p2.sub(' δεναπαντα ',text)
    text = p3.sub(' δεντονβρηκα ',text)
    
    return text


In [10]:
#sw = nlp.Defaults.stop_words
#sw = sw|{'εχω','απο','ωστε'}
sw = loadStopWords()
def remove_ton(text):
    diction = {'ά':'α','έ':'ε','ί':'ι','ό':'ο','ώ':'ω','ύ':'υ'}
    for key in diction.keys():
        text = text.replace(key, diction[key])
    return text   
def clean_text(text):
     #text to string
    text = str(text).lower()
    text = replaceTerm(text)
    
   # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # lower text
    text = [remove_ton(x) for x in text]
    # remove stop words
    text = [x for x in text if x not in sw]
 
    #remove quotes
    text = [x.replace('quot;','').replace('&quot','') for x in text if x not in {'quot','amp'}]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # remove amp & quot
    text = [x for x in text if x not in ['quot','amp']]
    # remove words with only one letter
    text = " ".join([t for t in text if len(t) > 1])
     # lemmatize text
    text = " ".join([lemmatizer(t.text,t.pos_)[0] for t in nlp(text)])
   
    return(text)

In [11]:
def correct(x,corDict):
    if x in corDict.keys():
        y = corDict[x]
    else:
        y = x
    return y    

In [12]:
def get_ngrams(idf,mindf,minngram,maxngram):
    tfidf = TfidfVectorizer(min_df = mindf,ngram_range = (minngram,maxngram))
    tfidf_result = tfidf.fit_transform(idf['tokenized']).toarray()
    tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
    tfidf_df.columns = [str(x) for x in tfidf_df.columns]
    df_i = pd.concat([df[['CON_ROW_ID']],tfidf_df],axis=1).melt(id_vars=['CON_ROW_ID'],value_vars = tfidf_df.columns).dropna()
    df_i = df_i[df_i['value']>0]
    return df_i

In [13]:
def cleanComments(df):
    df = df[['CON_ROW_ID','CON_COMMENTS']]
    df['tokenized'] = df['CON_COMMENTS'].apply(clean_text)
    df = df.fillna('N/A')
    df['variable'] = df['tokenized'].str.split()
    return df

In [14]:
def getTokens(df):
    df = cleanComments(df)
    df_f = df.explode('variable')[['CON_ROW_ID','variable']]
    return df_f


In [15]:
def getTokencount(df_f,minCount):
    tokenCount = df_f['variable'].value_counts().to_dict()
    df_f['value'] = df_f['variable'].map(tokenCount)
    df_f=df_f[df_f['value']>=minCount] 
    return df_f

In [16]:
def loadComments(fileNum):
    # azureml-core of version 1.0.72 or higher is required
    # azureml-dataprep[pandas] of version 1.1.34 or higher is required
   


    dataset = Dataset.get_by_name(workspace, name='LinkComments{0}'.format(fileNum))
    df = dataset.to_pandas_dataframe()
    return df

In [17]:
subscription_id = '6ed9d167-b2e6-41b8-9500-35e6df64d9dc'
resource_group = 'MLRG'
workspace_name = 'erbbimlws'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [18]:
experiment = Experiment(workspace = workspace, name = "Link_Comments")

In [19]:
run = experiment.start_logging(snapshot_directory=None)

In [20]:
fileNum = '202102'

In [21]:
run.log('fileNum',fileNum)

In [22]:
df = loadComments(fileNum)

In [23]:
df = cleanComments(df)

In [24]:
df_f = getTokens(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
minCount = 30

In [26]:
run.log('minCount',minCount)

In [27]:

df_f = getTokencount(df_f,minCount)

In [28]:
#ngrams parameters
mindf,minngram,maxngram = 1000,2,3

In [29]:
run.log_table('Parameters',{'Param':['mindf','minngram','maxngram'],'Values':[mindf,minngram,maxngram]})

In [30]:
df_f = df_f.append(get_ngrams(df,mindf,minngram,maxngram ))


In [31]:
#df_tokenCount = pd.read_excel('tokenlist.xlsx',engine='openpyxl')


In [32]:
df_f['variable'].value_counts().to_excel('tokenlistTotal.xlsx')

In [33]:
corDict = dict(pd.read_excel("corTokens.xls").to_dict("split")['data'])

In [34]:
df_f['token'] = df_f['variable'].apply(lambda x : correct(x,corDict))

In [35]:
df_f = df_f[df_f['token'] !='rmv']

In [36]:
df_f = df_f[df_f['token'].str.len() >1]

In [37]:
#df_f['token'].value_counts().to_excel('tokenlist.xlsx')

In [38]:
df_f = df_f.fillna('N/A')

In [39]:
df_f = df_f.sort_values(['CON_ROW_ID','token'])

In [40]:
df_f = df_f[['CON_ROW_ID','token']].drop_duplicates()

In [41]:
df_f.to_csv('comments_tokens_{0}.txt'.format(fileNum),sep ='\t',line_terminator='\r\n',index = False)

In [42]:
#df_f['token'].value_counts().to_excel('tokenlist_new.xlsx')

In [43]:
#df
#df[df.tokenized.str.contains('/')].count()

In [44]:
print('comments_tokens_{0}.txt'.format(fileNum))

comments_tokens_202102.txt


In [45]:
run.log('Output','comments_tokens_{0}.txt'.format(fileNum))

In [46]:
run.complete()