<b>Import necessary libraries</b>


In [8]:
import spacy
import string
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from azureml.core import Experiment
from azureml.core import Workspace
from azureml.data import DataType
from spacy.cli.download import download as spacy_download
import os 
from os.path import join as osjoin
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")

<b>Loading the Greek language tools</b>

In [9]:
# Load the small Greek spaCy pipeline with the tagger, parser and NER switched off.
# The model is downloaded only when loading fails because it is not installed,
# so re-running this cell does not reinstall the package on every execution.
try:
    nlp = spacy.load('el_core_news_sm', disable=['tagger', 'parser', 'ner'])
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy_download('el_core_news_sm')
    nlp = spacy.load('el_core_news_sm', disable=['tagger', 'parser', 'ner'])

✔ Download and installation successful
You can now load the model via spacy.load('el_core_news_sm')


<b>Regular expression definitions</b>

In [11]:
# Compiled patterns consumed by replaceTerm. All literals are raw strings so that
# regex escapes such as \s and \d reach the regex engine untouched instead of
# relying on Python tolerating invalid string escape sequences.

# --- Phrase-level rules: "no answer", phone/account numbers, refusals, etc. ---
p1 = re.compile(r'δεν απαντ.{1,3}\s{0,1}',re.IGNORECASE)
p2 = re.compile(r'\sδα\s',re.IGNORECASE)
p3 = re.compile(r'δε.{0,1}\s.{0,3}\s{0,1}βρ.{1,2}κ.\s{0,1}',re.IGNORECASE)
# exactly-10-digit run, optionally preceded by one non-digit character
p4 = re.compile(r'[^\d]?\d{10}')
# 18-digit run (optional leading non-digit) or 20-digit run (mandatory leading non-digit)
p5 = re.compile(r'[^\d]?\d{18}|[^\d]\d{20}')
p6 = re.compile(r'δε[ ν]{0,1} (επιθυμ[α-ω]{2,4}?|[ήη]θ[εέ]λ[α-ω]{1,3}?|θελ[α-ω]{1,4}|.{0,20}ενδιαφ[εέ]ρ[α-ω]{2,4})',re.IGNORECASE)
p7 = re.compile(r'δε[ ν]{0,1} (μπορ[α-ω]{2,5}|.εχει)',re.IGNORECASE)
p8 = re.compile(r'(δεν|μη).*διαθεσιμ[οη]ς{0,1}?',re.IGNORECASE)
p9 = re.compile(r'(δεν|μη)+.*εφικτη?',re.IGNORECASE)
p10 = re.compile(r'δε[ ν]{0,1}.{0,20}θετικ[οόήη]ς{0,1}',re.IGNORECASE)
# NOTE(review): inside a character class '|' is a literal pipe, not alternation;
# '[α-ω|ά-ώ]' therefore also accepts '|'. Left as-is to keep matching unchanged.
p11 = re.compile(r'δε[ ν]{0,1}\s?(γνωρ[α-ω|ά-ώ]{1,8}|ξ[εέ]ρ[α-ω|ά-ώ]{1,4}|απ[αά]ντ[α-ω|ά-ώ]{1,4})',re.IGNORECASE)

# --- Stem rules: each matches a word stem and, via the trailing (.*), everything
# after it up to the end of the line ---
p12 = re.compile(r'εξυπη.?(.*)',re.IGNORECASE)
p13 = re.compile(r'τηλ[εέ]φ.?(.*)',re.IGNORECASE)
p14 = re.compile(r'.*πρ(ο|ό)σ[ωώ]π.?(.*)',re.IGNORECASE)
p15 = re.compile(r'αναμον.?(.*)',re.IGNORECASE)
p16 = re.compile(r'χρ[οό]ν.?(.*)',re.IGNORECASE)
p17 = re.compile(r'εμβ.?(.*)',re.IGNORECASE)
p18 = re.compile(r'υπ[αά]λλ.?(.*)',re.IGNORECASE)
p19 = re.compile(r'(υπο){0,1}καταστ.?(.*)', re.IGNORECASE)
p20 = re.compile(r'πιστωτ.?(.*)', re.IGNORECASE)
p21 = re.compile(r'διαδικα.?(.*)', re.IGNORECASE)
p22 = re.compile(r'φωνητι.?(.*)', re.IGNORECASE)
p23 = re.compile(r'γιο[υύ]ρο.?(.*)', re.IGNORECASE)
p24 = re.compile(r'υπηρεσ.?(.*)', re.IGNORECASE)
p25 = re.compile(r'κατ[αά]ρτι.?(.*)', re.IGNORECASE)
p26 = re.compile(r'ανταπ.?(.*)', re.IGNORECASE)
# p27 matches everything p24 matches plus the accented spelling
p27 = re.compile(r'υπηρ[εέ]σ.?(.*)', re.IGNORECASE)
p28 = re.compile(r'πρ.βλ.μ.(τα){0,1}', re.IGNORECASE)

#### Dictionary correction

In [10]:
# Term -> category lookup. Every term listed in a group is mapped to that group's
# label; an empty label marks terms that should map to an empty token.
corDict = {
    term: label
    for label, terms in [
        ('εξυπηρετηση', [
            'αμεσοτητα', 'καλυτερη', 'ταχυτερη', 'μεταφορα', 'κλησης', 'επικοινωνια', 'μεταφορα', 'τηλεφωνο',
            'εκπροσωπος', 'αμεση', 'γρηγορη', 'εξυπηρετηση', 'ουρα', 'γρηγοροτερη', 'καθυστερηση', 'προσωπικο',
            'ανταποκριση', 'μεγαλυτερη', 'περισσοτερη', 'καταστημα', 'ταχυτητα', 'οργανωση', 'αμεσοι',
            'γρηγοροι', 'ευελικτοι', 'ευκολοι', 'λαθος', 'λανθασμενη', 'ηλεκτρονικο',
            'ηλεκτρονικα', 'περιμενε', 'λειτουργιες', 'οδηγιες', 'επικοινωνιας',
        ]),
        ('αναμονη', ['διαρκεια', 'χρονος', 'αρχικη', 'αναμονη', 'αντιμετωπιση', 'αναμονης']),
        ('φωνητικη πυλη', ['φωνητικη', 'πυλη', 'φωνητικη πυλη', 'συστημα', 'ηχητικες', 'ηχητικα']),
        ('προιοντα', ['καρτα', 'εμβασμα', 'δανεια', 'μετοχων', 'λογαριασμους', 'λογαριασμο']),
        ('υπηρεσιες', [
            'eBanking', 'europhonebanking', 'phonebanking', 'telephonebanking', 'europhone banking',
            'telephone banking', 'υπηρεσιες', 'telephone', 'banking', 'eurobank', 'mobile', 'euro',
            'εφαρμογη', 'εφαρμογης', 'πληρωμη', 'πληρωμης',
        ]),
        ('υπαλληλοι', ['υπαλληλος', 'συμπεριφορα', 'συμπεριφορα υπαλληλου', 'καταρτιση']),
        ('', [
            'προβλημα', 'υπαρχει', 'μεσω', 'τραπεζα', 'μπορουμε', 'πραγματα', 'πελατες', 'μπορω', 'δυο', 'ξερω',
            'δευτερο', 'δεκα', 'γιατι',
        ]),
        ('διαδικασιες', ['γραφειοκρατια', 'διαδικασιες']),
        ('δεν γνωριζω/δεν απαντω', ['δενγνωριζωδεναπαντω', 'τιποτα']),
        ('ευχαριστημενος', ['ολα', 'καλα', 'ενταξει', 'τελεια', 'ενταξει']),
        ('αλλο', ['κατι', 'αλλο']),
    ]
    for term in terms
}

<b>Function definitions</b>

In [13]:
def replaceTerm(text):

    '''Normalise a piece of text through four ordered stages of substitutions.

    Stage 1 rewrites Greek transliterations of English terms, stage 2 applies the
    phrase-level regular expressions, stage 3 glues multi-word expressions into
    single tokens, and stage 4 collapses word stems onto canonical terms.
    The order of the rules, both between and inside the stages, is significant.
    Relies on the module-level compiled patterns p1..p28.
    Note: in clean_text this function is invoked after remove_accent_mark.'''

    # Stage 1: Greek spellings of English words -> Latin spelling
    transliterations = (
        ('γιουροφοουν', 'europhone'),
        ('ιμπανκινγκ', 'ebanking'),
        ('μπανκινγκ', 'banking'),
        ('τελεφοουν', 'telephone'),
        ('γιουρομπανκ', 'eurobank'),
        ('γιουρο μπανκ', 'eurobank'),
        ('βαιμπερ', 'viber'),
        ('ιμαιλ', 'email'),
        ('γιουζερνεημ', 'username'),
        ('εητιεμ', 'ATM'),
        ('φοουν', 'phone'),
        ('γιουρο', 'euro'),
        ('μομπαηλ', 'mobile'),
    )
    for old, new in transliterations:
        text = text.replace(old, new)

    # Stage 2: phrase-level patterns (the longer digit runs are handled before the 10-digit rule)
    phrase_rules = (
        (p5, ' λογαριασμος '),
        (p4, ' τηλεφωνο '),
        (p6, ' δενθελειδενενδιαφερεται '),
        (p10, ' δενθελειδενενδιαφερεται '),
        (p7, ' δενεχειδενμπορει '),
        (p8, ' δενειναιδιαθεσιμος '),
        (p9, ' ανεφικτη '),
    )
    for pattern, token in phrase_rules:
        text = pattern.sub(token, text)

    # Stage 3: join multi-word expressions and expand abbreviations
    joins = (
        ('-banking', 'banking'),
        ('v banking', 'vbanking'),
        ('e banking', 'ebanking'),
        ('follow up', 'followup'),
        ('fup', 'followup'),
        ('f/up', 'followup'),
        ('πυρ/ριο', 'πυρασφαλιστηριο'),
        ('safe drive', 'safedrive'),
        ('safe pocket', 'safepocket'),
        ('alphabank', 'alpha'),
        ('sweet home smart', 'sweethomesmart'),
        ('sweet home', 'sweethome'),
        ('eξασφαλιζω', 'εξασφαλιζω'),
        ('credit card', 'creditcard'),
        ('debit card', 'debitcard'),
        ('life cycle', 'lifecycle'),
        ('π/κ', 'πκ'),
        ('td', 'πκ'),
        ('α/κ', 'ακ'),
        ('δ/α', 'δεναπαντα '),
        ('εκτος αττικης', 'εκτοςαττικης '),
    )
    for old, new in joins:
        text = text.replace(old, new)
    #τδ

    # Stage 4: remaining phrase rules, then stem -> canonical term
    stem_rules = (
        (p1, ' δεναπαντα '),
        (p2, ' δεναπαντα '),
        (p3, ' δεντονβρηκα '),
        (p11, 'δενγνωριζωδεναπαντω'),
        (p12, 'εξυπηρετηση'),
        (p13, 'τηλεφωνο'),
        (p14, 'εκπροσωπος'),
        (p15, 'αναμονης'),
        (p16, 'χρονος'),
        (p17, 'εμβασμα'),
        (p18, 'υπαλληλος'),
        (p19, 'καταστημα'),
        (p20, 'καρτα'),
        (p21, 'διαδικασιες'),
        (p22, 'φωνητικη'),
        (p23, 'euro'),
        (p24, 'υπηρεσιες'),
        (p25, 'καταρτιση'),
        (p26, 'ανταποκριση'),
        (p27, 'υπηρεσιες'),
        (p28, 'εξυπηρετηση'),
    )
    for pattern, token in stem_rules:
        text = pattern.sub(token, text)

    return text

In [14]:
def remove_accent_mark(text):

    '''Return text with the accented lowercase Greek vowels
    replaced by their unaccented forms; every other character is kept as is.'''

    unaccented = str.maketrans({'ά': 'α', 'έ': 'ε', 'ί': 'ι', 'ό': 'ο', 'ώ': 'ω', 'ύ': 'υ', 'ή': 'η'})
    return text.translate(unaccented)

In [36]:
# Disabled helper: load_correctDict is kept only as a string literal, so no function
# is defined by this cell; the cell merely evaluates to (and displays) that string.
# The text describes building the correction dictionary from a 'correct_Tokens' dataset.
'''def load_correctDict(ws):
    
    ''''''It creates a dictionary out of a dataset that containes pairs of (original term, corrected term)''''''
    
    dataset = Dataset.get_by_name(ws, name='correct_Tokens')    
    corDict = dict(dataset.to_pandas_dataframe().to_dict("split")['data'])
    return corDict'''

'def load_correctDict(ws):\n    \n    It creates a dictionary out of a dataset that containes pairs of (original term, corrected term)\n    \n    dataset = Dataset.get_by_name(ws, name=\'correct_Tokens\')    \n    corDict = dict(dataset.to_pandas_dataframe().to_dict("split")[\'data\'])\n    return corDict'

In [16]:
def correct(x,corDict):

    '''Look x up in corDict and return its replacement;
    terms that are not in the dictionary are returned unchanged.'''

    return corDict.get(x, x)

In [20]:
def getTokens(df, idField, textField):

    '''Clean every text and return one row per token.

    Parameters:
        df        -- DataFrame holding at least the columns idField and textField
        idField   -- name of the identifier column
        textField -- name of the column with the raw text

    Returns a DataFrame with the columns [idField, 'tokenized', 'variable'] where
    'tokenized' is the cleaned text of the row and 'variable' is a single token of it
    (the token list is "unpivoted" with explode, so ids repeat once per token).
    A row whose cleaned text is empty yields one row with a missing 'variable'.

    Stop words are loaded through loadStopWords(), which reads the global workspace.
    Note: clean_text stringifies its input, so a missing text is cleaned as the
    string "nan" before the fillna below is reached.'''

    # load stop words for the cleaning of the text --"WORKSPACE" IS READ AUTOMATICALLY
    sw = loadStopWords()

    # work on an explicit copy: adding columns to a slice of the caller's frame is
    # chained assignment (SettingWithCopyWarning) and must never touch the input
    df = df[[idField, textField]].copy()
    df['tokenized'] = df[textField].apply(clean_text, sw=sw)

    df = df.fillna('N/A')

    df['variable'] = df['tokenized'].str.split()
    df_f = df.explode('variable')[[idField, 'tokenized', 'variable']]
    return df_f

In [12]:
def loadStopWords():

    '''Fetch the registered dataset 'stopWords_gr' from the global workspace
    and return its squeezed contents as a set of stop words.'''

    stop_words_frame = Dataset.get_by_name(workspace, name='stopWords_gr').to_pandas_dataframe()
    return set(stop_words_frame.squeeze())

In [19]:
def clean_text(text, sw):
    '''Clean one text and return its lemmatized form as a single space-joined string.

    Parameters:
        text -- the raw text; it is passed through str(), so any value is accepted
        sw   -- collection of stop words to drop (membership is tested with "in")

    Steps: lowercase, split on single spaces, strip punctuation from both ends of each
    word, remove accent marks, apply replaceTerm, drop stop words, strip HTML quote
    entity residue, drop words containing digits, lemmatize with the global nlp pipeline.

    Empty tokens and one-letter words are deliberately NOT removed here.
    Note: replaceTerm receives one space-delimited word at a time, so any rule in it
    that needs a literal space inside the match cannot fire from this function.'''

    # convert text to lowercase and split on single spaces
    words = str(text).lower().split(" ")

    # remove punctuation at the word edges, then the accent marks
    words = [remove_accent_mark(word.strip(string.punctuation)) for word in words]

    # replacements either by rules or regular expressions
    words = [replaceTerm(word) for word in words]

    # remove stop words
    words = [word for word in words if word not in sw]

    # remove quote entities; the full entity '&quot;' is handled first, otherwise
    # stripping 'quot;' out of it would leave a stray '&' behind
    words = [
        word.replace('&quot;', '').replace('&quot', '').replace('quot;', '')
        for word in words
        if word not in {'quot', 'amp'}
    ]

    # remove words that contain numbers (empty tokens are kept on purpose)
    words = [word for word in words if not any(c.isdigit() for c in word)]

    # remove amp & quot leftovers that only appear after the entity stripping above
    words = [word for word in words if word not in {'quot', 'amp'}]

    text = " ".join(words)

    # lemmatize text with the listed pipeline components switched off
    # NOTE(review): presumably this leaves only the lemmatizer active - confirm the
    # lemmas are still meaningful without tagger/morphologizer output
    disabled_pipes = ['tagger', 'parser', 'ner', 'tok2vec', 'morphologizer', 'senter', 'attribute_ruler']
    text = " ".join([t.lemma_ for t in nlp(text, disable=disabled_pipes)])

    return text

In [18]:
def getTokencount(df_f,minCount):

    '''Annotate each row with how often its token occurs in the whole frame.

    Adds the column 'value' (occurrences of the row's 'variable') and overwrites
    'variable' with a single blank for tokens seen fewer than minCount times.
    The frame is modified in place and also returned.'''

    occurrences = df_f['variable'].value_counts().to_dict()
    df_f['value'] = df_f['variable'].map(occurrences)

    # rare tokens are blanked instead of dropped, so their rows survive
    too_rare = df_f['value'].lt(minCount)
    df_f.loc[too_rare, 'variable'] = ' '

    return df_f

In [17]:
def get_ngrams(idf,mindf,minngram,maxngram,idField):

    '''Compute TF-IDF weighted n-grams for the texts in idf['tokenized'].

    Parameters:
        idf      -- DataFrame with the columns idField and 'tokenized'
        mindf    -- min_df passed to TfidfVectorizer
        minngram -- lower bound of the n-gram range
        maxngram -- upper bound of the n-gram range
        idField  -- name of the identifier column

    Returns a long DataFrame with the columns [idField, 'variable', 'value'] holding
    one row per (row of idf, n-gram) pair whose TF-IDF weight is greater than zero.
    Errors raised by TfidfVectorizer (for example when no term survives min_df)
    propagate to the caller.'''

    idf = idf.reset_index(drop = True)
    tfidf = TfidfVectorizer(min_df = mindf, ngram_range = (minngram,maxngram))
    weights = tfidf.fit_transform(idf['tokenized'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name only on releases that predate it
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    tfidf_df = pd.DataFrame(weights.toarray(), columns = feature_names)

    # wide (one column per n-gram) -> long (one row per id / n-gram pair)
    df_i = pd.concat([idf[[idField]],tfidf_df],axis=1).melt(id_vars=[idField],value_vars = tfidf_df.columns).dropna()
    df_i = df_i[df_i['value'] > 0].reset_index(drop=True)
    return df_i

In [21]:
def performNLP(minCount, ngram_param, df, idField, textField, min_importance = 0.7, 
               corDict = None, deleteEmptyTokens = True):

    '''Run the whole pipeline: clean the texts, count tokens, add n-grams, filter.

    Parameters:
        minCount          -- tokens occurring fewer times than this are blanked
        ngram_param       -- sequence (min_df, min_n, max_n) forwarded to get_ngrams
        df                -- DataFrame with the columns idField and textField
        idField           -- name of the identifier column
        textField         -- name of the raw text column
        min_importance    -- rows are kept only when 'value' is greater than this; 'value'
                             is an occurrence count for single tokens and a TF-IDF weight
                             for n-grams, so the same threshold is applied to both scales
        corDict           -- optional mapping token -> corrected token; tokens missing
                             from it become the empty string
        deleteEmptyTokens -- when True, rows whose token is blank or empty are dropped

    Returns a DataFrame with the unique (idField, 'token') pairs.'''

    df_f = getTokens(df, idField,textField)

    df_f = df_f.fillna(' ')

    df_f = getTokencount(df_f, minCount)

    # n-grams are best effort: the run continues with single tokens when they cannot
    # be built, but the reason is reported instead of being swallowed silently
    try:
        ngrams = get_ngrams(df_f, ngram_param[0], ngram_param[1], ngram_param[2], idField)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df_f = pd.concat([df_f, ngrams])
    except Exception as exc:
        print('no bigramms or trigramms were added')
        print('reason: ' + repr(exc))

    # copy so that the column assignments below never target a view of a filtered frame
    df_f = df_f.loc[df_f['value'] > min_importance].copy()

    df_f['token'] = df_f['variable']

    # tokens of five characters or fewer are set to blank
    # NOTE(review): the original comment spoke of one or two character tokens while
    # the threshold is 5 - confirm which one is intended
    df_f.loc[(df_f['token'].str.len() <= 5), 'token'] = ' '

    df_f = df_f.sort_values([idField,'token'])

    # with a dictionary the token is taken from the mapping alone, which overrides
    # the length-based blanking above
    if corDict is not None:
        df_f['token'] = df_f['variable'].map(corDict).fillna('')

    if deleteEmptyTokens:
        # both the blank ' ' and the empty '' produced by the dictionary count as empty
        df_f = df_f[~df_f['token'].isin(['', ' '])]

    df_f = df_f[[idField, 'token']].drop_duplicates().reset_index(drop=True)

    return df_f

In [22]:
def loadTexts(datasetName,idField,textField):

    '''Read the registered dataset datasetName from the global workspace and
    return only its idField and textField columns as a DataFrame.'''

    wanted_columns = [idField, textField]
    full_frame = Dataset.get_by_name(workspace, name=datasetName).to_pandas_dataframe()
    return full_frame[wanted_columns]

In [23]:
def exportResults(fileName,df_f):

    '''Write df_f to "<fileName>.txt" as comma separated text.

    Parameters:
        fileName -- target path without the ".txt" extension
        df_f     -- DataFrame to export; the index is not written

    Lines are terminated with CRLF. Nothing is returned.'''

    target = fileName + '.txt'
    try:
        # keyword spelling since pandas 1.5; the old spelling was removed in pandas 2.0
        df_f.to_csv(target, sep=',', lineterminator='\r\n', index=False)
    except TypeError:
        # older pandas releases only know the previous keyword name
        df_f.to_csv(target, sep=',', line_terminator='\r\n', index=False)
    #datastore.upload_files(fil, target_path='UI/NLP', overwrite=True, show_progress=True)

### Parameter definition

In [30]:
from azureml.core import Workspace, Dataset

# Coordinates of the Azure ML workspace that hosts the registered datasets
subscription_id = '6ed9d167-b2e6-41b8-9500-35e6df64d9dc'
resource_group = 'MLRG'
workspace_name = 'erbbimlws'

# Module-level handle read by loadStopWords and loadTexts
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Passed to performNLP as minCount: tokens occurring fewer times than this are blanked
minCount = 1
# Passed to performNLP as ngram_param: (min_df, smallest n-gram, largest n-gram)
ngram_param = [3,2,2]
# Base name (without extension) of the exported results file
fileName = 'Omilia_Vana'

# Column names of the identifier and of the text to analyse
idField = 'id'
textField = 'NPS_UTTERANCE_'

In [31]:
# Read manually an excel sheet with the comments to be analyzed
#df = pd.read_excel('./xlsxFiles/NPS_CSAT_0102_2022.xlsx',engine='openpyxl',index_col = None)

In [34]:
# Run the full NLP pipeline on df; the last argument (0.9) is min_importance.
# NOTE(review): df is not assigned by any active cell shown in this notebook (the
# read_excel line is commented out and loadTexts is never called) - on a fresh
# kernel this line raises NameError unless df is created first.
df_f = performNLP(minCount,ngram_param,df,idField,textField, 0.9)

In [54]:
# Write the (id, token) pairs to '<fileName>.txt'
exportResults(fileName,df_f)
#run.complete()