In [33]:
#!python -m spacy download el_core_news_sm
#!pip install pyarrow --upgrade
#!pip install openpyxl
#!pip install xlrd

Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 3.9 MB/s  eta 0:00:01
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1


In [1]:
import spacy
import el_core_news_sm
import string
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from spacy.lang.el import GreekLemmatizer

In [3]:
from spacy.lang.el import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

In [4]:
nlp = el_core_news_sm.load()

In [5]:
lemmatizer = GreekLemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

In [6]:
p1 = re.compile('δεν απαντ.{1,3}\s{0,1}',re.IGNORECASE)
p2 = re.compile('\sδα\s',re.IGNORECASE)
p3 = re.compile('δε.{0,1}\s.{0,3}\s{0,1}βρ.{1,2}κ.\s{0,1}',re.IGNORECASE)
p4 = re.compile('[^\d]?\d{10}')
p5 = re.compile('[^\d]?\d{18}|[^\d]\d{20}')
p6 = re.compile('δε[ ν]{0,1} (επιθυμ[α-ω]{2,4}?|ηθελ[α-ω]{1,3}?|θελ[α-ω]{1,4}|.{0,10}ενδιαφερ[α-ω]{2,4})',re.IGNORECASE)
p7 = re.compile('δε[ ν]{0,1} (μπορ[α-ω]{2,5}|.εχει)',re.IGNORECASE)

In [7]:
def loadStopWords():
    sWords = open('stopWords.txt','r',encoding='utf-8')
    sw = set(sWords.read().split('\n'))
    #sw = sw.remove('μη')
    sWords.close()
    return sw

In [8]:
def replaceTerm(text):
    text = p5.sub(' λογαριασμός ',text)
    text = p4.sub(' τηλεφωνο ',text)
    text = p6.sub(' δενθελειδενενδιαφερεται ',text)
    text = p7.sub(' δενεχειδενμπορει ',text)
    text = text.replace('-banking','banking')
    text = text.replace('v banking','vbanking')
    text = text.replace('e banking','ebanking')
    text = text.replace('follow up','followup')
    text = text.replace('safe drive','safedrive')
    text = text.replace('safe pocket','safepocket')
    text = text.replace('sweet home','sweethome')
    text = text.replace('credit card','creditcard')
    text = text.replace('debit card','debitcard')
    text = text.replace('life cycle','lifecycle')
    text = text.replace('π/κ','πκ')
    text = text.replace('td','πκ')
    text = text.replace('α/κ','ακ')
    text = text.replace('δ/α','δεναπαντα ')
    #τδ
    text = p1.sub(' δεναπαντα ',text)
    text = p2.sub(' δεναπαντα ',text)
    text = p3.sub(' δεντονβρηκα ',text)
    return text


In [9]:
#sw = nlp.Defaults.stop_words
#sw = sw|{'εχω','απο','ωστε'}
sw = loadStopWords()
def remove_ton(text):
    diction = {'ά':'α','έ':'ε','ί':'ι','ό':'ο','ώ':'ω','ύ':'υ'}
    for key in diction.keys():
        text = text.replace(key, diction[key])
    return text   
def clean_text(text):
     #text to string
    text = str(text).lower()
    text = replaceTerm(text)
   # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # lower text
    text = [remove_ton(x) for x in text]
    # remove stop words
    text = [x for x in text if x not in sw]
 
    #remove quotes
    text = [x.replace('quot;','').replace('&quot','') for x in text if x not in ['quot','amp']]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # remove amp & quot
    text = [x for x in text if x not in ['quot','amp']]
    # remove words with only one letter
    text = " ".join([t for t in text if len(t) > 1])
    # lemmatize text
    text = " ".join([lemmatizer(t.text,t.pos_)[0] for t in nlp(text)])
   
    return(text)

In [10]:
def correct(x,corDict):
    if x in corDict.keys():
        y = corDict[x]
    else:
        y = x
    return y    

In [11]:
fileNum = 202001

In [12]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

subscription_id = '6ed9d167-b2e6-41b8-9500-35e6df64d9dc'
resource_group = 'MLRG'
workspace_name = 'erbbimlws'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='LinkBranchComments')
df = dataset.to_pandas_dataframe()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code D5CFGL8AD to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.


First partition columns (ordered): ['CON_ROW_ID', 'CON_COMMENTS', 'Year_month', 'Column4', 'Column5', 'Column6']
Found Partition has columns (ordered): ['CON_ROW_ID', 'CON_COMMENTS', 'Year_month', 'Column4']


In [14]:
df = df[df.Year_month == fileNum][['CON_ROW_ID','CON_COMMENTS']]

In [15]:
#df.columns
df.head()
df.shape

(79379, 2)

In [16]:
df['tokenized'] = df['CON_COMMENTS'].apply(clean_text)

In [17]:
df = df.fillna('N/A')

In [None]:
#df[df['tokenized'].str.contains(' χρονι ') ]#[~df['tokenized'].str.contains('banking') ]

In [18]:
#tfidf = TfidfVectorizer(min_df = 1000,ngram_range = (1,2))
tfidf =  TfidfVectorizer(max_features = 500,ngram_range = (1,2))

In [20]:
tfidf_result = tfidf.fit_transform(df['tokenized']).toarray()

In [21]:
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())

In [22]:
tfidf_df.shape

(79379, 500)

In [23]:
tfidf_df.columns = [str(x) for x in tfidf_df.columns]

In [24]:
df_f = pd.concat([df[['CON_ROW_ID']],tfidf_df],axis=1).melt(id_vars=['CON_ROW_ID'],value_vars = tfidf_df.columns).dropna()

In [25]:
df_f = df_f[df_f['value']>0]

In [26]:
df_f.shape

(230786, 3)

In [30]:
df_f['variable'].value_counts().to_excel('tokenlist_branch.xlsx')

In [34]:
corDict = dict(pd.read_excel("corTokens.xls").to_dict("split")['data'])

In [35]:
df_f['token'] = df_f['variable'].apply(lambda x : correct(x,corDict))

In [36]:
df_f = df_f[df_f['token'] !='rmv']

In [37]:
df_f = df_f.fillna('N/A')

In [38]:
df_f = df_f.sort_values(['CON_ROW_ID','token'])

In [39]:
df_f['token_c'] = df_f['token']

In [40]:
df_f = df_f[['CON_ROW_ID','token_c']].drop_duplicates()

In [41]:
#df_f.head()
#df_f.shape
df_f['token_c'].value_counts().to_excel('tokens_c.xlsx')

In [42]:
df_f.to_csv('Branchcomments_tokens_{0}.txt'.format(fileNum),sep ='\t',line_terminator='\r\n',index = False)

In [None]:
#df_f.to_excel('vBanking_tokens_s.xlsx',index = False)
#df_f[df_f['value']>0].to_excel('D://Downloads//comments_tokens.xlsx')
#df.to_excel('D://Downloads//comments_cleaned.xlsx')

In [None]:
#df_f[df_f['CON_ROW_ID'] ==60427536]
df.head()