In [19]:
#!python -m spacy download el_core_news_sm

In [20]:
import re
import string

import el_core_news_sm
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
from spacy.lang.el import GreekLemmatizer

In [22]:
from spacy.lang.el import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

In [23]:
# Load the Greek spaCy pipeline from the el_core_news_sm package imported above.
# clean_text() uses it for tokenisation and part-of-speech tags.
nlp = el_core_news_sm.load()

In [24]:
# Greek lemmatizer built from the LEMMA_INDEX / LEMMA_EXC / LEMMA_RULES tables
# exported by spacy.lang.el; clean_text() calls it as lemmatizer(word, pos).
# NOTE(review): this constructor signature and the LEMMA_* exports look like the
# spaCy 2.x API -- confirm the pinned spaCy version before upgrading.
lemmatizer = GreekLemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

In [25]:
def loadStopWords():
    """Load the stop-word list from ``stopWords.txt`` in the working directory.

    The file is read as UTF-8 and split on newline characters, so it is
    expected to hold one stop word per line.

    Returns:
        set[str]: the distinct lines of the file. A trailing newline or a
        blank line contributes the empty string ``''`` to the set.

    Raises:
        OSError: if ``stopWords.txt`` cannot be opened.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. UnicodeDecodeError); the manual close() used before was skipped
    # in that case and leaked the file descriptor.
    with open('stopWords.txt', 'r', encoding='utf-8') as stop_words_file:
        return set(stop_words_file.read().split('\n'))

In [26]:
def replaceTerm(text):
    """Collapse spelling variants of "e-banking" / "v-banking" into one token.

    "e-banking", "e banking" -> "ebanking" and "v-banking", "v banking" ->
    "vbanking", so that later whitespace tokenisation keeps each term whole.
    The input is expected to be lower-cased already.

    Args:
        text (str): the text to normalise.

    Returns:
        str: ``text`` with the variants merged.
    """
    # Any "<something>-banking" loses its hyphen (e-banking -> ebanking).
    text = text.replace('-banking', 'banking')
    # Only a stand-alone "e" or "v" is glued to "banking". The word boundary
    # stops the rule from firing inside longer words: a plain substring
    # replace turned "mobile banking" into "mobilebanking".
    return re.sub(r'\b([ev]) banking', r'\1banking', text)


In [27]:
# Stop words used by clean_text(). They are read from stopWords.txt via
# loadStopWords(); the two commented lines below are an alternative that would
# start from the spaCy pipeline's default stop words plus three extra terms.
#sw = nlp.Defaults.stop_words
#sw = sw|{'εχω','απο','ωστε'}
sw = loadStopWords()
def remove_ton(text):
    """Strip the Greek tonos (acute accent) from lower-case vowels.

    Covers all seven accented vowels. The previous table omitted 'ή', so
    words such as "μή" kept their accent and never matched an accent-free
    stop word. The two letters that carry dialytika *and* tonos lose only
    the tonos.

    Args:
        text (str): lower-case text; upper-case accented letters are left
            untouched.

    Returns:
        str: ``text`` with every listed accented letter replaced.
    """
    tonos_map = {
        'ά': 'α', 'έ': 'ε', 'ή': 'η', 'ί': 'ι',
        'ό': 'ο', 'ύ': 'υ', 'ώ': 'ω',
        # dialytika + tonos -> dialytika only
        'ΐ': 'ϊ', 'ΰ': 'ϋ',
    }
    # No replacement produces another key, so the order is irrelevant.
    for accented, plain in tonos_map.items():
        text = text.replace(accented, plain)
    return text
def clean_text(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    Steps, in order: lower-case; merge "e banking"/"v banking" variants
    (replaceTerm); split on whitespace; strip ASCII punctuation from token
    edges; drop the tonos (remove_ton); remove HTML-entity residue
    (quot / amp); drop stop words (module-level ``sw``), tokens containing a
    digit and tokens shorter than two characters; finally lemmatise what is
    left with the module-level ``nlp`` and ``lemmatizer``.

    Args:
        text: the raw comment. Any object is accepted and passed through
            ``str()``; ``None`` and float NaN are treated as "no comment".

    Returns:
        str: the lemmas joined by single spaces, or ``''`` when the comment
        is missing or nothing survives the filters.
    """
    # Missing cells arrive as None/NaN. str() would turn them into the
    # literal words "none"/"nan", which then end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = replaceTerm(str(text).lower())

    html_residue = ('quot', 'amp')
    tokens = []
    # split() without an argument also breaks on tabs and newlines, so a stop
    # word next to a line break is still seen as its own token.
    for word in text.split():
        token = remove_ton(word.strip(string.punctuation))
        # "&quot;" / "&amp;" standing alone: the edge strip leaves quot / amp.
        if token in html_residue:
            continue
        # "&quot;" glued to a word leaves "quot;" / "&quot" inside the token.
        token = token.replace('quot;', '').replace('&quot', '')
        if token in html_residue:
            continue
        # The stop-word check runs after the entity cleanup; before, a stop
        # word wrapped in &quot;...&quot; slipped through unfiltered.
        if token in sw:
            continue
        # Drop tokens that contain any digit (ids, amounts, dates, ...).
        if any(c.isdigit() for c in token):
            continue
        # Drop empty and one-character tokens.
        if len(token) > 1:
            tokens.append(token)

    if not tokens:
        return ''

    text = " ".join(tokens)
    # Re-tokenise with spaCy to obtain POS tags, then keep the first lemma
    # the lemmatizer proposes for each token.
    return " ".join([lemmatizer(t.text, t.pos_)[0] for t in nlp(text)])

In [28]:
# Load the comments dataset from an Azure ML workspace into a pandas DataFrame.
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace that holds the registered dataset.
# NOTE(review): these are hard-coded; consider reading them from environment
# variables or a workspace config file so the notebook is portable and the
# subscription id is not committed with it.
subscription_id = '6ed9d167-b2e6-41b8-9500-35e6df64d9dc'
resource_group = 'MLRG'
workspace_name = 'erbbimlws'

# Handle to the workspace identified above.
# NOTE(review): presumably this authenticates against Azure (possibly
# interactively) -- confirm how credentials are supplied where this runs.
workspace = Workspace(subscription_id, resource_group, workspace_name)

# Look the dataset up by its registered name and materialise it as a DataFrame;
# later cells read its CON_COMMENTS and CON_ROW_ID columns.
dataset = Dataset.get_by_name(workspace, name='vBankingComments')
df = dataset.to_pandas_dataframe()

In [None]:
#df.head()

In [29]:
# Run the cleaning / lemmatisation pipeline on every comment and store the
# resulting space-separated lemma string in a new 'tokenized' column.
# Missing comments are passed to clean_text as-is: fillna only runs in a later cell.
df['tokenized'] = df['CON_COMMENTS'].apply(clean_text)

In [None]:
#df.head(1000)

In [30]:
# Replace every remaining missing value, in all columns, with the string 'N/A'.
# 'tokenized' was computed in an earlier cell, so this does not change what
# clean_text received.
df = df.fillna('N/A')

In [31]:
# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)); min_df=100 ignores
# any term that occurs in fewer than 100 documents.
tfidf = TfidfVectorizer(min_df = 100,ngram_range = (1,2))

In [32]:
# Learn the vocabulary from the lemma strings and compute the TF-IDF weights;
# .toarray() converts the sparse result to a dense array with one row per
# document and one column per term.
# NOTE(review): the dense array grows as documents x terms -- check memory use
# on large datasets.
tfidf_result = tfidf.fit_transform(df['tokenized']).toarray()

In [33]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides so the cell
# runs on both old and current releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# One column per vocabulary term, one row per document.
tfidf_df = pd.DataFrame(tfidf_result, columns = feature_names)

In [34]:
# Make sure every column label is a plain Python str.
tfidf_df.columns = tfidf_df.columns.map(str)

In [35]:
# Long format: one (CON_ROW_ID, variable=term, value=weight) row per
# document/term pair.
# tfidf_df carries a fresh 0..n-1 RangeIndex and concat(axis=1) aligns on index
# labels, so the id column's index is reset first. With a non-default index on
# df the ids and weights would be paired wrongly (or padded with NaN that the
# dropna() below silently discards).
df_f = (
    pd.concat([df[['CON_ROW_ID']].reset_index(drop=True), tfidf_df], axis=1)
    .melt(id_vars=['CON_ROW_ID'], value_vars = tfidf_df.columns)
    .dropna()
)

In [36]:
# Keep only the document/term pairs whose weight is strictly positive,
# i.e. the terms that actually occur in the document.
df_f = df_f.loc[df_f['value'].gt(0)]

In [None]:
#!pip install openpyxl
#import openpyxl

In [37]:
# Export the (CON_ROW_ID, term) pairs to an Excel workbook in the working
# directory, without the DataFrame index.
df_f[['CON_ROW_ID','variable']].to_excel('vBanking_tokens.xlsx',index = False)
# Earlier export variants, kept commented out (absolute local paths):
#df_f[df_f['value']>0].to_excel('D://Downloads//comments_tokens.xlsx')
#df.to_excel('D://Downloads//comments_cleaned.xlsx')

In [None]:
# CON_ROW_ID (rendered as text) of every row of df_f whose term is 'banking'.
df_f.loc[df_f['variable'].eq('banking'), 'CON_ROW_ID'].map(str)

In [None]:
# Show the original comment text for one specific CON_ROW_ID.
print(df.loc[df['CON_ROW_ID'].eq(63699307), 'CON_COMMENTS'])