In [1]:
# !pip install deplacy stanza
import stanza
#----------------------
#!pip install deplacy trankit transformers
import trankit
#----------------------
#!pip install deplacy spacy-udpipe
import spacy_udpipe
#----------------------
#!pip install deplacy spacy_jptdp
import spacy_jptdp
#----------------------
#!pip install --index-url https://pypi.clarin-pl.eu/simple deplacy combo
#!pip install combo
import combo
#import combo.predict
#----------------------
#!pip install deplacy camphr en-udify@https://github.com/PKSHATechnology-Research/camphr_models/releases/download/0.7.0/en_udify-0.7.tar.gz
import pkg_resources,imp
imp.reload(pkg_resources)
import spacy

In [2]:
# Import file
import codecs
# Stop words
#!pip install advertools
import advertools as adv
# Presenting a semantic analysis
import deplacy
import re
import string
import pandas as pd
# Text summarization
#!pip install summa
import summa
from summa import summarizer
from summa import keywords

2022-05-30 16:45:32,141 | INFO | textcleaner.py:12 | <module> | 'pattern' package not found; tag filters are not available for English


In [3]:
class HebrewDataProcessing():
    """Cleaning pipeline for Hebrew text held in a one-column DataFrame.

    ``data_clearing`` chains the individual steps: URL removal, syntactic
    analysis, lemma extraction, stop-word removal, special-character removal
    and re-joining of the remaining tokens into a string.
    """

    # UPOS tags whose words are dropped by ``get_tokens``.
    _EXCLUDED_UPOS = frozenset(["CCONJ", "PUNCT", "ADP", "PRON", "DET", "SCONJ", "NUM", "ADV"])
    # Anything that looks like an HTML/XML tag.
    _HTML_TAG_RE = re.compile(r'<[^>]*>')
    # Runs of emoji / pictograph code points. The pattern is one character
    # class, so the '(', '|' and ')' inside it are literal members as well.
    _EMOJI_RE = re.compile('[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]' + '+')
    # ASCII digits, Latin letters and the plus sign.
    _DIGITS_LATIN_RE = re.compile('[0-9a-zA-Z+]')
    # Tokens that consist solely of one of these characters are discarded.
    _PUNCTUATION = frozenset(string.punctuation) | {' ', '—', '"', '־'}

    def load_data(self, file_name):
        """Read a UTF-8 text file into a DataFrame with a single ``text`` column.

        Each line of the file becomes one row; line terminators are kept.

        Args:
            file_name: path of the file to read.

        Returns:
            pandas.DataFrame with one ``text`` column.
        """
        # The context manager guarantees the handle is closed after reading.
        with codecs.open(file_name, 'r', 'utf-8') as handle:
            data = list(handle.readlines())
        df = pd.DataFrame({'text': data})
        return df

    def get_syntactic_analysis(self, type, df):
        """Build and return the analysis pipeline of the requested library.

        Args:
            type: one of 'stanza', 'trankit', 'spacy-udpipe', 'spacy-jptdp',
                'combo-pytorch' or 'camphr-udify'. (The name shadows the
                builtin but is kept so keyword callers keep working.)
            df: accepted for backward compatibility; not used.

        Returns:
            The loaded pipeline object; call it on a text to analyse it.

        Raises:
            ValueError: if ``type`` is not one of the supported names.
        """
        # Select a library type to perform data syntax analysis
        if type == 'stanza':
            #stanza.download('he')
            nlp = stanza.Pipeline("he")
        elif type == 'trankit':
            nlp = trankit.Pipeline("hebrew")
        elif type == 'spacy-udpipe':
            #spacy_udpipe.download("he")
            nlp = spacy_udpipe.load("he")
        elif type == 'spacy-jptdp':
            nlp = spacy_jptdp.load("he_htb")
        elif type == 'combo-pytorch':
            # A bare ``import combo`` is not guaranteed to load the submodule.
            import combo.predict
            nlp = combo.predict.COMBO.from_pretrained("hebrew-ud27")
        elif type == 'camphr-udify':
            nlp = spacy.load("en_udify")
        else:
            raise ValueError(f"Unsupported analysis library: {type!r}")
        # To visualise a parse, run ``doc = nlp(text)`` on a real text and then:
        #deplacy.render(doc,WordRight=True)
        #deplacy.serve(doc,port=None,RtoL=True)
        return nlp

    def remove_urls(self, hebrew_text):
        """Return ``hebrew_text`` with http(s):// and www. URLs removed."""
        return re.sub(r'https?://\S+|www\.\S+', '', hebrew_text)

    def get_tokens(self, doc):
        """Extract lemmas from an analysed document, skipping excluded UPOS tags.

        Args:
            doc: object exposing ``sentences``, each with ``words`` that carry
                ``lemma``, ``text`` and ``upos`` attributes.

        Returns:
            list of str, one entry per kept word.
        """
        # A missing lemma falls back to the surface form instead of the
        # literal string "None".
        return [
            str(word.lemma if word.lemma is not None else word.text)
            for sent in doc.sentences
            for word in sent.words
            if word.upos not in self._EXCLUDED_UPOS
        ]

    def remove_stopwords(self, hebrew_data):
        """Return the tokens of ``hebrew_data`` that are not Hebrew stop words.

        The stop-word list comes from ``advertools`` (``adv.stopwords['hebrew']``).
        """
        # A set gives constant-time membership tests per token.
        stop_words = set(adv.stopwords['hebrew'])
        return [token for token in hebrew_data if token not in stop_words]

    def remove_special_characters(self, hebrew_data):
        """Strip HTML tags, emoji, digits and Latin letters from every token.

        Tokens that end up empty, or that are a single punctuation character,
        are dropped so that joining the result never yields stray spaces.

        Args:
            hebrew_data: iterable of token strings.

        Returns:
            list of cleaned, non-empty token strings.
        """
        cleaned_tokens = []
        for token in hebrew_data:
            token = self._HTML_TAG_RE.sub('', token)
            token = self._EMOJI_RE.sub('', token)
            # remove numbers and eng words
            token = self._DIGITS_LATIN_RE.sub('', token)
            if token and token not in self._PUNCTUATION:
                cleaned_tokens.append(token)
        return cleaned_tokens

    def convert_list_to_string(self, hebrew_data):
        """Join the tokens of ``hebrew_data`` with single spaces."""
        return ' '.join(hebrew_data)

    @staticmethod
    def _map_cells(frame, func):
        """Apply ``func`` to every cell of ``frame`` and return the new frame."""
        # ``DataFrame.map`` replaces the deprecated ``applymap`` in newer
        # pandas; the check is on the class so a column named "map" cannot
        # be mistaken for the method.
        if hasattr(pd.DataFrame, 'map'):
            return frame.map(func)
        return frame.applymap(func)

    def data_clearing(self, df):
        """Run the full cleaning pipeline over every cell of ``df``.

        Args:
            df: DataFrame whose cells are raw text strings.

        Returns:
            A new DataFrame of the same shape whose cells are cleaned strings.
        """
        df_nlp = self._map_cells(df, self.remove_urls)
        nlp_stanza = self.get_syntactic_analysis('stanza', df_nlp)
        df_nlp = self._map_cells(df_nlp, nlp_stanza)
        df_nlp = self._map_cells(df_nlp, self.get_tokens)
        #df_nlp.to_csv('stanza.csv', index = False)
        #df_nlp = pd.read_csv('stanza.csv',converters={'text': eval})
        df_nlp = self._map_cells(df_nlp, self.remove_stopwords)
        df_nlp = self._map_cells(df_nlp, self.remove_special_characters)
        df_nlp = self._map_cells(df_nlp, self.convert_list_to_string)
        return df_nlp

In [None]:
# Driver: load the raw corpus, clean it, persist the result and summarise it.
if __name__ == "__main__":
    obj = HebrewDataProcessing()
    df= obj.load_data('data/hebrew_text.tsv')
    print(df.head())
    df_nlp = obj.data_clearing(df)
    df_nlp.to_csv('data/preprocessing_hebrew.csv', index = False, header=False)
    print(df_nlp.head())

    def hebrow_summarizer(str_text):
        """Print the summa summary and keywords computed for ``str_text``."""
        print('Text summary:\n', summarizer.summarize(str_text))
        print('keywords:\n', keywords.keywords(str_text))

    # Join the cleaned rows directly: DataFrame.to_string() would also feed
    # the column header, the row index numbers and alignment padding into the
    # summarizer. One row per line keeps the rows separated.
    str_text = '\n'.join(df_nlp['text'])
    hebrow_summarizer(str_text)

2022-05-30 16:45:32 INFO: Loading these models for language: he (Hebrew):
| Processor | Package |
-----------------------
| tokenize  | htb     |
| mwt       | htb     |
| pos       | htb     |
| lemma     | htb     |
| depparse  | htb     |

2022-05-30 16:45:32,261 | INFO | core.py:112 | __init__ | Loading these models for language: he (Hebrew):
| Processor | Package |
-----------------------
| tokenize  | htb     |
| mwt       | htb     |
| pos       | htb     |
| lemma     | htb     |
| depparse  | htb     |

2022-05-30 16:45:32 INFO: Use device: cpu
2022-05-30 16:45:32,263 | INFO | core.py:123 | __init__ | Use device: cpu
2022-05-30 16:45:32 INFO: Loading: tokenize
2022-05-30 16:45:32,264 | INFO | core.py:129 | __init__ | Loading: tokenize
2022-05-30 16:45:32 INFO: Loading: mwt
2022-05-30 16:45:32,276 | INFO | core.py:129 | __init__ | Loading: mwt
2022-05-30 16:45:32 INFO: Loading: pos
2022-05-30 16:45:32,303 | INFO | core.py:129 | __init__ | Loading: pos


                                                text
0  ממש כואב ..... אני בוכה עם המשפחה שלא תדעו עוד...
1                                  איש יקר שלנו\t0\n
2                         כל הכבוד והמון בהצלחה\t0\n
3  " תל חי , רובי . בכל העצב הזה היית קרן אור של ...
4            נקי כפיים ובר לבב בהצלחה לך ולנו .\t0\n


2022-05-30 16:45:32 INFO: Loading: lemma
2022-05-30 16:45:32,669 | INFO | core.py:129 | __init__ | Loading: lemma
2022-05-30 16:45:32 INFO: Loading: depparse
2022-05-30 16:45:32,721 | INFO | core.py:129 | __init__ | Loading: depparse
2022-05-30 16:45:33 INFO: Done loading processors!
2022-05-30 16:45:33,314 | INFO | core.py:179 | __init__ | Done loading processors!


                                     text
0                   כאב בכה משפחה תדע צער
1                                 איש יקר
2                          כבוד מון הצלחה
3  תל חי רובי עצב קרן אור תקוה נשיא ישראל
4                     נקי כף בר לבב הצלחה
