In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [2]:
import regex as re
import operator
import codecs

import local_settings
import django
django.setup()
from sefaria.model import *



In [3]:
import hebrew_spellcheck

In [4]:
# Lookup table exposed by the hebrew_spellcheck module. Later cells query it with
# word_expander.get(word, word) to swap a word for its expanded form.
word_expander = hebrew_spellcheck.word_expander

In [5]:
# Show how many entries the expansion table holds.
len(word_expander)

354051

# Get a list of all stopwords
### This includes:
1. The Dicta Prefixes
2. A list of common stopwords

In [None]:
def get_clean_segments(data):
    """
    Return the text part of one export line, reduced to Hebrew words.

    Everything up to and including the first ``~~`` is discarded. In the
    remaining text, hyphens and maqafs become spaces, every character other
    than a space, a Hebrew letter (U+05D0-U+05EA), a quote mark (" ' ״ ׳) or
    the prefix marker ┉ becomes a space, and quote marks that open a word are
    replaced by a space.

    @param data One unicode line that contains the ``~~`` separator.
    @return The cleaned unicode text.
    @raise IndexError if ``data`` contains no ``~~``.
    """
    # maxsplit=1: a later '~~' inside the text itself must not truncate it.
    data = data.split(u'~~', 1)[1]
    data = re.sub(u'[-־]', u' ', data)
    # The patterns are u'' literals with doubled backslashes rather than ur''
    # literals: the runtime strings are the same, but this form also parses
    # on Python 3.
    data = re.sub(u'[^ \u05d0-\u05ea"\\\'┉״׳]', u' ', data)
    data = re.sub(u'(^|\\s)["\\\'״׳]+', u' ', data)
    return data

# Collect every distinct Dicta prefix: in each cleaned export line, a word that
# contains the marker ┉ contributes the part before its first marker.
all_unique_dicta_prefixes = set()
with codecs.open('./sefaria-export_prefix_refs.txt', encoding='utf8') as f:
    for index, line in enumerate(f):
        if u'~~' not in line:
            continue
        line = get_clean_segments(line)
        prefixes_in_line = [word.split(u'┉')[0] for word in line.strip().split() if u'┉' in word]
        all_unique_dicta_prefixes.update(prefixes_in_line)
        if index % 100000 == 0:
            # Progress indicator; print(x) with one argument behaves the same on Python 2 and 3.
            print(index)


# Read the stop-word file inside `with` so the handle is closed, and drop blank
# lines and stray '\r' so no empty entry ends up in the stop-word list.
with codecs.open('./hebrew_stopwords.txt', encoding='utf8') as stopwords_file:
    stopwords = [word.strip() for word in stopwords_file.read().split(u'\n') if word.strip()]
# Prefixes appear in the processed text followed by the marker, so the marker is
# part of the stop word.
all_unique_dicta_prefixes_with_marker = [word + u'┉' for word in all_unique_dicta_prefixes]
all_stop_words = stopwords + all_unique_dicta_prefixes_with_marker

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000


# Create a dictionary of the entire corpus
- **Key**: a ref
- **Value**: the text of that ref, with each Dicta prefix separated out as its own word

In [21]:
# Spot-check a single entry of the expansion table.
print word_expander[u'תהו']

הם תהה


In [44]:
def pull_out_suffix(line):
    """
    Replace every whitespace-separated word of ``line`` that has an entry in
    ``word_expander`` with that entry; other words are kept unchanged.

    @param line The text to process.
    @return The processed words joined by single spaces.
    """
    expanded_tokens = []
    for token in line.split():
        expanded_tokens.append(word_expander.get(token, token))
    return ' '.join(expanded_tokens)

def get_clean_segments2(data):
    """
    Return the text part of one export line, keeping sentence punctuation.

    Everything up to and including the first ``~~`` is discarded. In the
    remaining text:
      * hyphens and maqafs become spaces;
      * every balanced parenthesised group (nested to any depth) and every
        non-empty bracketed group is replaced by a single '.';
      * every character other than a space, a Hebrew letter (U+05D0-U+05EA),
        a quote mark (" ' ״ ׳), the prefix marker ┉ or one of ``. : ? , ! ;``
        becomes a space;
      * quote marks that open or close a word are replaced by a space.

    @param data One unicode line that contains the ``~~`` separator.
    @return The cleaned unicode text.
    @raise IndexError if ``data`` contains no ``~~``.
    """
    # maxsplit=1: a later '~~' inside the text itself must not truncate it.
    data = data.split(u'~~', 1)[1]
    data = re.sub(u'[-־]', u' ', data)
    # Strip parentheses innermost-first until none are left. Unlike a single
    # recursive pattern that allows one nested group per level, this also
    # removes groups with several nested siblings, e.g. "(a (b) c (d) e)",
    # and it needs no regex-module-only syntax.
    removed = 1
    while removed:
        data, removed = re.subn(u'\\([^()]*\\)', u'.', data)
    data = re.sub(u'\\[[^\\]]+\\]', u'.', data)
    # u'' literals with doubled backslashes give the same runtime patterns as
    # the former ur'' literals and also parse on Python 3.
    data = re.sub(u'[^ \u05d0-\u05ea"\\\'┉.:?,!;״׳]', u' ', data)
    data = re.sub(u'(^|\\s)["\\\'״׳]+', u' ', data)
    data = re.sub(u'["\\\'״׳]+(\\s|$)', u' ', data)
    return data

# Build {ref: processed text} for every export line that has a '~~' separator.
# The ref is whatever precedes the first '~'. The text is cleaned, each prefix
# marker ┉ is followed by a space so the prefix stands as its own token, and the
# words are then passed through pull_out_suffix.
text_with_prefixes_as_own_word = {}
with codecs.open('./sefaria-export_prefix_refs.txt', encoding='utf8') as export_file:
    for line_number, raw_line in enumerate(export_file):
        if u'~~' in raw_line:
            ref = raw_line.split('~')[0]
            cleaned = get_clean_segments2(raw_line).strip()
            separated = cleaned.replace(u'┉', u'┉ ')
            text_with_prefixes_as_own_word[ref] = pull_out_suffix(separated)
            # Progress indicator (only lines that were processed can trigger it).
            if line_number % 100000 == 0:
                print(line_number)

Mishneh Torah, Foundations of the Torah 1:6~5~~ו┉ידיעת דבר זה מצות עשה ש┉נאמר <small>(שמות כ ב)</small> אנכי ה' אלהיך. "ו┉כל ה┉מעלה על דעתו ש┉יש שם אלוה אחר חוץ מ┉זה. עובר בלא תעשה ש┉נאמר <small>(שמות כ ג)</small> לא יהיה לך אלהים אחרים על פני. "ו┉כופר בעקר ש┉זהו ה┉עקר ה┉גדול שה┉כל תלוי בו:

ו┉ידיעת דבר זה מצות עשה ש┉נאמר        .         אנכי ה אלהיך. ו┉כל ה┉מעלה על דעתו ש┉יש שם אלוה אחר חוץ מ┉זה. עובר בלא תעשה ש┉נאמר        .         לא יהיה לך אלהים אחרים על פני. ו┉כופר בעקר ש┉זהו ה┉עקר ה┉גדול שה┉כל תלוי בו: 
ו┉ ידיעת דבר זה מצות עשה ש┉ אנו אמר . אנכי ה אלהיך. ו┉ כל ה┉ מעלה על דעתו ש┉ יש שם אלוה אחר חוץ מ┉ זה. עובר בלא תעשה ש┉ אנו אמר . לא הוא היה לך אלהים אחרים על פני. ו┉ אני כפר בעקר ש┉ זהו ה┉ עקר ה┉ גדול שה┉ כל תלוי בו:



Mishneh Torah, Foundations of the Torah 1:12~11~~ו┉הואיל וה┉דבר כן הוא. כל ה┉דברים הללו וכ┉יוצא בהן ש┉נאמרו ב┉תורה וב┉דברי נביאים ה┉כל משל ו┉מליצה הן. כמו ש┉נאמר <small>(תהילים ב ד)</small> יושב בשמים ישחק. <small>("דברים לב כא)</small> כעסוני ב┉הבליהם. <small>

Mishneh Torah, Foundations of the Torah 1:9~8~~אם כן מהו זה ש┉כתוב ב┉תורה ו┉תחת רגליו. <small>("שמות לא יח)</small> כתובים ב┉אצבע אלהים. "יד ה'. "עיני ה'. "אזני ה'. "וכ┉יוצא ב┉דברים ה┉אלו. ה┉כל לפי דעתן של בני אדם הוא ש┉אינן מכירין אלא (ה┉נופות) [ה┉גופות] ו┉דברה תורה כ┉לשון בני אדם. וה┉כל כנויים הן. ש┉נאמר <small>(דברים לב מא)</small> אם שנותי ברק חרבי. "ו┉כי חרב יש לו וב┉חרב הוא הורג אלא משל וה┉כל מ┉של. ראיה ל┉דבר ש┉נביא אחד אומר ש┉ראה ה┉קדוש ברוך הוא <small>(דניאל ז ט)</small> לבושיה כ┉תלג חור. "ו┉אחד ראהו <small>(ישעיה סג א)</small> חמוץ בגדים מ┉בצרה. "משה רבנו עצמו ראהו על ה┉ים כ┉גבור עושה מלחמה. ו┉בסיני כ┉שליח צבור עטוף. לומר ש┉אין לו דמות ו┉צורה אלא ה┉כל ב┉מראה ה┉נבואה וב┉מחזה. ו┉אמתת ה┉דבר אין דעתו של אדם מבין ו┉לא יכולה להשיגו ו┉לחקרו. ו┉זה ש┉אמר ה┉כתוב <small>(איוב יא ז)</small> החקר אלוה תמצא אם עד תכלית שדי תמצא:"

אם כן מהו זה ש┉כתוב ב┉תורה ו┉תחת רגליו.        .         כתובים ב┉אצבע אלהים. יד ה'. עיני ה'. אזני ה'. וכ┉יוצא ב┉דברים ה┉אלו. ה┉כל לפי דעתן של בני אדם הוא ש┉אינן 

# An Implementation of RAKE for Hebrew

In [9]:
class RakeHebrew(object):
    """
    RAKE-style keyword extraction.

    Sentences are cut into candidate phrases at stop words and at the
    characters ``| . : ? , ! ;``. Every word gets the score
    ``(degree + frequency) / frequency`` and every phrase gets the sum of the
    scores of its words.
    """

    def __init__(self, stop_words):
        """
        @param stop_words List of stop words. They are matched literally (not
                          as regular expressions) and only as whole
                          whitespace-delimited tokens.
        """
        self.stop_words = stop_words
        self.__stop_words_pattern = self.build_stop_word_regex(stop_words)

    def separate_words(self, text):
        """
        Utility function to return a list of all words
        @param text The text that must be split in to words.
        @return The lower-cased whitespace-separated words of ``text``.
        """
        return [word.lower() for word in text.split()]

    def build_stop_word_regex(self, stop_words):
        """
        Compile a pattern that matches any stop word standing as a whole token.

        @param stop_words List of stop words; empty entries are ignored.
        @return A compiled pattern. A match covers the stop word together with
                the whitespace character before it, if any; group 1 is the
                stop word itself.
        """
        # Escape each word so regex metacharacters in a stop word ('.', '?', '(' ...)
        # are matched literally instead of corrupting the pattern.
        escaped = [re.escape(word) for word in stop_words if word]
        if not escaped:
            # With no stop words an empty alternation would match between two
            # whitespace characters; use a pattern that can never match instead.
            return re.compile(u"(?!)")
        # Leading whitespace is consumed but trailing whitespace is only looked
        # at, so it is still available to start the match of a following stop word.
        return re.compile(u"(?:\\s|^)({})(?=\\s|$)".format(u"|".join(escaped)))

    def generate_candidate_keywords(self, sentence_list, stopword_pattern):
        """
        Split sentences into candidate phrases.

        @param sentence_list List of text strings.
        @param stopword_pattern Pattern as built by build_stop_word_regex.
        @return List of stripped, lower-cased, non-empty phrases, in order of
                appearance (duplicates are kept).
        """
        phrase_list = []
        for sentence in sentence_list:
            # Turn every stop word into the delimiter '|', then cut at
            # delimiters and punctuation.
            marked = re.sub(stopword_pattern, u' | ', sentence.strip())
            for phrase in re.split(u"[|.:?,!;]", marked):
                phrase = phrase.strip().lower()
                if phrase:
                    phrase_list.append(phrase)
        return phrase_list

    def calculate_word_scores(self, phraseList):
        """
        Score every word that occurs in the candidate phrases.

        @param phraseList List of candidate phrases.
        @return Dict mapping word -> (degree + frequency) / frequency, where
                frequency counts the word's occurrences and degree adds, for
                each occurrence, the number of other words in that phrase.
        """
        word_frequency = {}
        word_degree = {}
        for phrase in phraseList:
            word_list = self.separate_words(phrase)
            co_occurrences = len(word_list) - 1
            for word in word_list:
                word_frequency[word] = word_frequency.get(word, 0) + 1
                word_degree[word] = word_degree.get(word, 0) + co_occurrences

        word_score = {}
        for word, frequency in word_frequency.items():
            # Every occurrence also counts towards the word's own degree.
            degree = word_degree[word] + frequency
            # float() keeps this a true division on Python 2 as well.
            word_score[word] = degree / float(frequency)
        return word_score

    def generate_candidate_keyword_scores(self, phrase_list, word_score):
        """
        Score every candidate phrase.

        @param phrase_list List of candidate phrases.
        @param word_score Dict of word scores covering every word in the phrases.
        @return Dict mapping phrase -> sum of the scores of its words.
        """
        keyword_candidates = {}
        for phrase in phrase_list:
            word_list = self.separate_words(phrase)
            keyword_candidates[phrase] = sum([word_score[word] for word in word_list])
        return keyword_candidates

    def run(self, sentence_list):
        """
        Extract and rank the keywords of a list of sentences.

        @param sentence_list List of text strings.
        @return List of (phrase, score) tuples sorted by descending score.
        """
        phrase_list = self.generate_candidate_keywords(sentence_list, self.__stop_words_pattern)
        word_scores = self.calculate_word_scores(phrase_list)
        keyword_candidates = self.generate_candidate_keyword_scores(phrase_list, word_scores)
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        return sorted(keyword_candidates.items(), key=operator.itemgetter(1), reverse=True)

# We will use RAKE on sections.  We therefore use the Sefaria APIs to get all the segment refs in a section.

## Tanakh and Talmud will have unique section divisions.  Therefore we are creating a set of every book title that we want to exclude from the standard Sefaria section division

In [10]:
# Every index record in the library; the extraction loop iterates over these.
indexes = library.all_index_records()

# Titles filed under the "Tanakh" and "Bavli" categories. Their union is the
# set of books left out of the standard section-by-section run.
tanakh_books = set(library.get_indexes_in_category("Tanakh"))
talmud_books = set(library.get_indexes_in_category("Bavli"))
skip_books = tanakh_books.union(talmud_books)

# NOTE(review): `r` is not read by any cell visible here; it looks like a
# leftover check that the Hebrew text of a ref can be loaded.
r = Ref("Likutei Moharan 52").text("he")
print(skip_books)

set([u'Judges', u'Horayot', u'Beitzah', u'Deuteronomy', u'Haggai', u'Sanhedrin', u'Megillah', u'Temurah', u'Shevuot', u'Berakhot', u'Chullin', u'Leviticus', u'Malachi', u'Eruvin', u'Ruth', u'Genesis', u'Nazir', u'Obadiah', u'Esther', u'Exodus', u'Jeremiah', u'Yoma', u'Habakkuk', u'I Chronicles', u'Bava Kamma', u'Keritot', u'Jonah', u'II Chronicles', u'Job', u'Tamid', u'Arakhin', u'Niddah', u'Gittin', u'Yevamot', u'Isaiah', u'Chagigah', u'Proverbs', u'Shabbat', u'Micah', u'Psalms', u'Avodah Zarah', u'Moed Katan', u'Zephaniah', u'Joel', u'Sukkah', u'Nahum', u'Kiddushin', u'Hosea', u'Zechariah', u'Nehemiah', u'Taanit', u'II Kings', u'Lamentations', u'Amos', u'Sotah', u'Pesachim', u'Nedarim', u'Joshua', u'Menachot', u'Ezra', u'Ketubot', u'Bava Metzia', u'Bava Batra', u'Ezekiel', u'Meilah', u'I Samuel', u'Daniel', u'Numbers', u'I Kings', u'Rosh Hashanah', u'Bekhorot', u'Ecclesiastes', u'Zevachim', u'Song of Songs', u'II Samuel', u'Makkot'])


## Loop through every section and collect the text of each of its segments

In [13]:
rake = RakeHebrew(all_stop_words)

i = 0  # number of sections visited so far (drives the progress output)

# For every section of every book not in skip_books, run RAKE over the texts of
# the section's segments and write "<section ref> -- <top ten phrases>" to the file.
with codecs.open('section_keywords.txt', 'wb', encoding='utf8') as the_file:
    for ind in indexes:
        if ind.title in skip_books:
            continue
        for section in ind.all_section_refs():
            segments = section.all_segment_refs()
            text = []
            missing_ref = None
            for seg in segments:
                seg_ref = seg.normal()
                if seg_ref not in text_with_prefixes_as_own_word:
                    missing_ref = seg_ref
                    break
                text.append(text_with_prefixes_as_own_word[seg_ref])

            if missing_ref is None:
                keywords = rake.run(text)
                first_ten = [keyword[0] for keyword in keywords[:10]]
                the_file.write(u"{} -- {}\n\n".format(section.normal(), ', '.join(first_ten)))
            else:
                # A segment of this section is absent from the corpus dictionary.
                # Report it and write nothing for the section. Falling through
                # would reuse the text of the previously processed section (or hit
                # an undefined name on the very first section) and file those
                # keywords under this section's ref.
                print(missing_ref)

            i += 1

            # Progress indicator. It is printed once per 10000 sections; the
            # former second copy after the inner loop only repeated the same value.
            if i % 10000 == 0:
                print(i)
        

Likutei Moharan 52:3
Likutei Moharan 136:3
Likutei Moharan 262:2
Likutei Moharan 272:2
Beit Yosef, Orach Chaim 47:1:2
10000
20000
Jerusalem Talmud Eruvin 1a:1
Jerusalem Talmud Eruvin 1b:1
Jerusalem Talmud Eruvin 2a:1
Jerusalem Talmud Eruvin 2b:1
Jerusalem Talmud Eruvin 3a:1
Jerusalem Talmud Eruvin 3b:1
Jerusalem Talmud Eruvin 4a:1
Jerusalem Talmud Eruvin 4b:1
Jerusalem Talmud Eruvin 5a:1
Jerusalem Talmud Eruvin 5b:1
Jerusalem Talmud Eruvin 6a:1
Jerusalem Talmud Eruvin 6b:1
Jerusalem Talmud Eruvin 7a:1
Jerusalem Talmud Eruvin 7b:1
Jerusalem Talmud Eruvin 8a:1
Jerusalem Talmud Eruvin 8b:1
Jerusalem Talmud Eruvin 9a:1
Jerusalem Talmud Eruvin 9b:1
Jerusalem Talmud Eruvin 10a:1
Jerusalem Talmud Eruvin 10b:1
Jerusalem Talmud Eruvin 11a:1
Jerusalem Talmud Eruvin 11b:1
Jerusalem Talmud Eruvin 12a:1
Jerusalem Talmud Eruvin 12b:1
Jerusalem Talmud Eruvin 13a:1
Jerusalem Talmud Eruvin 13b:1
Jerusalem Talmud Eruvin 14a:1
Jerusalem Talmud Eruvin 14b:1
Jerusalem Talmud Eruvin 15a:1
Jerusalem Talmud E

Introductions to the Babylonian Talmud, Shabbat, Introduction to Perek XXIV 1
Introductions to the Babylonian Talmud, Shabbat, Summary of Perek XXIV 1
Introductions to the Babylonian Talmud, Eruvin, Introduction to Eruvin 1
Introductions to the Babylonian Talmud, Eruvin, Introduction to Perek I 1
Introductions to the Babylonian Talmud, Eruvin, Summary of Perek I 1
Introductions to the Babylonian Talmud, Eruvin, Introduction to Perek II 1
Introductions to the Babylonian Talmud, Eruvin, Summary of Perek II 1
Introductions to the Babylonian Talmud, Eruvin, Introduction to Perek III 1
Introductions to the Babylonian Talmud, Eruvin, Summary of Perek III 1
Introductions to the Babylonian Talmud, Eruvin, Introduction to Perek IV 1
Introductions to the Babylonian Talmud, Eruvin, Summary of Perek IV 1
Introductions to the Babylonian Talmud, Eruvin, Introduction to Perek V 1
Introductions to the Babylonian Talmud, Eruvin, Summary of Perek V 1
Introductions to the Babylonian Talmud, Eruvin, Intro

Introductions to the Babylonian Talmud, Megillah, Introduction to Perek IV 1
Introductions to the Babylonian Talmud, Megillah, Summary of Perek IV 1
Introductions to the Babylonian Talmud, Moed Katan, Introduction to Moed Katan 1
Introductions to the Babylonian Talmud, Moed Katan, Introduction to Perek I 1
Introductions to the Babylonian Talmud, Moed Katan, Summary of Perek I 1
Introductions to the Babylonian Talmud, Moed Katan, Introduction to Perek II 1
Introductions to the Babylonian Talmud, Moed Katan, Summary of Perek II 1
Introductions to the Babylonian Talmud, Moed Katan, Introduction to Perek III 1
Introductions to the Babylonian Talmud, Moed Katan, Summary of Perek III 1
Introductions to the Babylonian Talmud, Nazir, Introduction to Nazir 1
Introductions to the Babylonian Talmud, Nazir, Introduction to Perek I 1
Introductions to the Babylonian Talmud, Nazir, Summary of Perek I 1
Introductions to the Babylonian Talmud, Nazir, Introduction to Perek II 1
Introductions to the Baby

Introductions to the Babylonian Talmud, Yoma, Introduction to Perek I 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek I 1
Introductions to the Babylonian Talmud, Yoma, Introduction to Perek II 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek II 1
Introductions to the Babylonian Talmud, Yoma, Introduction to Perek III 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek III 1
Introductions to the Babylonian Talmud, Yoma, Introduction to Perek IV 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek IV 1
Introductions to the Babylonian Talmud, Yoma, Introduction to Perek V 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek V 1
Introductions to the Babylonian Talmud, Yoma, Introduction to Perek VI 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek VI 1
Introductions to the Babylonian Talmud, Yoma, Introduction to Perek VII 1
Introductions to the Babylonian Talmud, Yoma, Summary of Perek VII 1
Introductions

KeyboardInterrupt: 

In [43]:
# Show the ten best-scoring phrases of the most recent rake.run() result;
# each entry of `keywords` is a (phrase, score) pair.
for phrase, _score in keywords[:10]:
    print(phrase)

נשג דעת בוראם
זאב ערבות ישדדם
עבוד מלכיות בלבד
גדי ירבץ משל
השיב לב אבות
סדור הוית דברים
משיח יבוא אליהו
ישראל חכמים גדולים
אנכי שלח
פסל אנשים


# Count every word and its number of occurrences across the entire corpus; used to help build the stop-word list

In [None]:
# from collections import Counter

# def get_clean_segments(data):
#     data = data.split(u'~~')[1]
#     data = re.sub(ur'[-־]', u' ', data)
#     data = re.sub(ur'[^ \u05d0-\u05ea"\'┉״׳]', u' ', data)
#     data = re.sub(ur'(^|\s)["\'״׳]+', u' ', data)
#     data = re.sub(ur'["\'״׳]+(\s|$)', u' ', data)
#     return data

# text_with_prefixes_as_own_word = []
# with codecs.open('./sefaria-export_prefix_refs.txt', encoding='utf8') as f:
#     for index, line in enumerate(f):
#         if u'~~' not in line:
#             continue
#         line = get_clean_segments(line)
#         line = line.strip().replace(u'┉', u'┉ ')
#         text_with_prefixes_as_own_word.append(line)
#         if index % 100000 == 0:
#             print index
        

# def get_vocab(lst):
# #     vocabcount = Counter(w for txt in lst for w in txt.split())
#     vocabcount = Counter(w for txt in lst for w in txt.split())
#     return vocabcount

# vocabcount = get_vocab(text_with_prefixes_as_own_word)

# for k,n in vocabcount.most_common(n=500):
#     if u"┉" in k:
#         continue
#     print k, n