## Modules, Methods, Constants

In [7]:
from sklearn import svm
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd

import json
import re
import random as rd

In [8]:
nikkud = ['ֹ', 'ְ', 'ּ', 'ׁ', 'ׂ', 'ָ', 'ֵ', 'ַ', 'ֶ', 'ִ', 'ֻ', 'ֱ', 'ֲ', 'ֳ', 'ׇ']
alphabet = ['א','ב','ג','ד','ה','ו','ז','ח','ט','י','כ','ך','ל','מ','ם','נ','ן','ס','ע','פ','ף','צ','ץ','ק','ר','ש','ת']
punctuation = ['״', '׳']
characters = alphabet + nikkud + punctuation

In [9]:
def tok_to_vec(token, dim):
    # print(token)
    vec = [0]*dim
    for i in range(len(token)):
        vec[i * len(characters) + characters.index(token[i])] = 1
    return vec

In [10]:
def clean(token):
    return ''.join([c for c in token if c in characters])

## Data

In [11]:
with open('./data/vowelized_cal_texts/71667_each_training_data.json', encoding='utf-8') as f:
    data = json.load(f)
    data = [{'tag':d['tag'], 'word': clean(d['word'])} for d in data]

In [12]:
print('Aramaic words in corpus: ' + str(len([w for w in data if w['tag'] == 'A'])))
print('Hebrew words in corpus: ' + str(len([w for w in data if w['tag'] == 'R'])))

Aramaic words in corpus: 71667
Hebrew words in corpus: 71667


In [13]:
rd.shuffle(data)

In [82]:
train_data = data[:(len(data) * 3 // 4)]
test_data = data[(len(data) * 3 //4):]

## Initial Basic Test

In [80]:
train_size = 20000
test_size = 5000

In [83]:
train_labels = [d['tag'] for d in train_data]
test_labels = [d['tag'] for d in test_data]

dimension = max([len(d['word']) for d in data]) * len(characters)
print(dimension)

train_vecs = [tok_to_vec(d['word'], dimension) for d in train_data[:train_size]]
test_vecs = [tok_to_vec(d['word'], dimension) for d in test_data]

968


In [84]:
pc = dimension // 1
pca = PCA(pc).fit(train_vecs)
train_pcs = pca.transform(train_vecs)
test_pcs = pca.transform(test_vecs)

In [85]:
lang_clf = svm.SVC()#probability=True)
lang_clf.fit(train_pcs, train_labels[:train_size])

SVC()

In [86]:
accuracy = sum(np.array(lang_clf.predict(test_pcs[:test_size])) == np.array(test_labels[:test_size])) / test_size
print(accuracy)

0.9368


## Test on Talmud Data

In [88]:
# Nazir was not part of the training data
with open('./data/aligned_talmud/Nazir.json', encoding='utf-8') as f:
    naz = json.load(f)

In [96]:
page = rd.randrange(len(naz))
chunk = rd.randrange(len(naz[page]['content']))
words = [word_forms[1] for word_forms in naz[page]['content'][chunk]['text']]
words

['בָּעֵי',
 'רַבִּי',
 'יִרְמְיָה',
 'רָקָב',
 'הַבָּא',
 'מִן',
 'הֶעָקֵב',
 'מַהוּ',
 'כִּי',
 'גָּמְרִינַן',
 'רָקָב',
 'הַבָּא',
 'מִכּוּלֵּיהּ',
 'מֵת',
 'אֲבָל',
 'דְּאָתֵי',
 'מִן',
 'עָקֵב',
 'לָא',
 'אוֹ',
 'דִלְמָא',
 'לָא',
 'שְׁנָא']

In [97]:
words_vecs = [tok_to_vec(word, dimension) for word in words]
words_pcs = pca.transform(words_vecs)

In [None]:
#naz_predictions = lang_clf.predict_proba(words_pcs)
naz_predictions = lang_clf.predict(words_pcs)

for i in range(len(words)):
    print(words[i] + '\t' + str(naz_predictions[i]))

## Testing Saved Model

### Basic Test

In [15]:
import joblib

In [None]:
rd.shuffle(data)

In [16]:
test_labels = [d['tag'] for d in data]

dimension = max([len(d['word']) for d in data]) * len(characters)
print(dimension)

test_vecs = [tok_to_vec(d['word'], dimension) for d in data]

968


In [14]:
lang_clf = joblib.load('./src/languagetagger/GemaraLanguageTagger.joblib')

In [19]:
accuracy = sum(np.array(lang_clf.predict(test_vecs[:500])) == np.array(test_labels[:500])) / 500
print(accuracy)

0.964


### Real Masekhet Testing

In [20]:
# Nazir was not part of the training data
with open('./data/aligned_talmud/Nazir.json', encoding='utf-8') as f:
    naz = json.load(f)

In [29]:
page = rd.randrange(len(naz))
chunk = rd.randrange(len(naz[page]['content']))
words = [word_forms[1] for word_forms in naz[page]['content'][chunk]['text']]
words

['הָיוּ',
 'לָהּ',
 'מָעוֹת',
 'סְתוּמִין',
 'יִפְּלוּ',
 'לִנְדָבָה',
 'מָעוֹת',
 'מְפוֹרָשִׁין',
 'דְּמֵי',
 'חַטָּאת',
 'יֵלְכוּ',
 'לְיָם',
 'הַמֶּלַח',
 'לֹא',
 'נֶהֱנִין',
 'וְלֹא',
 'מוֹעֲלִין',
 'בָּהֶן']

In [30]:
words_vecs = [tok_to_vec(word, dimension) for word in words]

In [31]:
naz_predictions = lang_clf.predict_proba(words_vecs)
#naz_predictions = lang_clf.predict(words_vecs)

print('\t' + 'Hebrew      Aramaic')
for i in range(len(words)):
    print(words[i] + '\t' + str(naz_predictions[i]))

	Hebrew      Aramaic
הָיוּ	[0.049861 0.950139]
לָהּ	[0.04982364 0.95017636]
מָעוֹת	[0.03085425 0.96914575]
סְתוּמִין	[0.05246527 0.94753473]
יִפְּלוּ	[0.02585053 0.97414947]
לִנְדָבָה	[0.04986535 0.95013465]
מָעוֹת	[0.03085425 0.96914575]
מְפוֹרָשִׁין	[0.09631687 0.90368313]
דְּמֵי	[0.93803868 0.06196132]
חַטָּאת	[0.04985473 0.95014527]
יֵלְכוּ	[0.11903322 0.88096678]
לְיָם	[0.04983734 0.95016266]
הַמֶּלַח	[0.01094101 0.98905899]
לֹא	[0.04985779 0.95014221]
נֶהֱנִין	[0.04981154 0.95018846]
וְלֹא	[0.04984848 0.95015152]
מוֹעֲלִין	[0.00383405 0.99616595]
בָּהֶן	[0.03025871 0.96974129]


### Testing on stop words

In [32]:
with open('./data/vowelized_cal_texts/stop_words.txt', encoding='utf-8') as f:
    stops = f.read().split('\n')

In [34]:
stops = [tuple(w.split('\t')) for w in stops]

In [35]:
stops

[('רַבִּי', '1222'),
 ('עַל', '1119'),
 ('אֶלָּא', '1110'),
 ('אֶת', '1051'),
 ('אוֹמֵר', '923'),
 ('הוּא', '875'),
 ('לֹא', '803'),
 ('הָא', '802'),
 ('אוֹ', '627'),
 ('כִּי', '613'),
 ('לוֹ', '593'),
 ('שֶׁל', '558'),
 ('בַּר', '546'),
 ('אִי', '541'),
 ('עַד', '509'),
 ('וְלֹא', '461'),
 ('מִן', '452'),
 ('אֵין', '446'),
 ('אִם', '443'),
 ('לִי', '437'),
 ('מִשּׁוּם', '426'),
 ('זֶה', '426'),
 ('הִיא', '408'),
 ('אָמַר', '397'),
 ('לָאו', '365'),
 ('אֶחָד', '351'),
 ('אוֹמְרִים', '337'),
 ('כָּל', '325'),
 ('בֵּין', '317'),
 ('מִי', '316'),
 ('אַף', '288'),
 ('וְאִם', '281'),
 ('יְהוּדָה', '279'),
 ('טָמֵא', '257'),
 ('אֵינוֹ', '246'),
 ('שֶׁהוּא', '239'),
 ('הָיָה', '235'),
 ('כֵּיוָן', '229'),
 ('לָךְ', '222'),
 ('מִפְּנֵי', '219'),
 ('אֲבָל', '219'),
 ('בֶּן', '218'),
 ('שִׁמְעוֹן', '215'),
 ('בֵּית', '201'),
 ('טָהוֹר', '199'),
 ('אַחַת', '197'),
 ('וְאִי', '186'),
 ('בּוֹ', '181'),
 ('בָּהּ', '180'),
 ('לָהּ', '169'),
 ('הָיוּ', '169'),
 ('עָלָיו', '164'),
 ('שֶׁאֵין', '162'),


In [36]:
stops[0][0]

'רַבִּי'

In [39]:
stop_vecs = [tok_to_vec(word[0], dimension) for word in stops]

In [None]:
stop_predictions = lang_clf.predict_proba(stop_vecs)

print('\t' + 'Hebrew      Aramaic')
for i in range(len(stops)):
    print(stops[i][0] + '\t' + str(stop_predictions[i]))