## Modules, Methods, Constants

In [1]:
from sklearn import svm
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd

import json
import re
import random as rd

In [2]:
nikkud = ['ֹ', 'ְ', 'ּ', 'ׁ', 'ׂ', 'ָ', 'ֵ', 'ַ', 'ֶ', 'ִ', 'ֻ', 'ֱ', 'ֲ', 'ֳ', 'ׇ']
alphabet = ['א','ב','ג','ד','ה','ו','ז','ח','ט','י','כ','ך','ל','מ','ם','נ','ן','ס','ע','פ','ף','צ','ץ','ק','ר','ש','ת']
punctuation = ['״', '׳']
characters = alphabet + nikkud + punctuation

In [3]:
def tok_to_vec(token, dim):
    # print(token)
    vec = [0]*dim
    for i in range(len(token)):
        vec[i * len(characters) + characters.index(token[i])] = 1
    return vec

In [4]:
def clean(token):
    return ''.join([c for c in token if c in characters])

## Data

In [5]:
with open('./data/vowelized_cal_texts/71667_each_training_data.json', encoding='utf-8') as f:
    data = json.load(f)
    data = [{'tag':d['tag'], 'word': clean(d['word'])} for d in data]

In [6]:
print('Aramaic words in corpus: ' + str(len([w for w in data if w['tag'] == 'A'])))
print('Hebrew words in corpus: ' + str(len([w for w in data if w['tag'] == 'R'])))

Aramaic words in corpus: 71667
Hebrew words in corpus: 71667


In [7]:
rd.shuffle(data)

In [14]:
train_data = data[:(len(data) * 3 // 4)]
test_data = data[(len(data) * 3 //4):]

## Initial Basic Test

In [12]:
train_size = 20000
test_size = 5000

In [15]:
train_labels = [d['tag'] for d in train_data]
test_labels = [d['tag'] for d in test_data]

dimension = max([len(d['word']) for d in data]) * len(characters)
print(dimension)

train_vecs = [tok_to_vec(d['word'], dimension) for d in train_data[:train_size]]
test_vecs = [tok_to_vec(d['word'], dimension) for d in test_data]

968


In [84]:
pc = dimension // 1
pca = PCA(pc).fit(train_vecs)
train_pcs = pca.transform(train_vecs)
test_pcs = pca.transform(test_vecs)

In [85]:
lang_clf = svm.SVC()#probability=True)
lang_clf.fit(train_pcs, train_labels[:train_size])

SVC()

In [86]:
accuracy = sum(np.array(lang_clf.predict(test_pcs[:test_size])) == np.array(test_labels[:test_size])) / test_size
print(accuracy)

0.9368


## Test on Talmud Data

In [88]:
# Nazir was not part of the training data
with open('./data/aligned_talmud/Nazir.json', encoding='utf-8') as f:
    naz = json.load(f)

In [96]:
page = rd.randrange(len(naz))
chunk = rd.randrange(len(naz[page]['content']))
words = [word_forms[1] for word_forms in naz[page]['content'][chunk]['text']]
words

['בָּעֵי',
 'רַבִּי',
 'יִרְמְיָה',
 'רָקָב',
 'הַבָּא',
 'מִן',
 'הֶעָקֵב',
 'מַהוּ',
 'כִּי',
 'גָּמְרִינַן',
 'רָקָב',
 'הַבָּא',
 'מִכּוּלֵּיהּ',
 'מֵת',
 'אֲבָל',
 'דְּאָתֵי',
 'מִן',
 'עָקֵב',
 'לָא',
 'אוֹ',
 'דִלְמָא',
 'לָא',
 'שְׁנָא']

In [97]:
words_vecs = [tok_to_vec(word, dimension) for word in words]
words_pcs = pca.transform(words_vecs)

In [None]:
#naz_predictions = lang_clf.predict_proba(words_pcs)
naz_predictions = lang_clf.predict(words_pcs)

for i in range(len(words)):
    print(words[i] + '\t' + str(naz_predictions[i]))

## Testing Saved Model

### Basic Test

In [15]:
import joblib

In [None]:
rd.shuffle(data)

In [16]:
test_labels = [d['tag'] for d in data]

dimension = max([len(d['word']) for d in data]) * len(characters)
print(dimension)

test_vecs = [tok_to_vec(d['word'], dimension) for d in data]

968


In [14]:
lang_clf = joblib.load('./src/languagetagger/GemaraLanguageTagger.joblib')

In [19]:
accuracy = sum(np.array(lang_clf.predict(test_vecs[:500])) == np.array(test_labels[:500])) / 500
print(accuracy)

0.964


### Real Masekhet Testing

In [19]:
import joblib

In [20]:
lang_clf = joblib.load('./src/languagetagger/GemaraLanguageTagger.joblib')

In [8]:
with open('./data/aligned_talmud/Berakhot.json', encoding='utf-8') as f:
    mas = json.load(f)

In [30]:
page = 0 #rd.randrange(len(mas))
chunk = 0 #rd.randrange(len(mas[page]['content']))
words = [word_forms[1] for word_forms in mas[page]['content'][chunk]['text']]
words

['מֵאֵימָתַי',
 'קוֹרִין',
 'אֶת',
 'שְׁמַע',
 'בָּעֲרָבִין',
 'מִשָּׁעָה',
 'שֶׁהַכֹּהֲנִים',
 'נִכְנָסִים',
 'לֶאֱכוֹל',
 'בִּתְרוּמָתָן',
 'עַד',
 'סוֹף',
 'הָאַשְׁמוּרָה',
 'הָרִאשׁוֹנָה',
 'דִּבְרֵי',
 'רַבִּי',
 'אֱלִיעֶזֶר']

In [31]:
words_vecs = [tok_to_vec(word, dimension) for word in words]

In [32]:
mas_predictions = lang_clf.predict_proba(words_vecs)

print('\t' + 'Hebrew      Aramaic')
for i in range(len(words)):
    print(words[i] + '\t' + str(mas_predictions[i]))

	Hebrew      Aramaic
מֵאֵימָתַי	[0.08400024 0.91599976]
קוֹרִין	[0.04981305 0.95018695]
אֶת	[0.0498447 0.9501553]
שְׁמַע	[0.96086188 0.03913812]
בָּעֲרָבִין	[0.11935584 0.88064416]
מִשָּׁעָה	[0.03898098 0.96101902]
שֶׁהַכֹּהֲנִים	[0.02752469 0.97247531]
נִכְנָסִים	[0.01633482 0.98366518]
לֶאֱכוֹל	[0.10155507 0.89844493]
בִּתְרוּמָתָן	[0.23402207 0.76597793]
עַד	[0.04987237 0.95012763]
סוֹף	[0.17321767 0.82678233]
הָאַשְׁמוּרָה	[0.4110613 0.5889387]
הָרִאשׁוֹנָה	[0.04981824 0.95018176]
דִּבְרֵי	[0.04982186 0.95017814]
רַבִּי	[0.04988215 0.95011785]
אֱלִיעֶזֶר	[0.0213943 0.9786057]
