# Imports

In [6]:
import pymongo
import json
import nltk
import pandas as pd
import numpy as np

In [2]:
# Read the database password from a local file so it never appears in the notebook.
# Surrounding whitespace is stripped: editors commonly append a trailing newline
# on save, which would otherwise end up inside the connection string.
with open('pswrd.txt') as text:
    pswrd = text.read().strip()

# NOTE(review): PyMongo expects reserved characters in the password to be
# percent-escaped (urllib.parse.quote_plus); this assumes pswrd.txt already
# holds a URI-safe value -- confirm.
conn_str = 'mongodb+srv://dov-db2:' + pswrd + '@apicluster.s8lqy.mongodb.net/test'
_client = pymongo.MongoClient(conn_str)
_db = _client['bavli']

# Collections queried by the cells below.
jastrow = _db['dov-jastrow']
dicta_all = _db['new-dicta']

# Distinct Words in Dicta

The data below show that a random word from the Talmud with nikkud (vowel points) will be mapped automatically to the proper POS and word root 91% of the time when it is piped through the Dicta data.

**NOTE TO SELF: Ignore Participles; they simply duplicate the verbs. With Participles removed, 99% of words are distinct.**

*Proof:*

`db['new-dicta'].aggregate([{$match: {$or: [{pos: 'Participle'}, {pos: 'Verb'}]}}, {'$group': {'_id': '$word'}}, {$group: {_id: 1, count: {'$sum': 1}}}]) `

`== `

`db['new-dicta'].aggregate([{$match: {pos: 'Verb'}}, {'$group': {'_id': '$word'}}, {$group: {_id: 1, count: {'$sum': 1}}}])`

In [48]:
# Every part-of-speech tag present in the Dicta collection; each becomes one table row.
rows = dicta_all.distinct('pos')
# rows.remove('Participle')

# Gather the per-tag counts first and build the frame in one go. Writing the
# values with chained indexing (dicta_data['col'][i] = ...) is not guaranteed
# to write through to the frame and never does under pandas copy-on-write.
pos_counts = []
for pos in rows:
    # Collapse this tag's documents to one group per distinct word, then count the groups.
    distinct_result = list(dicta_all.aggregate([{'$match': {'pos': pos}},
                                                {'$group': {'_id': '$word'}},
                                                {'$group': {'_id': 1, 'count': {'$sum': 1}}}]))
    pos_counts.append({
        # An empty aggregation result means no documents matched, i.e. zero distinct words.
        'distinct_words': distinct_result[0]['count'] if distinct_result else 0,
        'total_words': dicta_all.count_documents({'pos': pos}),
    })

dicta_data = pd.DataFrame(pos_counts, index=rows, columns=['distinct_words', 'total_words'])

dicta_data

Unnamed: 0,distinct_words,total_words
,873,882
None Suf_AccNom Suf_Fem Suf_Plural Suf_P3,3,3
None Suf_AccNom Suf_Fem Suf_Sing Suf_P2,2,2
None Suf_AccNom Suf_Fem Suf_Sing Suf_P3,2,2
None Suf_AccNom Suf_Masc Suf_Plural Suf_P2,2,2
None Suf_AccNom Suf_Masc Suf_Plural Suf_P3,6,6
None Suf_AccNom Suf_Masc Suf_Sing Suf_P2,2,2
None Suf_AccNom Suf_Masc Suf_Sing Suf_P3,4,4
None Suf_AccNom Suf_Masc-Fem Suf_Plural Suf_P1,3,3
None Suf_AccNom Suf_Masc-Fem Suf_Sing Suf_P1,2,2


In [49]:
# Sanity check: the per-POS totals must add up to the number of documents in the collection.
assert dicta_all.count_documents({}) == dicta_data['total_words'].sum()

In [24]:
# Sum of the per-POS distinct-word counts; a word that occurs under several
# POS tags is counted once for each tag.
real_total_words = int(dicta_data['distinct_words'].sum())
real_total_words

861089

In [26]:
# Two-stage aggregation: one group per distinct word, then a single group that counts them.
distinct_word_pipeline = [
    {'$group': {'_id': '$word'}},
    {'$group': {'_id': 1, 'count': {'$sum': 1}}},
]
distinct_words_in_corpus = list(dicta_all.aggregate(distinct_word_pipeline))[0]['count']
distinct_words_in_corpus

785259

In [27]:
# Ratio of corpus-wide distinct words to the sum of per-POS distinct words.
print(f'Percent of unambiguously mapped Dicta words: {distinct_words_in_corpus / real_total_words}')

Percent of unambiguously mapped Dicta words: 0.9119370936105327


# Distinct Words in Jastrow

The data below show that 84% of valid input words will map to a single entry in Jastrow.

In [36]:
# Flatten the `all_forms` array of every Jastrow document into one list, duplicates included.
all_word_forms = []
for entry in jastrow.find({}, {'all_forms': 1}):
    all_word_forms.extend(entry['all_forms'])
len(all_word_forms)

59733

In [37]:
# De-duplicate while keeping first-seen order (dict keys are unique and insertion-ordered).
distinct_word_forms = list({form: None for form in all_word_forms})
len(distinct_word_forms)

50103

In [40]:
# Share of word forms that remain after de-duplication across all Jastrow entries.
print(f'Percent of unambiguous Jastrow entries: {len(distinct_word_forms) / len(all_word_forms)}')

Percent of unambiguous Jastrow entries: 0.8387825824920898
