Skip to content

Commit

Permalink
Merge pull request #64 from SekouD/cooljugator
Browse files Browse the repository at this point in the history
  • Loading branch information
SekouD committed Oct 6, 2018
2 parents b730798 + 45545d7 commit 4cee3c2
Show file tree
Hide file tree
Showing 6 changed files with 730 additions and 14 deletions.
17 changes: 9 additions & 8 deletions mlconjug/mlconjug.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,14 @@
'ro': VerbRo,
}

_PRE_TRAINED_MODEL_PATH = {'fr': '/'.join(('data', 'models', 'trained_model-fr-final.zip')),
'it': '/'.join(('data', 'models', 'trained_model-it-final.zip')),
'es': '/'.join(('data', 'models', 'trained_model-es-final.zip')),
'en': '/'.join(('data', 'models', 'trained_model-en-final.zip')),
'pt': '/'.join(('data', 'models', 'trained_model-pt-final.zip')),
'ro': '/'.join(('data', 'models', 'trained_model-ro-final.zip')),
}
_PRE_TRAINED_MODEL_PATH = {
'fr': '/'.join(('data', 'models', 'trained_model-fr-final.zip')),
'it': '/'.join(('data', 'models', 'trained_model-it-final.zip')),
'es': '/'.join(('data', 'models', 'trained_model-es-final.zip')),
'en': '/'.join(('data', 'models', 'trained_model-en-final.zip')),
'pt': '/'.join(('data', 'models', 'trained_model-pt-final.zip')),
'ro': '/'.join(('data', 'models', 'trained_model-ro-final.zip')),
}

_ALPHABET = {'fr': {'vowels': 'aáàâeêéèiîïoôöœuûùy',
'consonants': 'bcçdfghjklmnpqrstvwxyz'},
Expand Down Expand Up @@ -93,7 +94,7 @@ def extract_verb_features(verb, lang, ngram_range):
final_ngrams = ['END={0}'.format(verb[-n:]) for n in range(min_n, min(max_n + 1, verb_len + 1))]
initial_ngrams = ['START={0}'.format(verb[:n]) for n in range(min_n, min(max_n + 1, verb_len + 1))]
if lang not in _ALPHABET:
lang = 'en' # We chose 'en' as the default alphabet because english is more standard, without accents or diactrics.
lang = 'en' # We chose 'en' as the default alphabet because english is more standard, without accents or diactrics.
vowels = sum(verb.count(c) for c in _ALPHABET[lang]['vowels'])
vowels_number = 'VOW_NUM={0}'.format(vowels)
consonants = sum(verb.count(c) for c in _ALPHABET[lang]['consonants'])
Expand Down
12 changes: 6 additions & 6 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
pip==18.0
pip==18.1
bump2version==0.5.10
wheel==0.31.1
wheel==0.32.1
watchdog==0.9.0
flake8==3.5.0
tox==3.4.0
coverage==4.5.1
Sphinx==1.8.0
Sphinx==1.8.1
cryptography==2.3.1
PyYAML==4.2b4
pytest==3.8.0
pytest-runner>=4.2
cython==0.28.5
numpy==1.15.1
scikit-learn>=0.19.1
numpy==1.15.2
scikit-learn==0.19.1
scipy==1.1.0
click==6.7
click==7.0
mypy==0.630
114 changes: 114 additions & 0 deletions utils/conjug_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import pickle
from collections import OrderedDict, defaultdict


def detect_root(verb, info_verb, language):
    """
    Split a verb into the prefix shared with all its conjugated forms and the remaining suffix.

    :param verb: string.
        The verb to split (its prefixes are tested against the collected forms).
    :param info_verb: dict.
        Maps a key to an iterable of forms. Only the last item of each form is read,
        and only its last space-separated word is kept.
        The ``'doubles'`` entry is ignored.
        If the dict contains a ``'type'`` key, no root is computed.
    :param language: string.
        Language code. When a form contains ``'/'``, the text after the first slash is used
        for ``'it'`` and the text before it for every other language.
    :return: tuple or None.
        ``(root, suffix)`` such that ``root + suffix == verb``, or None when ``info_verb``
        has a ``'type'`` key.
    """
    if 'type' in info_verb:
        return None
    flat_verb_forms = []
    for key, tense in info_verb.items():
        if key == 'doubles':
            continue
        for form in tense:
            conjugated = form[-1]
            if '/' in conjugated:
                before, _, after = conjugated.partition('/')
                standard_form = after if language == 'it' else before
            elif conjugated in ('', '-'):
                # Empty and placeholder forms carry no information about the root.
                continue
            else:
                standard_form = conjugated
            # Strip BEFORE testing for emptiness: a whitespace-only alternative would
            # otherwise contribute an empty form and force the root down to ''.
            last_word = standard_form.strip().split(' ')[-1]
            if last_word:
                flat_verb_forms.append(last_word)
    # Grow the prefix one character at a time until some form no longer starts with it.
    root = ''
    for end in range(1, len(verb) + 1):
        prefix = verb[:end]
        if not all(item.startswith(prefix) for item in flat_verb_forms):
            break
        root = prefix
    return (root, verb[len(root):])


def construct_template(verb, info_verb, root_info, language, pronouns_dict):
    """
    Build the table of endings of a verb by removing its root from every conjugated form.

    :param verb: string.
        The verb being processed (kept for interface compatibility; not used in the computation).
    :param info_verb: dict.
        Maps a key to an iterable of forms. The last item of each form is the conjugated text,
        and when a form has more than one item its first item is recorded as a pronoun.
        The ``'doubles'`` entry is ignored.
        If the dict contains a ``'type'`` key, no template is built.
    :param root_info: tuple.
        Its first item is the root whose length is cut off the start of every form.
    :param language: string.
        Key of ``pronouns_dict`` under which the pronouns are collected.
    :param pronouns_dict: dict of sets.
        Updated in place: ``pronouns_dict[language]`` receives the first item of every
        multi-item form that is not skipped.
    :return: OrderedDict or None.
        Maps each key of ``info_verb`` (except ``'doubles'``) to a tuple of endings,
        or None when ``info_verb`` has a ``'type'`` key.
    """
    if 'type' in info_verb:
        return None
    root_length = len(root_info[0])
    result = OrderedDict()
    for key, val in info_verb.items():
        if key == 'doubles':
            continue
        forms = []
        for form in val:
            conjugated = form[-1]
            if conjugated.startswith('('):
                # Forms starting with a parenthesis are skipped entirely (no ending, no pronoun).
                continue
            last_word = conjugated.strip().split(' ')[-1]
            if last_word == '-':
                # Placeholder form: keep exactly one '-' so the following entries of the
                # tense are not shifted by an extra, sliced copy of the placeholder.
                forms.append('-')
            elif conjugated != '':
                forms.append(last_word[root_length:])
            if len(form) > 1:
                pronouns_dict[language].add(form[0])
        result[key] = tuple(forms)
    return result


def _new_template_group(model_verb, verb_roots):
    """Return a fresh group record whose model verb and only member is ``model_verb``."""
    root = verb_roots[model_verb]
    return {
        'model_verb': model_verb,
        'members': [model_verb],
        'template': root + ':' + model_verb[len(root):],
    }


def group_by_template(model_verbs_dict, all_verbs_dict, verb_roots):
    """
    Group verbs that share exactly the same conjugation pattern.

    Groups are first seeded from ``model_verbs_dict``; if several model verbs share a pattern,
    the last one seen replaces the previous group. Every verb of ``all_verbs_dict`` is then
    added to the group matching its pattern, or starts a new group as its own model verb.

    :param model_verbs_dict: dict.
        Maps a verb to its conjugation mapping. Falsy values are skipped.
    :param all_verbs_dict: dict.
        Maps a verb to its conjugation mapping. Falsy values are skipped.
    :param verb_roots: dict.
        Maps a verb to its root; must contain every verb that starts a group.
    :return: defaultdict.
        Maps ``frozenset(conjugation.items())`` to a dict with the keys ``'model_verb'``,
        ``'members'`` (list of verbs, without duplicates) and ``'template'`` (``'root:suffix'``).
    :raises KeyError:
        If a verb starting a group is missing from ``verb_roots``.
    """
    results = defaultdict(dict)
    for key, val in model_verbs_dict.items():
        if val:
            results[frozenset(val.items())] = _new_template_group(key, verb_roots)
    for key, val in all_verbs_dict.items():
        if not val:
            continue
        pattern = frozenset(val.items())
        if pattern not in results:
            results[pattern] = _new_template_group(key, verb_roots)
        elif key != results[pattern]['model_verb']:
            # The model verb is already the first member of its group; keys of
            # all_verbs_dict are unique, so it is the only possible duplicate.
            results[pattern]['members'].append(key)
    return results


def construct_verbs_dict(hastable, verb_roots):
    """
    Map every verb to the template of the group it belongs to.

    :param hastable: dict.
        Its values are group records with the keys ``'model_verb'`` and ``'members'``.
    :param verb_roots: dict.
        Maps each model verb to its root.
    :return: dict.
        Maps each member verb to ``'root:suffix'`` built from the model verb of its group.
        Entries are inserted in group order, then member order, so the result is
        reproducible from one run to the next.
    :raises KeyError:
        If a model verb is missing from ``verb_roots``.
    """
    verb_dict = {}
    for group in hastable.values():
        model_verb = group['model_verb']
        root = verb_roots[model_verb]
        template = root + ':' + model_verb[len(root):]
        # Assigning into the dict already collapses duplicate members; iterating the
        # list directly (instead of a set) keeps the insertion order deterministic.
        for verb in group['members']:
            verb_dict[verb] = template
    return verb_dict


def construct_conjug_dict(hastable, verb_roots, all_verbs_conjugation):
    """
    Map the template of every group to the conjugation of the group's model verb.

    :param hastable: dict.
        Its values are group records with a ``'model_verb'`` key.
    :param verb_roots: dict.
        Maps each model verb to its root.
    :param all_verbs_conjugation: dict.
        Maps each model verb to its conjugation.
    :return: dict.
        Maps ``'root:suffix'`` of each model verb to that verb's conjugation.
    """
    templates = {}
    model_verbs = (group['model_verb'] for group in hastable.values())
    for model in model_verbs:
        stem = verb_roots[model]
        ending = model[len(stem):]
        key = stem + ':' + ending
        templates[key] = all_verbs_conjugation[model]
    return templates


if __name__ == "__main__":
    import os

    # Locate the dump relative to this script (``raw_data`` folder next to it) instead of
    # relying on an absolute path that only exists on one developer's machine.
    dump_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'raw_data', 'cooljugator_dump_temp.pickle')
    # NOTE: unpickling can execute arbitrary code; only load dumps you produced yourself.
    with open(dump_path, 'rb') as f:
        conjug = pickle.load(f)
    print('ok.')

0 comments on commit 4cee3c2

Please sign in to comment.