In [598]:
import phunspell
import pandas as pd
import numpy as np
from requests_html import HTMLSession



In [599]:
# One shared requests_html session, reused for every Wiktionary request made further down.
session = HTMLSession()

In [600]:
# Hungarian (hu_HU) spell checker; used later to test whether a generated comparative is a known word.
pspell = phunspell.Phunspell('hu_HU')
# Example lookups recorded by the author:
#   pspell.lookup("angolszász") -> False
#   pspell.lookup("kutya")      -> True


In [601]:
def _read_adjectives(path):
    """Load a one-word-per-line text file into a DataFrame with a single 'adjective' column.

    Parameters
    ----------
    path : str
        Path of the word list to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, in a column named 'adjective'.
    """
    # sep="\n" raises ValueError on pandas >= 1.3 (a line terminator is not accepted
    # as separator). A tab separator keeps every line in one field instead.
    # NOTE(review): assumes no word contains a tab character - confirm.
    # NOTE(review): header=0 keeps the previous behaviour of treating the first line as
    # a header and replacing it with 'adjective'. If the files have no header line,
    # their first word is silently dropped - confirm against the data files.
    return pd.read_csv(path, sep="\t", header=0, names=['adjective'])


adj_hu = _read_adjectives('hungarian_adjectives.txt')
adj_en = _read_adjectives('hungarian_adjectives_english.txt')
non_comp = _read_adjectives('hungarian_adjectives_non_comparable.txt')



In [602]:
# Drop every adjective that is listed as non-comparable.
# .copy() gives each filtered frame its own data, so the columns added to these frames
# in later cells do not trigger pandas' SettingWithCopyWarning.
adj_hu = adj_hu[~adj_hu['adjective'].isin(non_comp['adjective'])].copy()
adj_en = adj_en[~adj_en['adjective'].isin(non_comp['adjective'])].copy()

# Adjacent string literals are used instead of a backslash continuation, which
# previously glued "version" and "and" together in the printed message.
print(
    'After removing the non-comparable words from both sets, we got {} words left '
    'from the Hungarian version and {} words from the English version'.format(len(adj_hu), len(adj_en))
)

After removing the comparable words from both sets, we got 6980 words left from the Hungarian versionand 2834 words from the English version


In [603]:
# Hungarian vowels, ordered so that each short vowel is immediately followed by
# its long (accented) counterpart.
vowels = ['a', 'á', 'e', 'é', 'i', 'í', 'ö', 'ő', 'o', 'ó', 'u', 'ú', 'ü', 'ű']
# "Throat" vowels: a, á, o, ó, u, ú.
throat_vowels = vowels[0:2] + vowels[8:12]
# "Tooth" vowels: e, é, i, í, ö, ő, ü, ű.
tooth_vowels = vowels[2:8] + vowels[12:14]
# Every second entry, starting from the second one, is the long variant.
accented_vowels = vowels[1::2]

def vowels_of(word):
    """Return the vowels of `word` as a list, in order of appearance (repeats are kept)."""
    return [char for char in word if char in vowels]

def contains_throatvowel(word):
    """Return True if at least one letter of `word` is in `throat_vowels`, else False."""
    return any(char in throat_vowels for char in word)

def throatvowels_exlast2(word):
    """Tell whether `word` has throat vowels, but none among its last two vowels.

    Returns True only when the word has at least two vowels, contains a throat
    vowel somewhere, and neither of its final two vowels is a throat vowel.
    """
    word_vowels = vowels_of(word)
    if len(word_vowels) < 2 or not contains_throatvowel(word):
        return False
    # True exactly when the last two vowels share nothing with the throat vowels.
    return set(throat_vowels).isdisjoint(word_vowels[-2:])

def last_vowel(word):
    """Return the last vowel occurring in `word`, or None if it has no vowel."""
    return next((char for char in reversed(word) if char in vowels), None)

def last_letter_is_vowel(word):
    """Return True if the final character of `word` is a vowel.

    Raises IndexError for an empty string.
    """
    final_letter = word[-1]
    return final_letter in vowels


In [604]:
def rule(word):
    """Build the comparative form of the adjective `word`.

    Vowel-final words:
      * ending in 'i' or in an accented vowel: append 'bb';
      * ending in any other vowel: replace it with the next entry of `vowels`
        (its accented counterpart) and append 'bb'.
    Consonant-final words:
      * append 'abb' when the word contains a throat vowel and
        `throatvowels_exlast2` is False for it;
      * append 'ebb' otherwise.

    Returns the generated comparative as a string.
    """
    if last_letter_is_vowel(word):
        ending = word[-1]
        if ending == 'i' or ending in accented_vowels:
            return word + 'bb'
        # The accented variant sits right after the plain vowel in `vowels`.
        lengthened = vowels[vowels.index(ending) + 1]
        return word[:-1] + lengthened + 'bb'

    # Consonant-final word: pick the linking vowel.
    if contains_throatvowel(word) and not throatvowels_exlast2(word):
        return word + 'abb'
    return word + 'ebb'

In [605]:
# Generate a comparative for every adjective in both word lists ...
adj_hu['comparative'] = adj_hu['adjective'].map(rule)
adj_en['comparative'] = adj_en['adjective'].map(rule)

# ... and record whether the spell checker recognises each generated form.
adj_hu['existing comparative'] = adj_hu['comparative'].map(pspell.lookup)
adj_en['existing comparative'] = adj_en['comparative'].map(pspell.lookup)

In [606]:
def comparable_hu(word):
    """Check the Hungarian Wiktionary page of `word` for a comparative entry.

    Fetches https://hu.wiktionary.org/wiki/<word> (spaces replaced by underscores)
    through the shared `session` and returns True if the page text contains
    'középfok', False otherwise.
    """
    url = "https://hu.wiktionary.org/wiki/{}".format(word.replace(" ", "_"))

    r = session.get(url)
    # The former `r.html.arender(sleep=2)` call was removed: arender() is a coroutine
    # and was never awaited, so it rendered nothing and only produced a
    # "never awaited" RuntimeWarning. The text checked below is, as before, the
    # text of the plain GET response.

    return 'középfok' in r.html.text  # or r.html.html if you want it in raw html

def comparable_en(word):
    """Check the English Wiktionary page of `word` for a comparative entry.

    Fetches https://en.wiktionary.org/wiki/<word>#Hungarian (spaces replaced by
    underscores, as in `comparable_hu`) through the shared `session` and returns
    True if the page text contains 'comparative', False otherwise.
    """
    url_en = "https://en.wiktionary.org/wiki/{}#Hungarian".format(word.replace(" ", "_"))

    # Bug fix: the request previously used the undefined name `url` instead of
    # `url_en`, which raises NameError on a fresh kernel (or silently fetches
    # whatever a leftover global `url` points to).
    r = session.get(url_en)
    # The un-awaited `r.html.arender(sleep=2)` coroutine was removed; it never ran.
    # NOTE(review): the '#Hungarian' fragment is not sent to the server, so the
    # search below covers the whole page, not only the Hungarian section.

    return 'comparative' in r.html.text

def existing_comparative_hu(word):
    """Return the 'existing comparative' flag stored in `adj_hu` for `word`.

    Uses the first matching row; raises IndexError if `word` is not in `adj_hu`.
    """
    is_match = adj_hu['adjective'] == word
    flags = adj_hu.loc[is_match, 'existing comparative']
    return flags.iloc[0]

def existing_comparative_en(word):
    """Return the 'existing comparative' flag stored in `adj_en` for `word`.

    Uses the first matching row; raises IndexError if `word` is not in `adj_en`.
    """
    is_match = adj_en['adjective'] == word
    flags = adj_en.loc[is_match, 'existing comparative']
    return flags.iloc[0]

def comparable_and_existing_hu(word):
    """Decide whether `word` (from the Hungarian list) counts as comparable.

    Returns True when the generated comparative was found by the spell checker.
    Otherwise falls back to the Hungarian Wiktionary check, so False means the
    generated form is unknown and the web page does not mark the word as comparable.
    """
    if existing_comparative_hu(word) != False:
        return True
    # Generated form is unknown to the spell checker: ask Wiktionary instead.
    return comparable_hu(word)
    
def comparable_and_existing_en(word):
    """Decide whether `word` (from the English-sourced list) counts as comparable.

    Returns True when the generated comparative was found by the spell checker.
    Otherwise falls back to the English Wiktionary check.
    """
    if existing_comparative_en(word) != False:
        return True
    # Generated form is unknown to the spell checker: ask Wiktionary instead.
    return comparable_en(word)

# Flag each adjective as comparable or not. This issues one web request for every
# word whose generated comparative the spell checker did not recognise.
adj_hu['comparable'] = adj_hu['adjective'].map(comparable_and_existing_hu)
adj_en['comparable'] = adj_en['adjective'].map(comparable_and_existing_en)

  r.html.arender(sleep=2)
  r.html.arender(sleep=2)


In [607]:
# Keep only the adjectives that were flagged as comparable.
adj_hu = adj_hu.loc[adj_hu['comparable'] == True]
adj_en = adj_en.loc[adj_en['comparable'] == True]

# The flag is now True on every remaining row, so drop it before merging.
adj_hu_3 = adj_hu.drop(columns=['comparable'])
adj_en_3 = adj_en.drop(columns=['comparable'])

# Outer merge on all shared columns: the union of both word lists.
adj_merged = pd.merge(adj_hu_3, adj_en_3, how='outer')
adj_merged


Unnamed: 0,adjective,comparative,existing comparative
0,aberrált,aberráltabb,True
1,abesszin,abesszinebb,True
2,abház,abházabb,True
3,abiotikus,abiotikusabb,True
4,ablakmosó,ablakmosóbb,True
...,...,...,...
6522,zöldfülű,zöldfülűbb,True
6523,zöngés,zöngésebb,True
6524,zsarnok,zsarnokabb,True
6525,zsenánt,zsenántabb,True


In [608]:
# Share of merged adjectives whose generated comparative the spell checker accepts.
recognised = adj_merged[adj_merged['existing comparative'] == True]
accuracy = len(recognised) / len(adj_merged)
print('Accuracy is: {}'.format(accuracy))

Accuracy is: 0.9970890148613452


In [609]:
# Rows whose generated comparative was not recognised by the spell checker.
is_unrecognised = adj_merged['existing comparative'] == False
exceptions = adj_merged.loc[is_unrecognised]
exceptions

Unnamed: 0,adjective,comparative,existing comparative
16,absztinens,absztinensebb,False
372,audio,audióbb,False
417,balgatag,balgatagabb,False
742,bátor,bátorabb,False
1122,drágalátos,drágalátosabb,False
1513,ennivaló,ennivalóbb,False
2681,jobb,jobbabb,False
2706,jó,jóbb,False
2851,kicsi,kicsibb,False
2984,komplex,komplexabb,False


## Clarification of the exceptions
- For these words the generated comparative is actually correct, even though the spell checker rejected it: absztinens, balgatag, drágalátos, ésszerűtlen.
- These words are not actually comparable: audio, ennivaló, jobb, multivitamin.
- This word was incorrectly marked as having a correct comparative: kevés.
- These words are genuine exceptions to the rule: bátor, jó, kicsi, komplex, konvex, modern, messze, víg, szép, nagy, nehéz.

## Conclusion
Apart from the 12 words bátor, jó, kicsi, komplex, konvex, modern, messze, víg,
szép, nagy, nehéz and kevés, the rule defined above holds.