# Amazing Spanish tool

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

def URL(page):
    """Return the lingolex popular-verbs listing URL for the given page number.

    NOTE(review): the query string has no ``&`` between ``page={page}`` and
    ``letra=h``, so the request carries a single ``page`` parameter whose value
    looks like ``0letra=h``. The string is kept exactly as it was; confirm
    whether the separator was omitted on purpose before changing it.
    """
    return f"https://lingolex.com/verbs/popular_verbs.php?page={page}letra=h"

Scrape the verbs and their importance counts

In [360]:
import time

# Listing pages to walk: 0 through 63.
pages = np.arange(64)

# Parallel lists: verbs[i] is the verb text, counts[i] the number listed with it.
verbs = []
counts = []

for page_num in pages:
    # The timeout keeps one stalled connection from hanging the whole crawl,
    # and raise_for_status() stops on an HTTP error instead of silently
    # parsing an error page and losing that page's verbs.
    page = requests.get(URL(page_num), timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # The first three "row" divs are skipped.
    # NOTE(review): presumably page chrome rather than verb entries - confirm
    # against the page markup.
    rows = soup.find_all("div", class_="row")[3:]

    for row in rows:
        children = row.find_all(class_='col-sm-3 panel-heading')
        # First matching cell: the count (thousands separators removed);
        # second matching cell: the verb.
        count = int(children[0].text.replace(',', ''))
        verb = children[1].text

        counts.append(count)
        verbs.append(verb)

    time.sleep(0.1)  # short pause between requests
    print(f"Page: {page_num} done.")

Page: 0 done.
Page: 1 done.
Page: 2 done.
Page: 3 done.
Page: 4 done.
Page: 5 done.
Page: 6 done.
Page: 7 done.
Page: 8 done.
Page: 9 done.
Page: 10 done.
Page: 11 done.
Page: 12 done.
Page: 13 done.
Page: 14 done.
Page: 15 done.
Page: 16 done.
Page: 17 done.
Page: 18 done.
Page: 19 done.
Page: 20 done.
Page: 21 done.
Page: 22 done.
Page: 23 done.
Page: 24 done.
Page: 25 done.
Page: 26 done.
Page: 27 done.
Page: 28 done.
Page: 29 done.
Page: 30 done.
Page: 31 done.
Page: 32 done.
Page: 33 done.
Page: 34 done.
Page: 35 done.
Page: 36 done.
Page: 37 done.
Page: 38 done.
Page: 39 done.
Page: 40 done.
Page: 41 done.
Page: 42 done.
Page: 43 done.
Page: 44 done.
Page: 45 done.
Page: 46 done.
Page: 47 done.
Page: 48 done.
Page: 49 done.
Page: 50 done.
Page: 51 done.
Page: 52 done.
Page: 53 done.
Page: 54 done.
Page: 55 done.
Page: 56 done.
Page: 57 done.
Page: 58 done.
Page: 59 done.
Page: 60 done.
Page: 61 done.
Page: 62 done.
Page: 63 done.


Save

In [361]:
# Build the frame column by column. Going through np.array([verbs, counts])
# forces both lists into a single string array, so the counts would be held as
# text instead of integers until the CSV is read back.
df = pd.DataFrame({'verb': verbs, 'count': counts})
df.to_csv('verb-count.csv', index=False)

# Scrape all conjugations :D

In [2]:
# Reload the verb/count table from the CSV written by the save step above.
df = pd.read_csv('verb-count.csv')

In [4]:
from itertools import islice
import requests
import pandas as pd 

def URL_CONJU(verb):
    """Return the Reverso conjugator page URL for ``verb``."""
    return f'https://conjugator.reverso.net/conjugation-spanish-verb-{verb}.html'


def URL_WF(verb):
    """Return the WordReference conjugation page URL for ``verb``."""
    return f"https://www.wordreference.com/conj/esverbs.aspx?v={verb}"

def get_cols(tables):
    """Derive the column labels described by the header cells of ``tables``.

    For every table all ``th`` elements are collected. The text of the first
    one, cut at the first 'ⓘ', is used as the table title; each remaining
    ``th`` contributes the label "<title> <th text>".

    Returns:
        A NumPy array holding every distinct label once, in order of first
        appearance.
    """
    labels = []
    for table in tables:
        headers = table.findChildren(['th'])
        title = headers[0].text.split('ⓘ')[0]
        labels.extend(f"{title} {header.text}" for header in headers[1:])

    # dict keys are unique and keep insertion order: a first-seen dedupe.
    return np.array(list(dict.fromkeys(labels)))

def scrape_website(url):
    """Download a conjugation page and return its forms as a one-row DataFrame.

    The page is searched for ``div`` elements with class "aa"; inside them,
    every element whose class is 'neoConj active' is treated as a conjugation
    table. Column labels come from ``get_cols`` and the values are the text of
    all ``td`` cells, taken table by table.

    Args:
        url: Address of the page to fetch.

    Returns:
        A DataFrame whose columns are the labels from ``get_cols`` and whose
        single row holds the cell texts.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within 30 seconds.
        ValueError: The number of labels differs from the number of cells
            (raised by pandas when the columns are assigned).
    """
    # Fail loudly on a stalled or rejected request: without these checks an
    # error page parses to "no tables" and the caller gets no data for the
    # verb without any indication that something went wrong.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    tables = [
        table
        for group in soup.find_all("div", class_="aa")
        for table in group.find_all(class_='neoConj active')
    ]

    cols = get_cols(tables)

    # Flatten the cell texts of all tables into one list, in document order.
    conjs = [cell.text for table in tables for cell in table.find_all('td')]

    frame = pd.DataFrame(conjs).T
    frame.columns = cols
    return frame

Initialize the conjugation table (`tmp_df`)

In [None]:
# Work on a copy so the verb/count frame itself is left unchanged.
tmp_df = df.copy()

# Scrape the first verb once; the columns of the result define which
# conjugation columns get added to tmp_df.
row = df.iloc[0, :]
verb = row['verb']
d = scrape_website(URL_WF(verb))
# Add every scraped column to tmp_df, using 0 as the placeholder value.
tmp_df.loc[:, d.columns.values] = 0
# Fill row 0: the mask picks the tmp_df columns that also exist in d, and the
# scraped values are written into them positionally.
tmp_df.loc[0, tmp_df.columns.isin(d.columns.values)] = d.values.flatten()
    

Fill the conjugations

In [55]:
import time

# Row 0 is filled when tmp_df is initialised, so a complete run starts at 1.
# The start index was hard-coded to 1240 (apparently left over from resuming an
# interrupted crawl), which made a run on a fresh kernel leave rows 1-1239 at
# their placeholder value. To resume after an interruption, set START_ROW to
# the first row that has not been printed yet.
START_ROW = 1

for i, row in islice(df.iterrows(), START_ROW, None):
    verb = row['verb']
    d = scrape_website(URL_WF(verb))
    # The mask picks the tmp_df columns that also exist in d; the scraped
    # values are written into them positionally.
    tmp_df.loc[i, tmp_df.columns.isin(d.columns.values)] = d.values.flatten()

    time.sleep(0.1)  # short pause between requests
    print(f"row: {i}")

Add probabilities for choosing

In [62]:
# Each verb's share of the total count; the shares sum to 1.
tmp_df.loc[:, 'prob'] = tmp_df['count'] / tmp_df['count'].sum()

Save

In [67]:
# Write the filled table to disk without the index column.
tmp_df.to_csv('verbs.csv', index=False)

# Run inference on data

In [1]:
import pandas as pd
import numpy as np
# Load the table stored in 'verbs.csv'; the code below reads its `verb` and
# `prob` columns and looks up conjugations by column name.
df = pd.read_csv('verbs.csv')

In [111]:
from numpy.random import choice

# Draw one verb at random, weighted by the `prob` column. The right-hand side
# of this assignment was missing, which made the cell a syntax error.
verb = choice(df.verb, 1, p=df.prob)
verb


array(['acariciar'], dtype=object)

In [2]:
# Conjugation categories offered as options in the selection widget.
categories = np.array(
    ['presente', 'imperfecto', 'pretérito', 'futuro', 'condicional']
)

In [36]:
from ipywidgets import widgets
from IPython.display import display, Markdown, clear_output
from ipywidgets import HTML

In [191]:
class SpanishApp:
    """Interactive ipywidgets quiz for practising Spanish verb conjugation.

    "Search verb" draws a random verb from the module-level ``df`` (weighted by
    its ``prob`` column) together with a random conjugation column matching the
    selected categories. "Check answer" compares the typed text with the stored
    form and then reveals the solution.

    The assembled widget tree is available as ``self.box``.
    """

    def __init__(self):
        self._initialize_attributes()
        self._create_widgets()

        # Nothing to ask or to reveal until the first verb has been drawn.
        self.set_question_visibility('hidden')
        self.set_answer_visibility('hidden')

        self.search_btn.on_click(self.search_new_verb)
        self.check_answer_btn.on_click(self.check_answer)

    ###
    # Constructors
    ###
    def _initialize_attributes(self):
        """Set the quiz state to "no question drawn yet"."""
        self.conjugated_verb = None
        self.infinitive = None
        self.conjugated_category = None
        # When False, columns whose name contains 'vos' are never asked.
        self.include_vos = False

    def _create_widgets(self):
        """Create all widgets and assemble them into ``self.box``."""
        self.html_title = HTML("""<h1><center><u>Spanish conjugation tool</u></center></h1><h3><center>Similarity search engine</center></h3>""")
        # "<hline>" is not an HTML element and rendered nothing; "<hr>" is the
        # horizontal rule.
        self.hline = HTML("<hr>")
        self.html2 = HTML("""<br><h3><center>Choose what conjugations to include</center></h3>""")
        self.base_word_html = HTML(f"<p>{self.infinitive}, {self.conjugated_category}")
        self.answer_html = HTML(f"{self.conjugated_verb}")
        self.answer_indicator = widgets.HTML()

        self.cat_selection = widgets.SelectMultiple(options=categories)
        self.answer_input = widgets.Text(description='Answer:')

        self.search_btn = widgets.Button(description='Search verb')
        self.check_answer_btn = widgets.Button(description='Check answer')

        self.verb_output = widgets.Output()
        self.title_output = widgets.Output()

        vbox_layout = widgets.Layout(display='flex',
                                     flex_flow='column',
                                     align_items='center',
                                     width='100%')

        check_answer_box = widgets.HBox(children=[self.answer_input,
                                                  self.answer_indicator])
        check_answer_btn_box = widgets.HBox(children=[self.search_btn,
                                                      self.check_answer_btn])

        self.box = widgets.VBox(
            children=[
                self.html_title, self.hline, self.html2, self.hline,
                self.cat_selection, check_answer_btn_box,
                self.base_word_html, check_answer_box,
                self.answer_html,
            ],
            layout=vbox_layout)

    ###
    # Updates
    ###
    def update_answer(self, value):
        """Store the expected conjugated form and refresh the answer widget."""
        self.conjugated_verb = value
        self._update_answer_html()

    def update_infinitive(self, value):
        """Store the verb being asked and refresh the question line."""
        self.infinitive = value
        self._update_baseword()

    def update_conjugated_category(self, value):
        """Store the conjugation column being asked and refresh the question line."""
        self.conjugated_category = value
        self._update_baseword()

    def _update_baseword(self):
        """Render the verb and the conjugation column into the question line."""
        self.base_word_html.value = f"<p><b>{self.infinitive}</b> (<i>{self.conjugated_category}</i>)"

    ###
    # Setters
    ###
    def set_question_visibility(self, visibility):
        """Show or hide the question widgets and clear the previous attempt.

        Args:
            visibility: Value assigned to each widget's ``layout.visibility``
                ('visible' or 'hidden').
        """
        self.base_word_html.layout.visibility = visibility
        self.check_answer_btn.layout.visibility = visibility
        self.answer_input.layout.visibility = visibility
        self.answer_input.value = ""
        self.answer_indicator.value = ''

    def set_answer_visibility(self, visibility):
        """Show or hide the solution and the Correct/Incorrect indicator.

        Args:
            visibility: Value assigned to each widget's ``layout.visibility``
                ('visible' or 'hidden').
        """
        self.answer_html.layout.visibility = visibility
        # Visibility lives on the widget's layout; assigning it directly on the
        # widget only created an unused attribute and never hid the indicator.
        self.answer_indicator.layout.visibility = visibility

    def _update_answer_html(self):
        """Render the expected form into the answer widget."""
        self.answer_html.value = f"<b>{self.conjugated_verb}</b>"

    ###
    # Eventlisteners
    ###
    def search_new_verb(self, b):
        """Button callback: draw a new verb and conjugation column to ask.

        Args:
            b: The clicked button (unused).
        """
        self.set_answer_visibility('hidden')

        verb = np.random.choice(df.verb, 1, p=df.prob)
        row_chosen = df.verb.isin(verb)

        # With nothing selected there is no category to match against; fall
        # back to every offered option instead of failing inside the callback.
        selected = self.cat_selection.value
        if len(selected) == 0:
            selected = self.cat_selection.options

        # A column qualifies when its name contains any selected category.
        categories_chosen = np.sum([df.columns.str.contains(x) for x in selected],
                                   axis=0, dtype=bool)
        cat_str_chosen = df.columns[categories_chosen].values

        # Remove vosotros (the flag is an instance attribute; the bare name
        # `include_vos` raised NameError here).
        if not self.include_vos:
            cat_str_chosen = [x for x in cat_str_chosen if 'vos' not in x]

        # No matching column left: keep the current state rather than letting
        # np.random.choice raise on an empty sequence.
        if len(cat_str_chosen) == 0:
            return

        # Choose the conjugation column to be asked.
        cat_to_conj = str(np.random.choice(cat_str_chosen, 1)[0])

        # First matching row wins if the verb is listed more than once.
        conjugated_verb = df.loc[row_chosen, cat_to_conj].iloc[0]

        self.update_answer(str(conjugated_verb))
        self.update_infinitive(str(verb[0]))
        self.update_conjugated_category(cat_to_conj)
        self.set_question_visibility('visible')

    def check_answer(self, b):
        """Button callback: grade the typed answer and reveal the solution.

        Surrounding whitespace is ignored on both sides of the comparison, so a
        stray trailing space does not turn a right answer into "Incorrect".

        Args:
            b: The clicked button (unused).
        """
        answer = self.answer_input.value.strip()
        expected = self.conjugated_verb
        if expected is not None and answer == expected.strip():
            self.answer_indicator.value = 'Correct'
        else:
            self.answer_indicator.value = 'Incorrect'
        self.set_answer_visibility('visible')
        
# Build the quiz and render its top-level container in the cell output.
app = SpanishApp()
display(app.box)

VBox(children=(HTML(value='<h1><center><u>Spanish conjugation tool</u></center></h1><h3><center>Similarity sea…