In [1]:
import json
import string
import random
import re # Regular Expressions (regex)
import urllib.request

import numpy as np

# Para leer y parsear el texto en HTML de wikipedia
import bs4 as bs

import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sys
import gradio as gr

# Descargar el diccionario
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Datos
Se consumirán los datos del artículo de wikipedia sobre el futbol en ingles.

In [2]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/History_of_association_football')
raw_html = raw_html.read()

# Parsear artículo, 'lxml' es el parser a utilizar
article_html = bs.BeautifulSoup(raw_html, 'lxml')

# Encontrar todos los párrafos del HTML (bajo el tag <p>)
# y tenerlos disponible como lista
article_paragraphs = article_html.find_all('p')

article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()

In [3]:
# Demos un vistazo
article_text

'\nthe history of association football, more commonly known as football or soccer, stretches back at least to the medieval times.[1][2][3] some predecessors of football may date back to ancient greece and rome, and similar games were played in ancient china and japan.[4] the history of football in britain dates at least to the eighth century ce.[5]\nthe development of association football has its origins in medieval ball games and english public school games. the modern game of association football originated in the mid-nineteenth century by the efforts of english football clubs to standardize the varying sets of football rules, culminating in the formation of the football association (the fa) in london, england, in 1863, and their issuing of the laws of the game in the same year.[6][7] the "laws of the game" were later trusted to the international football association board (ifab) and then adopted by the international association football federation (fifa). this set of rules drafted b

In [4]:
print("Cantidad de caracteres en la nota:", len(article_text))

Cantidad de caracteres en la nota: 34961


### 2 - Preprocesamiento
- Remover caracteres especiales
- Quitar espacios o saltos

In [5]:
# Repaso de regex:
# https://docs.python.org/3/library/re.html

# Para practicar regex:
# https://regex101.com/

# el inicio con 'r' antes de cada string indica que se interprete como raw string
# '\n' es interpretado por Python como salto de linea
# r'\n' es interpretado por Python como el string formado por dos caracteres: 
#  backslash y n

# substituir con regex con espacio vacío:
text = re.sub(r'\[[0-9]*\]', ' ', article_text) # substituir los números entre corchetes
# (notar que los corchetes son interpretados literalmente por los backlsash)
text = re.sub(r'\s+', ' ', text) # substituir más de un caracter de espacio, salto de línea o tabulación

# probar en regex101 con los patrones anteriores:
# 'Hola [1], [], [ estoy bien   [123]. [12sss]. OK!   .'

In [6]:
# Demos un vistazo
text

' the history of association football, more commonly known as football or soccer, stretches back at least to the medieval times. some predecessors of football may date back to ancient greece and rome, and similar games were played in ancient china and japan. the history of football in britain dates at least to the eighth century ce. the development of association football has its origins in medieval ball games and english public school games. the modern game of association football originated in the mid-nineteenth century by the efforts of english football clubs to standardize the varying sets of football rules, culminating in the formation of the football association (the fa) in london, england, in 1863, and their issuing of the laws of the game in the same year. the "laws of the game" were later trusted to the international football association board (ifab) and then adopted by the international association football federation (fifa). this set of rules drafted by the fa allowed clubs 

In [7]:
print("Cantidad de caracteres en el texto:", len(text))

Cantidad de caracteres en el texto: 34724


### 3 - Dividir el texto en sentencias y en palabras

In [8]:
corpus = nltk.sent_tokenize(text) # divide en oraciones
words = nltk.word_tokenize(text) # divide en términos

In [9]:
# Demos un vistazo
corpus[:10]

[' the history of association football, more commonly known as football or soccer, stretches back at least to the medieval times.',
 'some predecessors of football may date back to ancient greece and rome, and similar games were played in ancient china and japan.',
 'the history of football in britain dates at least to the eighth century ce.',
 'the development of association football has its origins in medieval ball games and english public school games.',
 'the modern game of association football originated in the mid-nineteenth century by the efforts of english football clubs to standardize the varying sets of football rules, culminating in the formation of the football association (the fa) in london, england, in 1863, and their issuing of the laws of the game in the same year.',
 'the "laws of the game" were later trusted to the international football association board (ifab) and then adopted by the international association football federation (fifa).',
 'this set of rules drafted

In [10]:
# Demos un vistazo
words[:20]

['the',
 'history',
 'of',
 'association',
 'football',
 ',',
 'more',
 'commonly',
 'known',
 'as',
 'football',
 'or',
 'soccer',
 ',',
 'stretches',
 'back',
 'at',
 'least',
 'to',
 'the']

In [11]:
print("Vocabulario:", len(words))

Vocabulario: 6553


### 4 - Funciones de ayuda para limpiar y procesar el input del usuario
- Lematizar los tokens de la oración
- Quitar símbolos de puntuación

In [12]:
lemmatizer = WordNetLemmatizer()

def perform_lemmatization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

# ord() nos da el código Unicode para un caracter dado
punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
    # 1 - reduce el texto a mínuscula (string.lower())
    # 2 - quitar los simbolos de puntuacion (string.translate())
    # 3 - realiza la tokenización (nltk.word_tokenize)
    # 4 - realiza la lematización (nuestra función perform_lemmatization)
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))

### 5 - Utilizar vectores TF-IDF y la similitud coseno construido con el corpus del artículo de wikipedia

In [13]:

def generate_response(user_input, corpus):
    response = ''
    # Sumar al corpus la pregunta del usuario para calcular
    # su cercania con otros documentos/sentencias
    # la entrada del usuario se usa para tokenizar y vectorizar
    corpus.append(user_input)

    # Crear un vectorizar TFIDF que quite las "stop words" del ingles y utilice
    # nuestra funcion para obtener los tokens lematizados "get_processed_text"
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')

    # Crear los vectores a partir del corpus
    all_word_vectors = word_vectorizer.fit_transform(corpus)

    # Calcular la similitud coseno entre todas los documentos excepto el agregado (el útlimo "-1")
    # NOTA: con los word embedings veremos más en detalle esta matriz de similitud
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)

    # Obtener el índice del vector más cercano a nuestra oración
    # --> descartando la similitud contra nuestor vector propio
    similar_sentence_number = similar_vector_values.argsort()[0][-2]
    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0: # si la similaridad coseno fue nula (ningún término en común)
        response = "I am sorry, I could not understand you"
    else:
        response = corpus[similar_sentence_number] # obtener el documento del corpus más similar
    
    corpus.remove(user_input)
    return response

### 6 - Ensayar el sistema
El sistema intentará encontrar la parte del artículo que más se relaciona con nuestro texto de entrada. Sugerencias a ensayar:
- Grand slam
- tournaments
- nadal
- artificial intelligence

In [14]:
# Se utilizará gradio para ensayar el bot
# Herramienta poderosa para crear interfaces rápidas para ensayar modelos
# https://gradio.app/

!{sys.executable} -m pip install gradio --quiet


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
def bot_response(human_text):
    print("Q:", human_text)    
    resp = generate_response(human_text.lower(), corpus)
    print("A:", resp)
    return resp

iface = gr.Interface(
    fn=bot_response,
    inputs=["textbox"],
    outputs="text",
    layout="vertical")

iface.launch(debug=True)

  iface = gr.Interface(


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Q: origen del futbol?




A: la liga, spain's national league, had its first season in 1928, with its participants based on the previous winners of the copa del rey, which began in 1902. the modern german national league, the bundesliga was late in the foundation, especially for european countries, given it wasn't founded until 1963. the german football association was founded as early as 1900 with the first german football champions being leipzig in 1903. however, prior to the formation of the bundesliga, german football was played at an amateur level in a large number of regional leagues.
Q: ball history
A: (for more details see: history of american football and 1869 college football season.)
Q: best player
A: there were also rules preventing professional players from playing for more than one club in a season, without obtaining special permission, and all professional players had to be registered with the f.a.
Q: offside?
A: other schools (in particular eton college, shrewsbury school and harrow) favoured a 



### Alumno

- Tomar un ejemplo de los bots utilizados (uno de los dos) y construir el propio.
- Sacar conclusiones de los resultados.

__IMPORTANTE__: Recuerde para la entrega del ejercicio debe quedar registrado en el colab las preguntas y las respuestas del BOT para que podamos evaluar el desempeño final.