In [1]:
import math

In [2]:
class CAH:
    def __init__(self, linkage_method="average"):
        self.data = []
        self.linkage_method = linkage_method
        # The default linkage_method is "average"

    @staticmethod
    def tokenize(text: str, lang_in_English: str) -> list:
        """
        This function tokenizes a text
        arg1 = text
        input: a text to tokenize
        output: a list of tokens
        """
        # Importing the nltk module
        import nltk

        # Downloading the tokenizer if not already done
        # nltk.download('punkt')

        # Tokenizing the text using the nltk tokenizer
        # The language used is determined by the lang_in_English variable and standardized
        tokens = nltk.word_tokenize(text, language=lang_in_English.lower())
        # Tokens are converted to lowercase and only alphanumeric tokens are kept
        tokens = [token.lower() for token in tokens if token.isalnum()]

        # Returns the list of tokens
        return tokens

    @staticmethod
    def text2vec(text, lang_in_English: str) -> dict:
        """
        Creates a dictionary of vectors from a text
        arg1 = text to process
        input = text to vectorize
        output: dictionary of vectors
        """
        # Importing the nltk module
        import nltk
        from nltk.corpus import stopwords

        # Downloading stopwords if necessary
        # nltk.download('stopwords')

        # Creating a set of stopwords for the specified language
        list_stopwords = set(stopwords.words(lang_in_English.lower()))

        # Defining a filtering function that removes stopwords from the text
        filter_stopfr = lambda text: [token for token in text if token.lower() not in list_stopwords]

        # Tokenizing and filtering the text using the tokenize method of the CAH object
        tokens = filter_stopfr(CAH.tokenize(text, lang_in_English))

        # Initializing an empty dictionary to store the vectors
        vector = {}

        # Iterating over all tokens in the text
        for token in tokens:
            # If the token is already present in the dictionary, increment its counter
            if token in vector:
                vector[token] += 1
            # Otherwise, add the token to the dictionary with a counter initialized to 1
            else:
                vector[token] = 1

        # Returns the dictionary of vectors
        return vector

    def url2text(self, list_urls: list) -> dict:
        """
        Retrieves text from a list of URLs
        """
        # Importing the requests module to make HTTP requests
        import requests
        # Importing BeautifulSoup to parse HTML content
        from bs4 import BeautifulSoup
        # Importing the unquote function from the urllib.parse module to decode URLs
        from urllib.parse import unquote

        # Initializing a dictionary to store the texts retrieved from the URLs
        texts = {}

        # For each URL in the provided list of URLs
        for url in list_urls:
            # Making an HTTP GET request to retrieve the content of the URL
            response = requests.get(url)
            # Parsing the HTML content of the response
            parsed = BeautifulSoup(response.text, "html.parser")
            # Finding all <div> elements with the class 'mw-parser-output'
            text = parsed.find_all('div', class_='mw-parser-output')

            # Checking if text is found on the web page
            if len(text):
                # Extracting the text key by analyzing the URL which will be used as the "label" in data
                key = url.split("/")[-1]
                # Decoding the URL key to handle all special characters such as éèä etc.
                key = unquote(key)
                # Storing the found text in the dictionary under the decoded key
                texts[key] = text[0].text

        # Returning the dictionary containing the texts retrieved from the URLs
        return texts

    def add_texts(self, dict_urls: dict, lang_in_English: str):
        """
        Adds multiple texts to the data list
        Input: Text dictionary, language written in English
        Output: list of dictionary for each text
        """
        # Iterating over each key-value pair in the dict_urls dictionary
        for key, value in dict_urls.items():
            # Calling the add_text method to add each text to the data list
            # the label will correspond to the key (retrieved during urls2text) of the dict_urls dictionary
            # it is important to always mention a language that we use during tokenization in text2vec
            self.add_text(key, value, lang_in_English)

        # Returning the updated data list after adding the texts
        return self.data

    def add_text(self, label, text, lang_in_English: str):
        """
        Adds a text to data
        Input: a label, a text, and a language written in English (for the tokenizer
        to apply the correct methods)
        """
        # Adding to data a dictionary with the given label as an argument
        # the vector value is the result of text2vec of the text given as an argument
        self.data.append({"label": label, "vector": CAH.text2vec(text, lang_in_English)})

    def del_text(self, label: str):
        """
        Removes the text corresponding to the label
        """
        # Removing the text based on the label given as an argument
        # by recreating a new data list without the information 


# Training data

Here is some training data. 
I have chosen to use Wikipedia pages of Sherlock Holmes novels. It is important for the algorithm to work properly that the urls are put in the form of a list of urls. 

In [3]:
urls = [
    "https://fr.wikipedia.org/wiki/Un_scandale_en_Boh%C3%AAme",
    "https://fr.wikipedia.org/wiki/La_Ligue_des_rouquins",
    "https://fr.wikipedia.org/wiki/Une_affaire_d%27identit%C3%A9",
    "https://fr.wikipedia.org/wiki/Le_Myst%C3%A8re_du_Val_Boscombe",
]

# Defining the objet Sherlock
sherlock = CAH()
# Generating a dictionary with the content from the URLs
sherlock_books = sherlock.url2text(urls)

# Adding the texts from the sherlock_books dictionary to our object
# Specifying the language of the texts as French
sherlock.add_texts(sherlock_books, "French")

# Then we classify the data of the Sherlock Holmes stories
print(sherlock.classify(3, 2.5))

# Texts can also be deleted
sherlock.del_text("Le_Mystere_du_Val_Boscombe")



[[0], [1], [3, 2]]


'Le text Le_Mystere_du_Val_Boscombe a été supprimé'

In [4]:
# Calculating the tf-idf of Sherlock novels
sherlock.tf_idf()

{'Un_scandale_en_Bohême': {'articles': 0.0,
  'homonymes': 0.0018607978001609269,
  'voir': 0.0,
  'a': 0.0,
  'scandal': 0.011164786800965562,
  'in': 0.011164786800965562,
  'bohemia': 0.009303989000804634,
  'scandale': 0.0,
  'bohême': 0.0,
  'illustration': 0.0003861504328211823,
  'sidney': 0.0,
  'paget': 0.0,
  '1891': 0.0,
  'publication': 0.0,
  'auteur': 0.0,
  'arthur': 0.0,
  'conan': 0.0,
  'doyle': 0.0,
  'titre': 0.0,
  'langue': 0.0,
  'anglais': 0.0,
  'parution': 0.0,
  'juillet': 0.0027911967002413906,
  'strand': 0.0,
  'magazine': 0.0,
  'mensuel': 0.0,
  'recueil': 0.0,
  'aventures': 0.0,
  'sherlock': 0.0,
  'holmes': 0.0,
  'intrigue': 0.0,
  'date': 0.0,
  'fictive': 0.0,
  'mars': 0.005582393400482781,
  '1888': 0.001158451298463547,
  '1': 0.0,
  'personnages': 0.0,
  'holmesdocteur': 0.0,
  'watsonsire': 0.0018607978001609269,
  'von': 0.007443191200643707,
  'ormstein': 0.005582393400482781,
  'client': 0.0018607978001609269,
  'irène': 0.0186079780016092