## **Practices** - *1° Partial*
* September 30°, 2025
#### ESCOM - IPN: *Natural Language Processing*
#### Prof. Marco Antonio

#### *B.S. in Data Science* - 6AV1
> Sánchez García Miguel Alexander

#### **1° Practice - Tokenization**

In [None]:
import time
import tracemalloc

class Tokenizer:
    """ Class for tokenizing text """
    delimiter = ""
    
    """ Constructor """
    def __init__(self):
        self.delimiter = " \t\n\r\f\v" + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}"

    """ Methods """
    # Verifies if the word is only numbers or alphanumeric
    def verify_word(self, text:str) -> str:
        numbers = "0123456789"
        is_only_number = True
        word = ""
        for char in text:
            if char not in numbers:
                is_only_number = False
                break 

        if is_only_number:
            word = text
        else:
            # Keep alphabetic characters, remove only numbers from mixed words
            for char in text:
                if char.isalpha():  # Keep letters
                    word += char
        return word
    
    # Converts all characters in the token to lowercase
    def to_lowercase(self, token:list) -> list:
        for i in range(len(token)):
            for c in token[i]:
                if (c >= 'A') and (c <= 'Z'):
                    token[i] = token[i].replace(c, chr(ord(c) + 32))
        return token
    
    # Delete stopwords from the token
    def remove_stopwords(self, token:list) -> list:
        stopwords = ['the', 'of', 'in', 'on', 'a', 'an', 'some', 'and', 'that', 'this', 'mi', 'es', 'a', 'lo', 'la', 'el']
        return [word for word in token if word not in stopwords]
        
        
    # Tokenizes the input text
    def tokenize(self, text: str) -> list:              
        t_init = time.time()
        tracemalloc.start()
        
        token = []
        n = len(text)
        
        i = 0
        j = i
        
        while i <= n - 1:
            if (text[i] in self.delimiter) and (text[j] in self.delimiter):
                j += 1
            elif (text[i] in self.delimiter):
                word_verified = self.verify_word(text[j:i])
                if word_verified:  # Only add non-empty words
                    token.append(word_verified)
                j = i + 1
            i += 1

        # Handle the last word if the text doesn't end with a delimiter
        if j < n:
            word_verified = self.verify_word(text[j:n])
            if word_verified:
                token.append(word_verified)

        token = self.to_lowercase(token)
        
        token = self.remove_stopwords(token)

        # print("Time:", time.time() - t_init)
        # print("Memory:", tracemalloc.get_traced_memory())
        tracemalloc.stop()
        
        return token

In [3]:
word = " Hoy hay clase123 de PNL. Hay jun23ta a las 1945. o   holavcghv.   gcv  Tienen tarea á ñ "

tokenizer = Tokenizer()
print(tokenizer.tokenize(word))

['Hoy', 'hay', 'clase', 'de', 'PNL', 'Hay', 'junta', 'a', 'las', '1945', 'o', 'holavcghv', 'gcv', 'Tienen', 'tarea', 'á', 'ñ']


#### **2° Read Text File & One Hot Encoding**

**a.** Import the tokenizers

In [None]:
import time
import tracemalloc

class Tokenizer:
    """ Class for tokenizing text """
    delimiter = ""
    
    """ Constructor """
    def __init__(self):
        self.delimiter = " \t\n\r\f\v" + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}"

    """ Methods """
    # Verifies if the word is only numbers or alphanumeric
    def verify_word(self, text:str) -> str:
        numbers = "0123456789"
        is_only_number = True
        word = ""
        for char in text:
            if char not in numbers:
                is_only_number = False
                break 

        if is_only_number:
            word = text
        else:
            # Keep alphabetic characters, remove only numbers from mixed words
            for char in text:
                if char.isalpha():  # Keep letters
                    word += char
        return word
    
    # Converts all characters in the token to lowercase
    def to_lowercase(self, token:list) -> list:
        for i in range(len(token)):
            for c in token[i]:
                if (c >= 'A') and (c <= 'Z'):
                    token[i] = token[i].replace(c, chr(ord(c) + 32))
        return token
    
    # Delete stopwords from the token
    def remove_stopwords(self, token:list) -> list:
        stopwords = ['the', 'of', 'in', 'on', 'a', 'an', 'some', 'and', 'that', 'this', 'mi', 'es', 'a', 'lo', 'la', 'el']
        return [word for word in token if word not in stopwords]
        
        
    # Tokenizes the input text
    def tokenize(self, text: str) -> list:              
        t_init = time.time()
        tracemalloc.start()
        
        token = []
        n = len(text)
        
        i = 0
        j = i
        
        while i <= n - 1:
            if (text[i] in self.delimiter) and (text[j] in self.delimiter):
                j += 1
            elif (text[i] in self.delimiter):
                word_verified = self.verify_word(text[j:i])
                if word_verified:  # Only add non-empty words
                    token.append(word_verified)
                j = i + 1
            i += 1

        # Handle the last word if the text doesn't end with a delimiter
        if j < n:
            word_verified = self.verify_word(text[j:n])
            if word_verified:
                token.append(word_verified)

        token = self.to_lowercase(token)
        
        token = self.remove_stopwords(token)

        # print("Time:", time.time() - t_init)
        # print("Memory:", tracemalloc.get_traced_memory())
        tracemalloc.stop()
        
        return token

**b.** Read the pdf file

In [1]:
import fitz

# Get the text from a PDF file
doc = fitz.open("el principito.pdf")

# Extract text from each page since third page
text = "\n".join([page.get_text() for page in doc[2:]])  

# Print the first 100 characters of the extracted text
print(text[:100])


número 2 era así:
Las personas mayores me aconsejaron abandonar el dibujo de serpientes
boas, ya fue


**c.** One Hot Encoding

In [6]:
tokenizer = Tokenizer()

token_text = tokenizer.tokenize(text)
print(len(token_text)) 
print(token_text[:100])


13074
['número', '2', 'era', 'así', 'las', 'personas', 'mayores', 'me', 'aconsejaron', 'abandonar', 'el', 'dibujo', 'de', 'serpientes', 'boas', 'ya', 'fueran', 'abiertas', 'o', 'cerradas', 'y', 'poner', 'más', 'interés', 'en', 'la', 'geografía', 'la', 'historia', 'el', 'cálculo', 'y', 'la', 'gramática', 'de', 'esta', 'manera', 'la', 'edad', 'de', 'seis', 'años', 'abandoné', 'una', 'magnífica', 'carrera', 'de', 'pintor', 'había', 'quedado', 'desilusionado', 'por', 'el', 'fracaso', 'de', 'mis', 'dibujos', 'número', '1', 'y', 'número', '2', 'las', 'personas', 'mayores', 'nunca', 'pueden', 'comprender', 'algo', 'por', 'sí', 'solas', 'y', 'es', 'muy', 'aburrido', 'para', 'los', 'niños', 'tener', 'que', 'darles', 'una', 'y', 'otra', 'vez', 'explicaciones', 'tuve', 'pues', 'que', 'elegir', 'otro', 'oficio', 'y', 'aprendía', 'pilotear', 'aviones', 'he', 'volado', 'un']


In [7]:
# Get the unique words from the tokenized text
unique_words = set(token_text)
print(len(unique_words))
print(unique_words)

2659
{'gustó', 'juez', 'pareció', 'encogiéndose', 'miel', 'mirarlas', 'actos', 'fueron', 'pan', 'aspecto', 'mires', 'querer', 'callarse', 'diciéndole', 'cortésmente', 'ayudarle', 'mariposas', 'necesidad', 'fácil', 'menos', 'bosteces', 'cuando', 'temo', 'mitin', 'olvidar', 'irritó', 'exacto', 'explicarse', 'instante', 'viento', 'cambia', 'secreto', 'pétalos', 'planetas', 'serás', 'curioso', 'compadecido', 'regalaré', 'dónde', 'veía', 'significaba', 'amapolas', 'sale', 'conocimiento', 'terribles', 'turco', 'estén', 'caída', 'irreparable', 'x', 'contar', 'crecer', 'mente', 'cae', 'cuero', 'amanecer', 'desembarazarse', 'país', 'complicado', 'espinas', 'gusto', 'quedado', 'tentativa', 'erupciones', 'poder', 'habita', 'helado', 'vergüenza', 'fuentes', 'volaba', 'juzgar', 'sentían', 'manzano', 'cuestión', 'obedecido', 'domesticado', 'estar', 'gana', 'dedos', 'anda', 'boas', 'pareceré', 'esperé', 'pensó', 'reflexiones', 'emociona', 'es', 'ciento', 'tiene', 'volví', 'diciéndome', 'ave', 'mediod

In [17]:
# One Hot Encoding
import pandas as pd
class OneHotEncoder:
    """ Class for One Hot Encoding """
    def __init__(self):
        pass
    
    def fit_transform(self, token:list) -> dict:
        unique_words = set(token)
        # print(len(unique_words))
        # Order the set alphabetically
        unique_words = sorted(unique_words)
        one_hot_df = pd.DataFrame(0, index=range(len(unique_words)), columns=list(unique_words))
        for i, word in enumerate(unique_words):
            one_hot_df.at[i, word] = 1
        return one_hot_df

In [18]:
oh_encoder = OneHotEncoder()

one_hot_token = oh_encoder.fit_transform(token_text)

one_hot_token

Unnamed: 0,1,1909,1920,2,325,3251,326,327,328,329,...,ópera,órdenes,última,últimos,única,únicas,único,únicos,útil,útiles
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2656,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2657,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


#### **3° TF-IDF Matrix**

**a.** Creation of documents

In [6]:
# Three documents about SpongeBob's love for his job at the Krusty Krab

document1 = """
SpongeBob SquarePants has an unparalleled passion for his work at the Krusty Krab that stems from his genuine love of cooking and serving others. Every morning, he wakes up with boundless enthusiasm, eager to flip patties and create the perfect Krabby Patty for every customer who walks through the doors. His dedication goes beyond mere employment; it represents his life's calling and purpose. The grill becomes his canvas, and each burger is a masterpiece crafted with care, precision, and an infectious joy that radiates throughout the restaurant. SpongeBob finds deep satisfaction in the sizzle of the grill, the smell of fresh ingredients, and the smiles on customers' faces when they taste his culinary creations. His work ethic is legendary in Bikini Bottom, often staying late to perfect recipes or arriving early to ensure everything is spotless and ready for another day of service. The Krusty Krab isn't just a workplace for SpongeBob; it's where his dreams come alive, where his talents shine brightest, and where he can make a positive difference in the lives of everyone around him through the simple yet profound act of preparing delicious food with love and dedication.
"""

document2 = """
The camaraderie and relationships SpongeBob has built at the Krusty Krab are fundamental to why he cherishes his position as the restaurant's fry cook. Working alongside his best friend and neighbor Squidward, despite their contrasting personalities, provides SpongeBob with daily opportunities for friendship and shared experiences. His deep respect and admiration for Mr. Krabs, whom he views as both a mentor and father figure, motivates him to exceed expectations and contribute to the restaurant's success. SpongeBob thrives in the collaborative environment where each team member plays a vital role in creating memorable dining experiences for their customers. The fast-paced nature of the kitchen energizes him, and he finds joy in the choreographed dance of food preparation during busy lunch rushes. His infectious optimism and unwavering commitment to excellence inspire others around him, creating a positive work atmosphere that extends beyond the kitchen walls. SpongeBob genuinely cares about each customer's satisfaction, often going above and beyond to ensure their visit to the Krusty Krab is exceptional. This personal investment in customer happiness and the success of the business makes every workday feel meaningful and rewarding, reinforcing his belief that he has found his true calling in the culinary world of Bikini Bottom.
"""

document3 = """
SpongeBob's love for his job at the Krusty Krab is deeply rooted in his appreciation for the craft of cooking and his desire to bring happiness to others through food. He approaches each Krabby Patty with the precision of an artist and the heart of someone who understands that a great meal can brighten someone's entire day. The technical aspects of grilling fascinate him – from achieving the perfect temperature to timing each flip with mathematical precision. His knowledge of ingredients, cooking techniques, and food safety demonstrates a professional commitment that goes far beyond what most would expect from a fry cook position. SpongeBob takes immense pride in maintaining the Krusty Krab's reputation as Bikini Bottom's premier dining establishment, understanding that his individual contribution directly impacts the restaurant's legacy and success. The creative challenges of menu development and seasonal specials excite him, providing opportunities to innovate while respecting the time-honored traditions that make the Krabby Patty special. His job allows him to combine his natural talents with his generous spirit, creating an environment where work doesn't feel like work but rather like a daily celebration of food, friendship, and community service. This perfect alignment of personal values, professional skills, and workplace culture explains why SpongeBob approaches each day at the Krusty Krab with unwavering enthusiasm and genuine gratitude for the opportunity to do what he loves most.
"""

print("Document 1 word count:", len(document1.split()))
print("Document 2 word count:", len(document2.split())) 
print("Document 3 word count:", len(document3.split()))
print("\nDocuments created successfully!")

Document 1 word count: 192
Document 2 word count: 201
Document 3 word count: 227

Documents created successfully!


**b.** Creation of the TF-IDF matrix

In [7]:
import pandas as pd
from math import log

class TF_IDF(Tokenizer):
    """ Class for creating the TF-IDF matrix """
    
    """ Constructor """
    def __init__(self, docs:list):
        # Initialize the parent Tokenizer class
        super().__init__() 
        
        self.documents = docs
        self.tokens = []
        self.vocabulary = set()
        
        # Tokenize each document and build vocabulary
        for doc in self.documents:
            doc_tokens = self.tokenize(doc)
            self.tokens.append(doc_tokens)
            self.vocabulary.update(doc_tokens)

        # Convert vocabulary to sorted list for consistent column order
        self.vocabulary = sorted(list(self.vocabulary))

    """ Methods """
    # Compute term frequency for a given token list
    def compute_tf(self, token_list: list) -> pd.Series:
        # Create a Series with vocabulary as index, initialized to 0
        tf = pd.Series(0, index=self.vocabulary)
        
        # Count occurrences of each word
        for word in token_list:
            if word in tf.index:
                tf[word] += 1
        
        return tf
    
    # Compute inverse document frequency for the entire corpus
    def compute_idf(self) -> pd.Series:
        N = len(self.documents)
        idf = pd.Series(0.0, index=self.vocabulary)
        
        for word in self.vocabulary:
            # Count how many documents contain this word
            doc_count = sum(1 for doc_tokens in self.tokens if word in doc_tokens)
            # Calculate IDF using the smoothed formula: log(N / (1 + doc_count))
            idf[word] = log(N / (1 + doc_count))
        
        return idf

    # Compute the TF-IDF matrix
    def compute_tf_idf(self):
        # Compute TF for each document
        tf_matrix = []
        for i, doc_tokens in enumerate(self.tokens):
            tf_series = self.compute_tf(doc_tokens)
            tf_matrix.append(tf_series)
        
        # Create TF DataFrame
        tf_df = pd.DataFrame(tf_matrix, index=[f"Doc_{i+1}" for i in range(len(self.documents))])
        
        # Compute IDF
        idf_series = self.compute_idf()
        
        # Compute TF-IDF by multiplying TF matrix with IDF vector
        tf_idf_matrix = tf_df.multiply(idf_series, axis=1)
        
        return tf_idf_matrix

In [8]:
tf_idf = TF_IDF([document1, document2, document3])

print("Vocabulary size:", len(tf_idf.vocabulary))
print("Number of documents:", len(tf_idf.documents))

matrix = tf_idf.compute_tf_idf()

matrix

Vocabulary size: 300
Number of documents: 3


Unnamed: 0,about,above,achieving,act,admiration,alignment,alive,allows,alongside,another,...,whom,why,with,work,workday,working,workplace,world,would,yet
Doc_1,0.0,0.0,0.0,0.405465,0.0,0.0,0.405465,0.0,0.0,0.405465,...,0.0,0.0,-0.863046,-0.575364,0.0,0.0,0.0,0.0,0.0,0.405465
Doc_2,0.405465,0.405465,0.0,0.0,0.405465,0.0,0.0,0.0,0.405465,0.0,...,0.405465,0.0,-0.287682,-0.287682,0.405465,0.405465,0.0,0.405465,0.0,0.0
Doc_3,0.0,0.0,0.405465,0.0,0.0,0.405465,0.0,0.405465,0.0,0.0,...,0.0,0.0,-1.150728,-0.575364,0.0,0.0,0.0,0.0,0.405465,0.0


**c.** Coeficient analysis

In [9]:
# Remove columns (words) that are all 0
matrix_withoutZeroes = matrix.loc[:, (matrix != 0).any(axis=0)]

print("TF-IDF Matrix without words with value 0:")
matrix_withoutZeroes

TF-IDF Matrix without words with value 0:


Unnamed: 0,about,above,achieving,act,admiration,alignment,alive,allows,alongside,another,...,where,while,whom,with,work,workday,working,world,would,yet
Doc_1,0.0,0.0,0.0,0.405465,0.0,0.0,0.405465,0.0,0.0,0.405465,...,-0.863046,0.0,0.0,-0.863046,-0.575364,0.0,0.0,0.0,0.0,0.405465
Doc_2,0.405465,0.405465,0.0,0.0,0.405465,0.0,0.0,0.0,0.405465,0.0,...,-0.287682,0.0,0.405465,-0.287682,-0.287682,0.405465,0.405465,0.405465,0.0,0.0
Doc_3,0.0,0.0,0.405465,0.0,0.0,0.405465,0.0,0.405465,0.0,0.0,...,-0.287682,0.405465,0.0,-1.150728,-0.575364,0.0,0.0,0.0,0.405465,0.0


In [10]:
# Get the top N most significant words per document
N = 5  # number of representative words
for doc in matrix.index:
    top_words = matrix.loc[doc].sort_values(ascending=False).head(N)
    print(f"\nMost representative words of {doc}:")
    print(top_words)


Most representative words of Doc_1:
grill         0.810930
dedication    0.810930
it            0.810930
or            0.405465
simple        0.405465
Name: Doc_1, dtype: float64

Most representative words of Doc_2:
their            1.216395
kitchen          0.810930
experiences      0.810930
about            0.405465
relationships    0.405465
Name: Doc_2, dtype: float64

Most representative words of Doc_3:
like            0.81093
professional    0.81093
what            0.81093
most            0.81093
approaches      0.81093
Name: Doc_3, dtype: float64


In [11]:
# Globally most important words
global_top = matrix.sum(axis=0).sort_values(ascending=False).head(10)
print("\nMost significant words in the entire corpus:")
print(global_top)


Most significant words in the entire corpus:
their           1.216395
grill           0.810930
approaches      0.810930
professional    0.810930
someone         0.810930
most            0.810930
experiences     0.810930
like            0.810930
kitchen         0.810930
job             0.810930
dtype: float64
