## .ipynb

In [None]:
# Preprocesamiento de Texto - Funciones reutilizables

# === Descarga de recursos ===
import nltk
# Fetch the NLTK data used by the functions below.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer models from 'punkt_tab'; without it
# sent_tokenize/word_tokenize raise LookupError. 'punkt' is kept for older releases.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# === Librerías necesarias ===
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# === Parte 1: Tokenización ===
def tokenize_corpus(corpus):
    """Split each document into sentences and each sentence into word tokens.

    Args:
        corpus: Iterable of raw document strings.

    Returns:
        A list with one entry per document. Each entry is a list of
        sentences, and each sentence is the list of tokens produced by
        ``word_tokenize``.
    """
    return [
        [word_tokenize(sent) for sent in sent_tokenize(doc)]
        for doc in corpus
    ]

# === Parte 2.1: Convertir a minúsculas ===
def lowercase_tokens(tokenized_corpus):
    """Return a copy of the corpus with every token converted to lowercase.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list with the same structure; the input is not modified.
    """
    return [
        [[token.lower() for token in sent] for sent in doc]
        for doc in tokenized_corpus
    ]

# === Parte 2.2: Eliminar puntuación y símbolos no alfabéticos ===
def remove_punctuation(tokenized_corpus):
    """Strip ASCII punctuation from tokens and drop anything not purely alphabetic.

    Punctuation is removed *before* the alphabetic check. Filtering first
    (as the previous version did) made the translation table a no-op and
    discarded whole tokens such as "well-known" or "u.s." instead of
    cleaning them.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list with the same document/sentence structure. Each
        kept token has the characters in ``string.punctuation`` removed;
        tokens that are empty or still contain non-alphabetic characters
        (e.g. digits) after stripping are dropped.
    """
    table = str.maketrans('', '', string.punctuation)
    cleaned_corpus = []
    for document in tokenized_corpus:
        cleaned_document = []
        for sentence in document:
            # Strip first, then filter: "" (pure punctuation) fails isalpha() too.
            stripped = (word.translate(table) for word in sentence)
            cleaned_document.append([word for word in stripped if word.isalpha()])
        cleaned_corpus.append(cleaned_document)
    return cleaned_corpus

# === Parte 3: Eliminación de stopwords ===
def remove_stopwords(tokenized_corpus, language='english'):
    """Remove stopwords from every sentence of the corpus.

    Matching is an exact, case-sensitive membership test against the NLTK
    stopword list, so tokens should be lowercased beforehand.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).
        language: Name of the NLTK stopword list to use. Defaults to
            'english', which preserves the previous behavior.

    Returns:
        A new nested list with the same document/sentence structure,
        without the tokens found in the stopword list.
    """
    # Build the set once; membership tests are then O(1) per token.
    stop_words = frozenset(stopwords.words(language))
    return [
        [[word for word in sentence if word not in stop_words] for sentence in document]
        for document in tokenized_corpus
    ]

# === Parte 4.1: Stemming ===
def stem_tokens(tokenized_corpus):
    """Apply the Porter stemmer to every token in the corpus.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list with the same structure, each token replaced by
        the result of ``PorterStemmer().stem``.
    """
    stem = PorterStemmer().stem  # one stemmer instance shared by all tokens
    return [
        [[stem(token) for token in sent] for sent in doc]
        for doc in tokenized_corpus
    ]

# === Parte 4.2: Lematización ===
def lemmatize_tokens(tokenized_corpus):
    """Lemmatize every token in the corpus with the WordNet lemmatizer.

    Each token is passed to ``WordNetLemmatizer().lemmatize`` with no
    part-of-speech argument, so the lemmatizer's default is used.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list with the same structure, holding the lemmas.
    """
    lemmatize = WordNetLemmatizer().lemmatize  # one instance shared by all tokens
    return [
        [[lemmatize(token) for token in sent] for sent in doc]
        for doc in tokenized_corpus
    ]


¿Cómo usarlo?
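Guarda estas funciones en un archivo `preprocesamiento.py` (un notebook `.ipynb` no se puede importar directamente) e impórtalas desde otro notebook: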

In [None]:
from preprocesamiento import tokenize_corpus, lowercase_tokens, remove_punctuation, remove_stopwords, stem_tokens, lemmatize_tokens


Ejemplo de uso:

In [None]:
corpus = ["Hello world! This is a test.", "Natural Language Processing with NLTK."]

# Cleaning stages, innermost call first:
# tokenize -> lowercase -> strip punctuation -> drop stopwords.
tokens = remove_stopwords(
    remove_punctuation(
        lowercase_tokens(
            tokenize_corpus(corpus)
        )
    )
)

# Stemming and lemmatization are alternatives computed from the same cleaned tokens.
stems = stem_tokens(tokens)
lemmas = lemmatize_tokens(tokens)

## .py


In [None]:
# preprocesamiento.py

import nltk
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK data the functions below depend on. These calls run every
# time the module is imported.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer models from 'punkt_tab'; without it
# sent_tokenize/word_tokenize raise LookupError. 'punkt' is kept for older releases.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Tokenización
def tokenize_corpus(corpus):
    """Tokenize a corpus into documents -> sentences -> words.

    Args:
        corpus: Iterable of raw document strings.

    Returns:
        A list with one element per document; every element is a list of
        sentences, each given as the token list from ``word_tokenize``.
    """
    result = []
    for text in corpus:
        per_sentence = []
        for sent in sent_tokenize(text):
            per_sentence.append(word_tokenize(sent))
        result.append(per_sentence)
    return result

# 2. Conversión a minúsculas
def lowercase_tokens(tokenized_corpus):
    """Lowercase every token, keeping the document/sentence nesting intact.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A newly built nested list; the input is left untouched.
    """
    result = []
    for doc in tokenized_corpus:
        lowered_doc = []
        for sent in doc:
            lowered_doc.append([token.lower() for token in sent])
        result.append(lowered_doc)
    return result

# 3. Eliminación de puntuación y símbolos no alfabéticos
def remove_punctuation(tokenized_corpus):
    """Strip ASCII punctuation from tokens and drop anything not purely alphabetic.

    Punctuation is removed *before* the alphabetic check. Filtering first
    (as the previous version did) made the translation table a no-op and
    discarded whole tokens such as "well-known" or "u.s." instead of
    cleaning them.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list with the same document/sentence structure. Each
        kept token has the characters in ``string.punctuation`` removed;
        tokens that are empty or still contain non-alphabetic characters
        (e.g. digits) after stripping are dropped.
    """
    table = str.maketrans('', '', string.punctuation)
    cleaned_corpus = []
    for document in tokenized_corpus:
        cleaned_document = []
        for sentence in document:
            # Strip first, then filter: "" (pure punctuation) fails isalpha() too.
            stripped = (word.translate(table) for word in sentence)
            cleaned_document.append([word for word in stripped if word.isalpha()])
        cleaned_corpus.append(cleaned_document)
    return cleaned_corpus

# 4. Eliminación de stopwords
def remove_stopwords(tokenized_corpus, language='english'):
    """Remove stopwords from every sentence of the corpus.

    Matching is an exact, case-sensitive membership test against the NLTK
    stopword list, so tokens should be lowercased beforehand.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).
        language: Name of the NLTK stopword list to use. Defaults to
            'english', which preserves the previous behavior.

    Returns:
        A new nested list with the same document/sentence structure,
        without the tokens found in the stopword list.
    """
    # Build the set once; membership tests are then O(1) per token.
    stop_words = frozenset(stopwords.words(language))
    return [
        [[word for word in sentence if word not in stop_words] for sentence in document]
        for document in tokenized_corpus
    ]

# 5. Stemming
def stem_tokens(tokenized_corpus):
    """Reduce every token to its Porter stem.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list mirroring the input structure, with each token
        replaced by the output of ``PorterStemmer().stem``.
    """
    porter = PorterStemmer()
    result = []
    for doc in tokenized_corpus:
        stemmed_doc = []
        for sent in doc:
            stemmed_doc.append([porter.stem(token) for token in sent])
        result.append(stemmed_doc)
    return result

# 6. Lematización
def lemmatize_tokens(tokenized_corpus):
    """Replace every token with its WordNet lemma.

    Tokens are passed to ``WordNetLemmatizer().lemmatize`` without a
    part-of-speech argument, so the lemmatizer's default applies.

    Args:
        tokenized_corpus: Documents as nested lists
            (document -> sentence -> token string).

    Returns:
        A new nested list mirroring the input structure, holding the lemmas.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    result = []
    for doc in tokenized_corpus:
        lemmatized_doc = []
        for sent in doc:
            lemmatized_doc.append([wordnet_lemmatizer.lemmatize(token) for token in sent])
        result.append(lemmatized_doc)
    return result


¿Cómo usarlo?
Guarda este contenido en un archivo llamado preprocesamiento.py y luego en tu código principal usa:

In [None]:
# Import the pipeline functions by name instead of `import *`, so it is clear
# where each name comes from and the module's own imports (nltk, string, ...)
# do not leak into this namespace.
from preprocesamiento import (
    tokenize_corpus,
    lowercase_tokens,
    remove_punctuation,
    remove_stopwords,
    stem_tokens,
    lemmatize_tokens,
)

corpus = ["This is an example.", "NLP is fun!"]

# Cleaning stages, applied in order.
tokens = tokenize_corpus(corpus)
tokens = lowercase_tokens(tokens)
tokens = remove_punctuation(tokens)
tokens = remove_stopwords(tokens)

# Stemming and lemmatization are alternatives computed from the same cleaned tokens.
stemmed = stem_tokens(tokens)
lemmatized = lemmatize_tokens(tokens)
