Importaciones y recursos

In [None]:
pip install nltk



In [None]:
import nltk

Tokenización

In [None]:
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer models used by word_tokenize.
nltk.download('punkt')

sentence = "NLTK es una bibilioteca de procesamiento de lenguaje natural"
# The sample sentence is Spanish, so request the Spanish Punkt model instead of
# relying on the English default.
tokens = word_tokenize(sentence, language='spanish')
print(tokens)

['NLTK', 'es', 'una', 'bibilioteca', 'de', 'procesamiento', 'de', 'lenguaje', 'natural']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Derivación

In [None]:
from nltk.stem import PorterStemmer

# Reduce each sample word to its Porter stem and show the result.
stemmer = PorterStemmer()
words = ['running', 'plays', 'jumped']
stems = list(map(stemmer.stem, words))
print(stems)

['run', 'play', 'jump']


Etiquetado

In [None]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# The tagger model has to be available locally before pos_tag is called.
nltk.download('averaged_perceptron_tagger')

sentence = 'NLTK es una bibilioteca de procesamiento de lenguaje natural'
tokens = word_tokenize(sentence)
# NOTE(review): the sentence is Spanish but pos_tag is called without a language
# argument, and the recorded output tags several words as 'FW' — confirm that
# tagging with the default model is intended here.
tagged_words = pos_tag(tokens)
print(tagged_words)

[('NLTK', 'NNP'), ('es', 'CC'), ('una', 'JJ'), ('bibilioteca', 'NN'), ('de', 'IN'), ('procesamiento', 'FW'), ('de', 'FW'), ('lenguaje', 'FW'), ('natural', 'JJ')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


**Ejemplo básico de clasificación de texto utilizando el clasificador Naive Bayes de NLTK**

Importaciones

In [None]:
pip install scikit-learn



In [None]:
import nltk
import random
import nltk
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Descarga de recursos necesarios para lematización y eliminación de stopwords

In [None]:
# Resources needed by the preprocessing cells: tokenizer models for
# word_tokenize, the stopword lists, and WordNet for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): no POS tagging appears in the cells that follow — confirm whether
# this download is still needed in this part of the notebook.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Definición del conjunto de datos etiquetados

In [None]:
# Extra labelled reviews; each entry is a (sentence, label) pair whose label is
# either 'positive' or 'negative'. They are appended to `data` below.
additional_data = [
    ('The movie was a complete waste of time', 'negative'),
    ('Absolutely fantastic! Best movie ever!', 'positive'),
    ('I did not enjoy the movie', 'negative'),
    ('This movie was a masterpiece', 'positive'),
    ('I would not recommend this movie to anyone', 'negative'),
    ('The actors delivered outstanding performances', 'positive'),
    ('The script was poorly written', 'negative'),
    ('A visually stunning film', 'positive'),
    ('The movie was too long and boring', 'negative'),
    ('A very entertaining and engaging movie', 'positive'),
    ('The soundtrack was amazing', 'positive'),
    ('I was bored throughout the entire movie', 'negative'),
    ('A highly overrated film', 'negative'),
    ('The direction was top-notch', 'positive'),
    ('The movie was full of clichés', 'negative'),
    ('I was on the edge of my seat the whole time', 'positive')
]

# Original data plus the additional data
data = [
    ('I love this movie', 'positive'),
    ('This movie is terrible', 'negative'),
    ('This movie is great', 'positive'),
    ('I dislike this movie', 'negative'),
    ('This film is amazing', 'positive'),
    ('I can’t stand watching this movie', 'negative'),
    ('The acting in this movie is phenomenal', 'positive'),
    ('I regret wasting my time on this film', 'negative'),
    ('I thoroughly enjoyed this movie', 'positive'),
    ('This movie lacks depth and substance', 'negative'),
    ('The plot of the movie was captivating', 'positive'),
    ('I found the characters in this film to be very engaging', 'positive'),
    ('The special effects in the movie were impressive', 'positive'),
    ('The story line was predictable and unoriginal', 'negative'),
    ('I was disappointed by the lack of character development', 'negative'),
    ('The cinematography in this film was stunning', 'positive'),
    ('The dialogue felt forced and unnatural', 'negative'),
    ('The pacing of the movie was too slow for my liking', 'negative'),
    ('I was pleasantly surprised by how much I enjoyed this film', 'positive'),
    ('The ending left me feeling unsatisfied and confused', 'negative'),
    ('This movie exceeded my expectations', 'positive'),
    ('The performances by the actors were lackluster', 'negative')
] + additional_data

Función de preprocesamiento con eliminación de stopwords y lematización

In [None]:
def preprocess(text):
    """Turn a raw sentence into a bag-of-words feature dictionary.

    The text is tokenized; tokens that are not purely alphanumeric, or whose
    lower-cased form is an English stopword, are discarded. The remaining
    tokens are lower-cased and lemmatized.

    Args:
        text: The sentence to process.

    Returns:
        dict: Maps each remaining lemma to ``True``.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    features = {}
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        # Skip punctuation-like tokens and stopwords.
        if not raw_token.isalnum() or lowered in english_stopwords:
            continue
        features[wordnet.lemmatize(lowered)] = True
    return features

# Pair the preprocessed form of every sentence in `data` with its label; these
# (features, label) pairs are what the later cells shuffle and split.
featuresets = [(preprocess(text), label) for (text, label) in data]

Preprocesamiento de datos

In [None]:
def preprocess(text):
    """Normalize a raw sentence into a cleaned, space-separated string.

    The text is tokenized; tokens that are not purely alphanumeric, or whose
    lower-cased form is an English stopword, are discarded. The remaining
    tokens are lower-cased, lemmatized and joined with single spaces.

    NOTE(review): an earlier cell defines a function with the same name that
    returns a dict. Once this cell runs, ``preprocess`` refers to this
    string-returning version — confirm the shadowing is intended.

    Args:
        text: The sentence to process.

    Returns:
        str: The kept lemmas joined by spaces (empty string if none remain).
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    kept = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if raw_token.isalnum() and lowered not in english_stopwords:
            kept.append(wordnet.lemmatize(lowered))
    return ' '.join(kept)

Aplicación del preprocesamiento a los datos

In [None]:
# Preprocessed form of every sentence in `data`, paired with its label.
# NOTE(review): `preprocessed_data` is not read by any later cell shown here; the
# models are trained on slices of `featuresets` — confirm it is still needed.
preprocessed_data = [(preprocess(text), label) for (text, label) in data]

Mezcla de datos

In [None]:
# Seed the generator first so the shuffle, and therefore the train/test split
# taken from `featuresets`, is the same on every Restart & Run All.
random.seed(42)
random.shuffle(featuresets)

División de datos

In [None]:
# The first 75% of the examples are used for training, the remainder for testing.
split_index = int(len(featuresets) * 0.75)
train_set = featuresets[:split_index]
test_set = featuresets[split_index:]

Etiquetado de datos

In [None]:
# Unzip the (features, label) pairs into parallel tuples of inputs and targets.
train_X, train_y = zip(*train_set)
test_X, test_y = zip(*test_set)

Vectorización

In [None]:
# Vectorizer for the feature dictionaries, configured for dense output
# (sparse=False). This same instance is passed to every pipeline built in the
# later cells.
vectorizer = DictVectorizer(sparse=False)

Clasificador NaiveBayes

In [None]:
# NLTK's own Naive Bayes classifier, trained directly on the (features, label) pairs.
# NOTE(review): `classifier` is not referenced by any later cell shown here —
# confirm whether it should be evaluated alongside the scikit-learn models.
classifier = nltk.NaiveBayesClassifier.train(train_set)

Clasificador de árboles de decisión

In [None]:
# Decision tree on the vectorized features. random_state is pinned so that, for
# a given train/test split, the fitted tree and the accuracy printed below do
# not change from run to run.
dt_pipeline = make_pipeline(vectorizer, DecisionTreeClassifier(random_state=42))
dt_pipeline.fit(train_X, train_y)
dt_predictions = dt_pipeline.predict(test_X)
dt_accuracy = accuracy_score(test_y, dt_predictions)
print(f'Decision Tree Accuracy: {dt_accuracy}')

Decision Tree Accuracy: 0.5


Clasificador SVM

In [None]:
# Fit a default-parameter SVC on the vectorized features, then score it on the
# held-out examples.
svm_pipeline = make_pipeline(vectorizer, SVC())
svm_predictions = svm_pipeline.fit(train_X, train_y).predict(test_X)
svm_accuracy = accuracy_score(test_y, svm_predictions)
print(f'SVM Accuracy: {svm_accuracy}')

SVM Accuracy: 0.2


Función para calcular y mostrar las métricas

In [None]:
def evaluate_model(model, train_X, train_y, test_X, test_y, model_name="Model"):
    """Fit a model, predict on the test inputs and print its metrics.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        train_X: Training inputs passed to ``model.fit``.
        train_y: Training labels passed to ``model.fit``.
        test_X: Inputs passed to ``model.predict``.
        test_y: True labels the predictions are compared against.
        model_name: Prefix used in the printed lines.

    Returns:
        None. The accuracy and the classification report are printed.
    """
    # Train the model
    model.fit(train_X, train_y)

    # Make predictions
    predictions = model.predict(test_X)

    # Compute the accuracy
    accuracy = accuracy_score(test_y, predictions)
    print(f'{model_name} Accuracy: {accuracy}')

    # Compute and show the classification report. zero_division=0 keeps the
    # 0.00 reported for a class that is never predicted, but without the
    # undefined-metric warnings scikit-learn would otherwise emit for it.
    report = classification_report(test_y, predictions, zero_division=0)
    print(f'{model_name} Classification Report:\n{report}')

Predicción con NaiveBayes

In [None]:
# Multinomial Naive Bayes on the vectorized features; evaluate_model fits the
# pipeline and prints its accuracy and classification report.
nb_pipeline = make_pipeline(vectorizer, MultinomialNB())
evaluate_model(nb_pipeline, train_X, train_y, test_X, test_y, "Naive Bayes")

Naive Bayes Accuracy: 0.4
Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.25      1.00      0.40         2
    positive       1.00      0.25      0.40         8

    accuracy                           0.40        10
   macro avg       0.62      0.62      0.40        10
weighted avg       0.85      0.40      0.40        10



Predicción con árboles de decisión

In [None]:
# Fresh decision-tree pipeline evaluated with evaluate_model. random_state is
# pinned so that, for a given train/test split, the tree and its report do not
# change from run to run.
dt_pipeline = make_pipeline(vectorizer, DecisionTreeClassifier(random_state=42))
evaluate_model(dt_pipeline, train_X, train_y, test_X, test_y, "Decision Tree")

Decision Tree Accuracy: 0.5
Decision Tree Classification Report:
              precision    recall  f1-score   support

    negative       0.29      1.00      0.44         2
    positive       1.00      0.38      0.55         8

    accuracy                           0.50        10
   macro avg       0.64      0.69      0.49        10
weighted avg       0.86      0.50      0.53        10



Predicción con SVM

In [None]:
# Default-parameter SVC on the vectorized features; evaluate_model fits the
# pipeline and prints its accuracy and classification report.
svm_pipeline = make_pipeline(vectorizer, SVC())
evaluate_model(svm_pipeline, train_X, train_y, test_X, test_y, "SVM")

SVM Accuracy: 0.2
SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.20      1.00      0.33         2
    positive       0.00      0.00      0.00         8

    accuracy                           0.20        10
   macro avg       0.10      0.50      0.17        10
weighted avg       0.04      0.20      0.07        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Función para predecir la etiqueta de un nuevo texto con todos los modelos

In [None]:
def classify_new_texts(new_texts):
    """Predict and print a label for each text with all three pipelines.

    Every text is preprocessed and turned into a ``{word: True}`` feature
    dictionary, the same layout as the dictionaries in ``featuresets`` that
    the pipelines are trained on, and then passed to ``nb_pipeline``,
    ``dt_pipeline`` and ``svm_pipeline``.

    Args:
        new_texts: Iterable of raw review strings.

    Returns:
        None. One block of predictions per text is printed.
    """
    new_texts = list(new_texts)  # traversed twice below, so materialize any iterable
    if not new_texts:
        # Nothing to classify; do not ask the pipelines to predict on an empty batch.
        return

    # Build one feature dictionary per text. The previous version wrapped the
    # whole cleaned sentence as {'text': sentence}, which shares no keys with
    # the word-level dictionaries used for training, so the models never saw
    # any of the words in the new text.
    feature_dicts = []
    for text in new_texts:
        processed = preprocess(text)
        if isinstance(processed, dict):
            # preprocess already produced word -> True features.
            feature_dicts.append(processed)
        else:
            # preprocess produced a space-separated string of cleaned tokens.
            feature_dicts.append({word: True for word in processed.split()})

    # Classify with each model
    nb_predicted_labels = nb_pipeline.predict(feature_dicts)
    dt_predicted_labels = dt_pipeline.predict(feature_dicts)
    svm_predicted_labels = svm_pipeline.predict(feature_dicts)

    # Print the results
    for new_text, nb_label, dt_label, svm_label in zip(new_texts, nb_predicted_labels, dt_predicted_labels, svm_predicted_labels):
        print(f'\nNew Text: "{new_text}"')
        print(f'Naive Bayes Predicted label: {nb_label}')
        print(f'Decision Tree Predicted label: {dt_label}')
        print(f'SVM Predicted label: {svm_label}')

Lista de nuevos comentarios de películas

In [None]:
# Unlabelled sample reviews to run through the trained pipelines.
new_texts = [
    "The movie was fantastic and full of surprises.",
    "I hated every minute of this film.",
    "It was an average movie with a decent plot.",
    "The special effects were top-notch, but the story was weak.",
    "I would recommend this movie to my friends.",
    "The film was boring and too long.",
    "A masterpiece of modern cinema.",
    "The actors did a terrible job."
]

In [None]:
# Print each pipeline's predicted label for every sample review.
classify_new_texts(new_texts)


New Text: "The movie was fantastic and full of surprises."
Naive Bayes Predicted label: negative
Decision Tree Predicted label: negative
SVM Predicted label: negative

New Text: "I hated every minute of this film."
Naive Bayes Predicted label: negative
Decision Tree Predicted label: negative
SVM Predicted label: negative

New Text: "It was an average movie with a decent plot."
Naive Bayes Predicted label: negative
Decision Tree Predicted label: negative
SVM Predicted label: negative

New Text: "The special effects were top-notch, but the story was weak."
Naive Bayes Predicted label: negative
Decision Tree Predicted label: negative
SVM Predicted label: negative

New Text: "I would recommend this movie to my friends."
Naive Bayes Predicted label: negative
Decision Tree Predicted label: negative
SVM Predicted label: negative

New Text: "The film was boring and too long."
Naive Bayes Predicted label: negative
Decision Tree Predicted label: negative
SVM Predicted label: negative

New Text: