# Importamos las librerías que nos sean necesarias

In [None]:
import gensim
from gensim.models import Word2Vec
import numpy as np

# Generamos el supuesto conjunto de datos

In [None]:
# Toy labelled corpus: each entry is a (sentence, animal label) pair.
# The labels used are "dog", "cat" and "bird", with four sentences each.
data = [
    ("A dog barks at night", "dog"),
    ("The cat sits on the mat", "cat"),
    ("Birds fly south in the winter", "bird"),
    ("My dog plays with a ball", "dog"),
    ("I saw two birds on the tree", "bird"),
    ("Cats love to chase mice", "cat"),
    ("Birds build nests", "bird"),
    ("A cat has nine lives", "cat"),
    ("Dogs are man's best friend", "dog"),
    ("The eagle is a bird of prey", "bird"),
    ("That cat looks like a tiger", "cat"),
    ("A dog can smell hundreds of times better than a human", "dog")
]


# Preparamos los datos para Word2Vec y entrenamos el modelo

In [None]:
# Prepare the data for Word2Vec: only the sentences are needed, not the labels.
# Each sentence is tokenized with gensim's simple_preprocess.
sentences = [gensim.utils.simple_preprocess(text) for text, _label in data]

# Train a Word2Vec model.
# Training is stochastic: a fixed seed plus a single worker thread keeps the
# embeddings (and every downstream result) stable across kernel restarts.
# Bit-for-bit determinism may additionally require a fixed PYTHONHASHSEED
# (see the gensim Word2Vec documentation).
model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    workers=1,
    seed=42,
)

def sentence_embedding(sentence):
    """Embed a sentence as the mean of its word vectors.

    The sentence is tokenized with ``gensim.utils.simple_preprocess`` and each
    token found in the vocabulary of the module-level ``model`` is looked up;
    tokens that are not in the vocabulary are skipped.

    Args:
        sentence: Raw text to embed.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``: the element-wise
        mean of the known tokens' vectors, or an all-zero vector when none of
        the tokens is in the vocabulary.
    """
    tokens = gensim.utils.simple_preprocess(sentence)
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not known_vectors:
        # Use the dtype of the trained vectors so the fallback matches the
        # averaged embeddings instead of NumPy's default float64.
        return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)
    return np.mean(known_vectors, axis=0)

# Build the feature matrix: one averaged embedding per sentence in `data`.
X = np.array([sentence_embedding(text) for text, _label in data])

# Creamos el Modelo de Clasificación

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Encode the categorical labels as integers.
le = LabelEncoder()
y = le.fit_transform([animal for sentence, animal in data])

# Split the data into training and test sets.
# stratify=y keeps the class proportions equal in both splits; with only 12
# samples an unstratified split can leave a class out of the 3-sample test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Train a linear SVM classifier.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out sentences.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.3333333333333333


# Usamos el modelo

In [None]:
# Unseen sentences to run through the trained pipeline.
new_sentences = [
    "The penguin swims in the sea",
    "Look at that flying bird",
    "My kitten is adorable",
]

# Embed every sentence, predict its encoded class, then map the class ids
# back to the original label names.
new_embeddings = np.array(list(map(sentence_embedding, new_sentences)))
new_predictions = le.inverse_transform(clf.predict(new_embeddings))

for sentence, category in zip(new_sentences, new_predictions):
    print(f"Sentence: {sentence} - Predicted Category: {category}")


Sentence: The penguin swims in the sea - Predicted Category: bird
Sentence: Look at that flying bird - Predicted Category: dog
Sentence: My kitten is adorable - Predicted Category: dog
