### 1. Import Libraries and Load Data

In [1]:
import numpy as np
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

#### Load Word2Vec model

In [9]:
word2vec_model = gensim.models.Word2Vec.load('data/word2vec_model.bin')

#### Load your conversation pairs

In [3]:
conversation_pairs_df = pd.read_csv('data/processed_conversation_pairs.csv')
conversation_pairs_df['input'] = conversation_pairs_df['input'].fillna('')
conversation_pairs_df['output'] = conversation_pairs_df['output'].fillna('')

#### Process to get vectors (from Step 2 & Step 3 above)

In [6]:
def text_to_vector(text, model, vector_size=100):
    """
    Convert a sentence into a vector by averaging the word vectors.
    Args:
        text (str): The input sentence.
        model (gensim.models.Word2Vec): The trained Word2Vec model.
        vector_size (int): The dimension of the word vectors.
    Returns:
        np.array: The averaged word vector for the sentence.
    """
    # Ensure that the input text is a string
    if not isinstance(text, str):
        text = str(text)  # Convert to string if not already a string

    words = text.split()
    vectors = []

    for word in words:
        if word in model.wv:
            vectors.append(model.wv[word])

    # If no vectors found, return a zero vector (you can also return a random vector or the mean vector)
    if len(vectors) == 0:
        return np.zeros(vector_size)

    # Compute the average of all word vectors
    sentence_vector = np.mean(vectors, axis=0)
    return sentence_vector

In [7]:
input_vectors = [text_to_vector(text, word2vec_model) for text in conversation_pairs_df['input']]
output_vectors = [text_to_vector(text, word2vec_model) for text in conversation_pairs_df['output']]

input_vectors = np.array(input_vectors)
output_vectors = np.array(output_vectors)

### 2. Split Data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(input_vectors, output_vectors, test_size=0.2, random_state=42)

### 3. Build the Model

In [11]:
model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=42)

### 4. Train the Model

In [12]:
model.fit(X_train, y_train)

ValueError: Multioutput target data is not supported with label binarization