##### Vectorization with Word2Vec Model

In [1]:
import json
import gensim
import numpy as np
import pandas as pd

def word2vec_data(input_file_path, output_file_path, embedding_model_path):
    """Average pre-trained Word2Vec vectors per document and save them as CSV.

    Loads a binary word2vec-format model, reads tokenized documents from a
    JSON file, and represents every document by the element-wise mean of the
    vectors of its tokens. Tokens that are not in the model's vocabulary are
    ignored. A document with no in-vocabulary tokens is represented by a zero
    vector of length ``vector_size``, so the output keeps one row per input
    document in the original order.

    The first rows of the resulting table are printed, and the full table is
    written to ``output_file_path`` as CSV without the index column.

    Args:
        input_file_path: Path to the JSON file holding the tokenized
            documents (iterated as a sequence of token sequences).
        output_file_path: Destination path of the CSV file.
        embedding_model_path: Path to the binary word2vec-format model file.

    Returns:
        None. The result is only printed and written to disk.
    """
    # Load pre-trained Word2Vec model
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        embedding_model_path, binary=True
    )

    # JSON text is UTF-8 by specification; without an explicit encoding the
    # platform default is used, which breaks non-ASCII tokens on some systems.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Hoisted once: the vocabulary mapping does not change between documents.
    vocabulary = word2vec_model.key_to_index

    # One row per document, in input order.
    embeddings = []
    for doc in data:
        # Keep only in-vocabulary tokens and fetch their vectors in one pass.
        doc_embeddings = [
            word2vec_model[token] for token in doc if token in vocabulary
        ]
        if doc_embeddings:
            embeddings.append(np.mean(doc_embeddings, axis=0))
        else:
            # No known token: fall back to zeros so the row is not dropped.
            embeddings.append(np.zeros(word2vec_model.vector_size))

    # Convert embeddings to DataFrame
    embeddings_df = pd.DataFrame(embeddings)

    # Print first few rows of the DataFrame
    print(embeddings_df.head())

    # Save embeddings to CSV file
    embeddings_df.to_csv(output_file_path, index=False)

# Example run: embed the tokenized documents with the pre-trained Google News
# vectors and write the per-document mean vectors to a CSV file.
embedding_model_path = "/Users/QuangAP/gensim/GoogleNews-vectors-negative300.bin"
input_file_path = "tokenized_300.json"
output_file_path = "word2vec_vectorized_300.csv"
word2vec_data(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    embedding_model_path=embedding_model_path,
)


        0         1         2         3         4         5         6    \
0 -0.171622  0.108064 -0.004670  0.135769 -0.061985  0.033696 -0.083035   
1 -0.173188  0.108872 -0.011226  0.149500 -0.060411  0.030855 -0.087352   
2 -0.161672  0.112374 -0.004053  0.143581 -0.054326  0.036501 -0.091227   
3 -0.164241  0.099705  0.003212  0.147389 -0.054037  0.040446 -0.089972   
4 -0.173503  0.111899 -0.009761  0.142923 -0.061620  0.028136 -0.087772   

        7         8         9    ...       290       291       292       293  \
0 -0.044343 -0.053279  0.027136  ...  0.072206 -0.002153 -0.096771  0.076867   
1 -0.038299 -0.051793  0.017950  ...  0.070757 -0.018967 -0.095223  0.087860   
2 -0.033716 -0.058064  0.023707  ...  0.076147 -0.021281 -0.104619  0.087541   
3 -0.038930 -0.050870  0.019841  ...  0.076469 -0.026876 -0.090544  0.083582   
4 -0.043422 -0.054692  0.019189  ...  0.071873 -0.011160 -0.101913  0.088150   

        294       295       296       297       298       299  
0 -0