##### Vectorization - TFIDF Model

In [4]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_tfidf(input_file_path, output_file_path, max_features=5000):
    """Fit a TF-IDF model on a JSON corpus and write the document-term matrix to CSV.

    Parameters
    ----------
    input_file_path : str or path-like
        JSON file containing a list of documents. Each document is normally a
        single string; a document stored as a list of tokens is joined with
        single spaces before being vectorized.
    output_file_path : str or path-like
        Destination CSV. One row per document and one column per vocabulary
        term (column names come from ``get_feature_names_out()``). The row
        index is not written.
    max_features : int, default 5000
        Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    None
        The result is only written to ``output_file_path``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        tokenized_data = json.load(file)

    # The vectorizer is used with its default (string-based) analyzer, so a
    # document that arrives as a token list is flattened back into one string.
    # Documents that are already strings are passed through untouched.
    if isinstance(tokenized_data, list):
        tokenized_data = [
            ' '.join(map(str, doc)) if isinstance(doc, list) else doc
            for doc in tokenized_data
        ]

    # Initialize the TF-IDF Vectorizer and fit it on the whole corpus
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(tokenized_data)

    # toarray() yields a plain ndarray (todense() would give np.matrix).
    # NOTE: this materializes the full documents x features matrix in memory.
    tfidf_dense = tfidf_matrix.toarray()

    # Use the learned vocabulary terms as column headers
    feature_names = vectorizer.get_feature_names_out()

    # Save as a DataFrame
    df_tfidf = pd.DataFrame(tfidf_dense, columns=feature_names)
    df_tfidf.to_csv(output_file_path, index=False)

# Example usage: both paths are relative to the notebook's working directory.
input_file_path = 'tokenizedFine_300.json'
output_file_path_csv = 'vectorized_tfidf_300.csv'
# max_features is left at the function's default.
vectorize_tfidf(
    input_file_path=input_file_path,
    output_file_path=output_file_path_csv,
)


##### Vectorization - Word2Vec Model

In [None]:
import json
import os

import gensim
import numpy as np
import pandas as pd

def vectorize_word2vec(input_file_path, output_file_path, embedding_model_path, binary=True):
    """Embed each tokenized document as the mean of its Word2Vec token vectors.

    Parameters
    ----------
    input_file_path : str or path-like
        JSON file containing a list of documents, each an iterable of tokens.
    output_file_path : str or path-like
        Destination CSV. One row per document, one column per embedding
        dimension; the row index is not written.
    embedding_model_path : str or path-like
        Pre-trained vectors in word2vec format, loaded with
        ``gensim.models.KeyedVectors.load_word2vec_format``.
    binary : bool, default True
        Forwarded to ``load_word2vec_format``. Keep ``True`` for ``.bin``
        files; pass ``False`` for the plain-text word2vec format.

    Returns
    -------
    None
        The embeddings are written to ``output_file_path``; the first rows
        are also printed as a quick sanity check.
    """
    # Load pre-trained Word2Vec model
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        embedding_model_path, binary=binary
    )

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Both lookups are the same for every document, so resolve them once.
    vocabulary = word2vec_model.key_to_index
    vector_size = word2vec_model.vector_size

    embeddings = []
    for doc in data:
        # Out-of-vocabulary tokens are skipped; they contribute nothing to the mean.
        token_vectors = [word2vec_model[token] for token in doc if token in vocabulary]
        if token_vectors:
            embeddings.append(np.mean(token_vectors, axis=0))
        else:
            # No known token in this document: fall back to an all-zero vector
            # so the output keeps one row per input document.
            embeddings.append(np.zeros(vector_size))

    # Convert embeddings to DataFrame
    embeddings_df = pd.DataFrame(embeddings)

    # Print first few rows of the DataFrame
    print(embeddings_df.head())

    # Save embeddings to CSV file
    embeddings_df.to_csv(output_file_path, index=False)

# Example usage:
input_file_path = "tokenizedClean_300.json"
output_file_path = "vectorized_word2vec_300.csv"
# The pre-trained vectors sit at a machine-specific location. Set the
# WORD2VEC_MODEL_PATH environment variable to point at your own copy; the
# original author's path is kept as the fallback so existing runs are unchanged.
embedding_model_path = os.environ.get(
    "WORD2VEC_MODEL_PATH",
    "/Users/QuangAP/gensim/GoogleNews-vectors-negative300.bin",
)
vectorize_word2vec(input_file_path, output_file_path, embedding_model_path)

##### Vectorization - BERT-Based Model

In [None]:
import json
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

def vectorize_bert(input_file_path, output_file_path):
    """Embed each tokenized document with ``bert-base-uncased`` and save the result as CSV.

    Every document is encoded on its own; its embedding is the mean of the
    model's last hidden state over all sequence positions.

    Parameters
    ----------
    input_file_path : str or path-like
        JSON file containing a list of documents, each a list of word tokens
        (they are passed to the tokenizer with ``is_split_into_words=True``).
    output_file_path : str or path-like
        Destination CSV. One row per document, one column per hidden
        dimension; the row index is not written.

    Returns
    -------
    None
        The embeddings are only written to ``output_file_path``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        tokenized_texts = json.load(file)

    # Initialize BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Ensure the model is in evaluation mode
    model.eval()

    def get_bert_embedding(tokens):
        """Return a 1-D numpy vector for one document's token list."""
        if not tokens:
            # Nothing to encode. Emit a zero vector so the output still has
            # one row per document (same convention as the Word2Vec pipeline).
            return torch.zeros(model.config.hidden_size).numpy()

        # truncation=True is requested, so over-long documents are cut by the
        # tokenizer rather than rejected.
        inputs = tokenizer(tokens, return_tensors='pt', is_split_into_words=True, padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)

        # Average across the sequence axis, then drop only the batch axis
        # (a single document is encoded per call).
        sentence_embedding = outputs.last_hidden_state.mean(dim=1)
        return sentence_embedding.squeeze(0).numpy()

    # Process each tokenized text and get embeddings
    embeddings = [get_bert_embedding(tokens) for tokens in tokenized_texts]

    # Convert embeddings to DataFrame and save as CSV
    df = pd.DataFrame(embeddings)
    df.to_csv(output_file_path, index=False)

# Example usage: both paths are relative to the notebook's working directory.
input_file_path = 'tokenizedClean_300.json'
output_file_path = 'vectorized_bert_300.csv'
vectorize_bert(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)
