### Process Overview:
- Inputs: preprocessed_100.json

- Outputs: vectorized_100.csv 

Note: The output may alternatively be saved as a NumPy array in `.npy` format (e.g. 'vectorized_200.npy') by passing `vector_type='npy'`.

In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorized_corpus(input_file_path, output_file_path, vector_type='csv', max_features=5000):
    """Fit a TF-IDF model on a JSON corpus and write the dense matrix to disk.

    Args:
        input_file_path: Path to a JSON file holding the corpus. The contents
            are passed straight to ``TfidfVectorizer.fit_transform``, so the
            file is assumed to contain a list of strings (one per document)
            -- TODO confirm against the upstream preprocessing step.
        output_file_path: Destination path for the vectorized output.
        vector_type: Output format. ``'csv'`` writes a table with one column
            per feature name (no index column); ``'npy'`` writes the matrix
            with ``np.save``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Raises:
        ValueError: If ``vector_type`` is neither ``'csv'`` nor ``'npy'``.
    """
    # Reject an unsupported format up front: otherwise the whole corpus would
    # be vectorized and then silently discarded without writing any output.
    if vector_type not in ('csv', 'npy'):
        raise ValueError(
            f"Unsupported vector_type {vector_type!r}; expected 'csv' or 'npy'"
        )

    # Load the corpus. JSON text is UTF-8 by specification, so do not rely on
    # the platform's default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        tokenized_data = json.load(file)

    # Fit the TF-IDF model and transform the corpus in one pass.
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(tokenized_data)

    # Both output formats need a dense array. toarray() yields a plain
    # ndarray, whereas todense() yields the discouraged np.matrix type.
    tfidf_dense = tfidf_matrix.toarray()

    if vector_type == 'csv':
        # Label each column with the term it represents.
        feature_names = vectorizer.get_feature_names_out()
        df_tfidf = pd.DataFrame(tfidf_dense, columns=feature_names)
        df_tfidf.to_csv(output_file_path, index=False)
    else:
        # vector_type == 'npy': save the raw matrix as a NumPy array.
        np.save(output_file_path, tfidf_dense)

# Example usage: vectorize the same corpus once per supported output format.
input_file_path = 'tokenized_300.json'
output_file_path_csv = 'tfidf_vectorized_300.csv'
output_file_path_npy = 'tfidf_vectorized_300.npy'
for _target_path, _target_format in (
    (output_file_path_csv, 'csv'),
    (output_file_path_npy, 'npy'),
):
    vectorized_corpus(input_file_path, _target_path, vector_type=_target_format)
