#### Step 1: Convert CSV to JSON Files

In [1]:
import csv
import json
import uuid
import os

# Input CSV of raw hotel reviews, and the directory that receives the
# per-review JSON files written by process_reviews.
raw_reviews_file = "../data/raw/hotel_reviews_1000.csv"
transformed_dir = "../data/transformed"

# Raw physical lines of the CSV, kept for ad-hoc inspection (process_reviews
# re-opens the file itself and does not use this list). Reading inside a
# context manager closes the handle promptly, and the explicit encoding
# matches the one process_reviews uses for the same file.
with open(raw_reviews_file, "r", encoding="utf-8") as raw_file:
    raw_reviews = raw_file.readlines()

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(transformed_dir, exist_ok=True)

def process_reviews(file_path, output_dir=None):
    """Convert every data row of a reviews CSV into its own JSON file.

    The first CSV record is treated as the header. For each following record
    a file named ``review_<n>.json`` (``n`` counts from 1) is written,
    containing the mapped fields plus a freshly generated UUID4 under ``id``.

    Args:
        file_path: Path to the source CSV file.
        output_dir: Directory that receives the JSON files. When omitted, the
            notebook-level ``transformed_dir`` is used. The directory is
            created if it does not exist.

    Returns:
        None. Progress is reported with one printed line per record.

    Notes:
        * A column missing from the header produces a warning and the
          corresponding key is left out of every JSON document.
        * A row that is shorter than the header gets ``""`` for the fields
          it lacks.
        * An empty file produces a warning and no output.
        * Ids are random, so re-running overwrites the files with new ids.
    """
    if output_dir is None:
        # Fall back to the notebook-level setting.
        output_dir = transformed_dir
    os.makedirs(output_dir, exist_ok=True)

    # Output field name -> column name as it appears in the CSV header.
    column_mapping = {
        'dateAdded': 'dateAdded',
        'city': 'city',
        'hotel_name': 'name',
        'hotel_state': 'province',
        'review_text': 'reviews.text',
        'review_title': 'reviews.title'
    }

    # 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading BOM, which
    # would otherwise be glued onto the first header name.
    with open(file_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
        # A single reader consumes both the header and the data rows, so a
        # header containing quoted line breaks is still skipped as one record.
        reader = csv.reader(csvfile)

        header = next(reader, None)
        if header is None:
            print(f"Warning: '{file_path}' is empty. Nothing to process.")
            return

        # Resolve each required column to its position in the header.
        column_indices = {}
        for expected_name, actual_name in column_mapping.items():
            try:
                column_indices[expected_name] = header.index(actual_name)
            except ValueError:
                print(f"Warning: Column '{actual_name}' not found in the CSV. Some data may be missing.")

        for i, row in enumerate(reader, start=1):
            # Short rows are padded with empty strings instead of raising.
            review_json = {
                key: (row[index] if index < len(row) else "")
                for key, index in column_indices.items()
            }

            # Generate a unique identifier
            review_json['id'] = str(uuid.uuid4())

            print(f"processed record [{i}] with id [{review_json['id']}]")

            output_path = os.path.join(output_dir, f"review_{i}.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(review_json, f, indent=2)
            
# Run the conversion on the raw reviews CSV; each data row is written to
# transformed_dir as review_<n>.json and one progress line is printed per row.
process_reviews(raw_reviews_file)

processed record [1] with id [063db5f0-56ce-4d77-9b35-d0d59989a3d9]
processed record [2] with id [345ed21d-a773-4cca-98df-e163535c50d4]
processed record [3] with id [2dd86a03-70b9-418d-a75f-5f6e5417db32]
processed record [4] with id [ce4fde9e-36ac-4a6d-b156-539110084cb1]
processed record [5] with id [3faf4283-7ae1-4e4d-b9de-abcccf9f57ce]
processed record [6] with id [3ef0b90b-f84e-45ad-94d5-4d655adcb7a9]
processed record [7] with id [0584f5dd-075f-44e2-b899-35315ff379ca]
processed record [8] with id [46c8d590-aa6f-4900-8aaf-7453ea22cc83]
processed record [9] with id [bd70a98b-79bb-4579-b9e4-6128e11b7302]
processed record [10] with id [782acfec-7ddf-48b9-89da-774c3802bbba]
processed record [11] with id [d6893f8e-dd6d-47e5-a5bb-72e1f04d8446]
processed record [12] with id [369c901c-970f-485f-8675-c4a62b9a2d80]
processed record [13] with id [6b5b24e3-8279-4928-bd8c-47d21bc739c8]
processed record [14] with id [2307ce90-a2ad-4289-a680-9d6a80182a00]
processed record [15] with id [dbf76642-c9d

#### Step 2: Create Embeddings for each of the JSON Files

In [2]:
import os

# Point SHELL at a shell that actually exists on this machine. zsh stays the
# first choice; bash and sh are fallbacks for systems without zsh. If none of
# the candidates exists, SHELL is left untouched.
_shell_candidates = ('/bin/zsh', '/bin/bash', '/bin/sh')
_shell = next((path for path in _shell_candidates if os.path.exists(path)), None)
if _shell is not None:
    os.environ['SHELL'] = _shell


In [3]:
# Install python-dotenv and openai, which the next cell imports. %pip (rather
# than !pip) targets the running kernel's environment; -q keeps pip quiet.
# NOTE(review): versions are unpinned; consider pinning for reproducibility.
%pip install -q python-dotenv openai

Note: you may need to restart the kernel to use updated packages.


In [4]:
from dotenv import load_dotenv
from openai import OpenAI

# Load settings via python-dotenv before constructing the client
# (presumably this is where the OpenAI API key comes from; confirm).
load_dotenv()

client = OpenAI()

# Smoke test: embed one short string to confirm the client is usable.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="Hello world",
)

# Show the vector's length first, then the vector itself.
sample_embedding = response.data[0].embedding
print(len(sample_embedding))
print(sample_embedding)

1536
[-0.002153123263269663, -0.04904741793870926, 0.020912760868668556, 0.03131536394357681, -0.04529817774891853, -0.026352230459451675, -0.028933674097061157, 0.06032586470246315, -0.025722235441207886, -0.0148509806022048, 0.01540414709597826, -0.030086103826761246, -0.020359596237540245, -0.03340510278940201, 0.025829795747995377, 0.01422098558396101, -0.07006774097681046, 0.012400145642459393, 0.014797200448811054, 0.04886302724480629, 0.020774470642209053, -0.008896758779883385, -0.015165978111326694, -0.016579624265432358, 0.02592199109494686, -0.0028349775820970535, -0.024323955178260803, 0.024277856573462486, 0.0018390860641375184, -0.05574687570333481, 0.023094695061445236, -0.0454825684428215, -0.008681639097630978, 0.0031288473401218653, 0.00451368372887373, 0.0018035528482869267, 0.026690276339650154, 0.010179798118770123, -0.012000637128949165, -0.011508933268487453, -0.014912443235516548, -0.02315615862607956, 0.025399556383490562, 0.036785561591386795, -0.0355255715548

In [5]:
import os
import json

# Directory the review JSON files are read from, and the directory that
# receives the copies with an added 'embedding' field.
transformed_dir = "../data/transformed"
embedded_dir = "../data/embedded"

# exist_ok makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(embedded_dir, exist_ok=True)
    
def prepare_embedding_str(review_json):
    """Build the text that is sent to the embedding model for one review.

    Args:
        review_json: Mapping that may contain the keys 'review_title',
            'review_text', 'hotel_name', 'city' and 'hotel_state'. A key that
            is absent contributes an empty string instead of raising
            KeyError, because the CSV conversion step leaves out fields
            whose column was not found.

    Returns:
        A single-line string of the form
        ``REVIEW_TITLE: ... REVIEW_TEXT: ... HOTEL_NAME: ... HOTEL_CITY: ... HOTEL_STATE: ...``.
    """
    def field(key):
        # Missing fields are rendered as empty text.
        return review_json.get(key, "")

    return f"REVIEW_TITLE: {field('review_title')} REVIEW_TEXT: {field('review_text')} HOTEL_NAME: {field('hotel_name')} HOTEL_CITY: {field('city')} HOTEL_STATE: {field('hotel_state')}"
    
client = OpenAI()

# Embed every review JSON file found in transformed_dir and write a copy with
# an added 'embedding' field to embedded_dir under the same file name.
# sorted() makes the processing order reproducible; the suffix filter skips
# stray directory entries that json.load could not parse.
for file in sorted(os.listdir(transformed_dir)):
    if not file.endswith(".json"):
        continue

    # Read the review and close the input file before the network call.
    with open(os.path.join(transformed_dir, file), "r", encoding="utf-8") as source_file:
        review = json.load(source_file)

    embedding_str = prepare_embedding_str(review)
    response = client.embeddings.create(
        input=embedding_str,
        model="text-embedding-3-small"
    )

    review['embedding'] = response.data[0].embedding

    # A separate handle name avoids shadowing the input file's handle.
    with open(os.path.join(embedded_dir, file), "w", encoding="utf-8") as target_file:
        json.dump(review, target_file, indent=2)