# 1. Vector Embedding

In [8]:
import pandas as pd
import numpy as np
import re
import time
from sentence_transformers import SentenceTransformer
import logging
import json
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Configure logging.
# force=True replaces any handlers already attached to the root logger. Without
# it, basicConfig does nothing when the kernel has configured logging before
# (e.g. this cell is re-run), and the format below is silently ignored.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s", force=True)

def remove_emojis(text):
    """Return ``text`` with every disallowed character deleted.

    Only word characters (letters, digits, underscore), whitespace and commas
    are kept. Despite the name, this strips far more than emojis: apostrophes,
    periods, currency signs and all other punctuation are removed as well
    (e.g. "I'm" becomes "Im").
    """
    disallowed = re.compile(r'[^\w\s,]')
    return disallowed.sub('', text)

def preprocess_text(df):
    """Clean the review text and keep only reviews with at least 10 words.

    Steps, in order:
      1. Turn the incoming index into a column and rename it to ``id``.
      2. Drop rows whose ``review_text`` is missing.
      3. Add ``cleaned_text`` by passing ``review_text`` through ``remove_emojis``.
      4. Keep rows whose ``cleaned_text`` has 10 or more whitespace-separated
         tokens, then renumber the index from 0.

    The caller's frame is not modified; a new DataFrame is returned.
    """
    logging.info("Starting text preprocessing")
    start = time.time()

    # Chain the non-mutating steps so no intermediate frame stays referenced.
    prepared = (
        df.reset_index()
        .rename(columns={"index": "id"})  # Create unique IDs
        .dropna(subset=['review_text'])
    )
    prepared['cleaned_text'] = prepared['review_text'].apply(remove_emojis)

    def has_enough_words(cleaned):
        return len(cleaned.split()) >= 10

    long_enough = prepared['cleaned_text'].apply(has_enough_words)
    result = prepared[long_enough].reset_index(drop=True)

    logging.info(f"Text preprocessing completed in {time.time() - start:.2f} seconds")
    return result

def embed_text_and_save(df, model, batch_size=1000):
    """Embed every row's ``cleaned_text`` and write the results as JSON batch files.

    Rows are encoded ``batch_size`` at a time with a single ``model.encode``
    call per batch instead of one call per row. Each batch is written to its
    own ``embeddings_batch_<n>.json`` (``n`` starts at 1) in the current
    working directory, as one JSON array followed by a newline. Files are
    opened in write mode, so re-running after an interruption replaces earlier
    output instead of appending a duplicate array. The final partial batch
    gets its own file rather than sharing one with the previous batch.

    Args:
        df: DataFrame providing the ``id`` and ``cleaned_text`` columns.
        model: Object whose ``encode(list_of_texts, show_progress_bar=False)``
            returns one vector per input text (e.g. a SentenceTransformer).
        batch_size: Number of rows per encode call and per output file.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    logging.info("Starting embedding")
    start = time.time()

    total_rows = len(df)
    logging.info(f"Total rows to embed: {total_rows}")

    # Pull the two needed columns out once; .tolist() yields plain Python
    # values, which keeps the ids JSON-serializable.
    ids = df['id'].tolist()
    texts = df['cleaned_text'].tolist()

    for batch_number, offset in enumerate(range(0, total_rows, batch_size), start=1):
        batch_ids = ids[offset:offset + batch_size]
        batch_texts = texts[offset:offset + batch_size]

        # One encode call for the whole batch.
        vectors = np.asarray(model.encode(batch_texts, show_progress_bar=False)).tolist()

        records = [
            {'id': row_id, 'text': text, 'embedding': vector}
            for row_id, text, vector in zip(batch_ids, batch_texts, vectors)
        ]

        file_name = f"embeddings_batch_{batch_number}.json"
        with open(file_name, 'w') as f:
            json.dump(records, f)
            f.write('\n')  # Keep the one-JSON-document-per-line layout
        logging.info(f"Saved {offset + len(records)} embeddings to {file_name}")

    logging.info(f"Embedding completed in {time.time() - start:.2f} seconds")

def main():
    """Run the embedding pipeline: load the CSV, preprocess it, embed, and save.

    Configuration is read from the environment:
      * ``DATASET_PATH`` - path of the CSV passed to ``pd.read_csv``.
      * ``EMBEDDING_MODEL_PATH`` - name or path passed to ``SentenceTransformer``.

    Raises:
        RuntimeError: If either variable is unset or empty. Both are checked
            before any work starts, so a misconfigured environment fails
            immediately rather than after the dataset load and preprocessing.
    """
    dataset_path = os.getenv("DATASET_PATH")
    model_name = os.getenv("EMBEDDING_MODEL_PATH")

    required = {"DATASET_PATH": dataset_path, "EMBEDDING_MODEL_PATH": model_name}
    missing = [name for name, value in required.items() if not value]
    if missing:
        raise RuntimeError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    logging.info("Loading dataset")
    start = time.time()
    df = pd.read_csv(dataset_path)
    logging.info(f"Dataset loaded in {time.time() - start:.2f} seconds")

    df = preprocess_text(df)
    model = SentenceTransformer(model_name)
    embed_text_and_save(df, model)

# Start the pipeline only when this code runs as the top-level module
# (which includes executing the notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


2024-10-30 15:53:36,580 - INFO - Loading dataset
2024-10-30 15:54:10,070 - INFO - Dataset loaded in 33.49 seconds
2024-10-30 15:54:10,081 - INFO - Starting text preprocessing
2024-10-30 15:54:54,965 - INFO - Text preprocessing completed in 44.88 seconds
2024-10-30 15:54:55,800 - INFO - Use pytorch device_name: mps
2024-10-30 15:54:55,801 - INFO - Load pretrained SentenceTransformer: /Users/muhamadsyukron/Main Folder/Mac 2023 Files/Mekari/chatbot_project/models/all-MiniLM-L6-v2
2024-10-30 15:54:57,020 - INFO - Starting embedding
2024-10-30 15:54:57,032 - INFO - Total rows to embed: 1250338
2024-10-30 15:55:27,768 - INFO - Saved 1000 embeddings to embeddings_batch_1.json
2024-10-30 15:55:53,099 - INFO - Saved 2000 embeddings to embeddings_batch_2.json
2024-10-30 15:56:18,641 - INFO - Saved 3000 embeddings to embeddings_batch_3.json
2024-10-30 15:56:44,155 - INFO - Saved 4000 embeddings to embeddings_batch_4.json
2024-10-30 15:57:09,490 - INFO - Saved 5000 embeddings to embeddings_batch_5

KeyboardInterrupt: 

# 2. Inspect the DataFrame after Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
import time
from sentence_transformers import SentenceTransformer
import logging
import json
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Configure logging.
# force=True replaces any handlers already attached to the root logger. Without
# it, basicConfig does nothing when logging was configured earlier in the same
# kernel, and the format below is silently ignored.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s", force=True)

def remove_emojis(text):
    """Return ``text`` with every disallowed character deleted.

    Only word characters (letters, digits, underscore), whitespace and commas
    are kept. Despite the name, this strips far more than emojis: apostrophes,
    periods, currency signs and all other punctuation are removed as well
    (e.g. "I'm" becomes "Im").
    """
    disallowed = re.compile(r'[^\w\s,]')
    return disallowed.sub('', text)

def preprocess_text(df):
    """Clean the review text and keep only reviews with at least 10 words.

    Steps, in order:
      1. Turn the incoming index into a column and rename it to ``id``.
      2. Drop rows whose ``review_text`` is missing.
      3. Add ``cleaned_text`` by passing ``review_text`` through ``remove_emojis``.
      4. Keep rows whose ``cleaned_text`` has 10 or more whitespace-separated
         tokens, then renumber the index from 0.

    The caller's frame is not modified; a new DataFrame is returned.
    """
    logging.info("Starting text preprocessing")
    start = time.time()

    # Chain the non-mutating steps so no intermediate frame stays referenced.
    prepared = (
        df.reset_index()
        .rename(columns={"index": "id"})  # Create unique IDs
        .dropna(subset=['review_text'])
    )
    prepared['cleaned_text'] = prepared['review_text'].apply(remove_emojis)

    def has_enough_words(cleaned):
        return len(cleaned.split()) >= 10

    long_enough = prepared['cleaned_text'].apply(has_enough_words)
    result = prepared[long_enough].reset_index(drop=True)

    logging.info(f"Text preprocessing completed in {time.time() - start:.2f} seconds")
    return result


# Read the CSV named by the DATASET_PATH environment variable and log how long
# the load took. os.getenv returns None when the variable is unset.
dataset_path = os.getenv("DATASET_PATH")
start = time.time()
df = pd.read_csv(dataset_path)
logging.info(f"Dataset loaded in {time.time() - start:.2f} seconds")
# Rebind df to the preprocessed frame (adds "id" and "cleaned_text" and drops
# rows with missing or too-short reviews); the raw frame is no longer kept.
df = preprocess_text(df)


  from tqdm.autonotebook import tqdm, trange
2024-10-31 09:31:19,794 - Dataset loaded in 16.50 seconds
2024-10-31 09:31:19,798 - Starting text preprocessing
2024-10-31 09:31:36,388 - Text preprocessing completed in 16.59 seconds


In [4]:
# Preview the first rows of the preprocessed frame to check the new
# "id" and "cleaned_text" columns.
df.head()

Unnamed: 0.1,id,Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp,cleaned_text
0,1,1,bfa8876b-470e-4640-83a7-77427f7f37e8,234382942865437071667,A Google user,"I enjoy the awesome UI of this app, and it has...",5,4,1.1.0.91,2014-05-27 14:36:02,"I enjoy the awesome UI of this app, and it has..."
1,4,4,bbc1bf95-ed36-41a1-8b98-0f2e314caea5,167276875678680630145,A Google user,As a professional Android developer I'm glad t...,5,10,1.1.0.91,2014-05-27 15:26:48,As a professional Android developer Im glad to...
2,5,5,eac4e85c-2e13-4626-9072-5e190a285cb5,279544562364680964711,A Google user,If I had to put a $ amount on how much I would...,5,4,1.1.0.91,2014-05-27 15:34:29,If I had to put a amount on how much I would ...
3,8,8,5483e616-2c00-4c3e-8566-59b32a91b67f,283295985056957279128,A Google user,Easy to search and discover new music and also...,5,2,1.1.0.91,2014-05-27 16:42:06,Easy to search and discover new music and also...
4,12,12,6a9bbc61-75e2-4ce2-a092-62e74cfda8eb,137463903206137863639,A Google user,After updating to latest version I've got to d...,1,4,1.1.0.112,2014-05-27 18:16:30,After updating to latest version Ive got to do...


# 3. Add Metadata Keys

In [10]:
import json
import pandas as pd
import logging
import os
from dotenv import load_dotenv
from datetime import datetime
import re

# Load environment variables
load_dotenv()

# Configure logging. force=True makes the format apply even when an earlier
# cell already attached handlers to the root logger (basicConfig is otherwise
# a no-op in that case).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s", force=True)

# Load the dataset; fail fast with a clear message when the path is not configured
dataset_path = os.getenv("DATASET_PATH")
if not dataset_path:
    raise RuntimeError("DATASET_PATH is not set; define it in the environment or the .env file")
df = pd.read_csv(dataset_path)
logging.info("Dataset loaded successfully")

# Rename 'Unnamed: 0' to 'id' to serve as the unique identifier
# NOTE(review): the embedding cell derives 'id' from reset_index() instead; the
# two agree only if 'Unnamed: 0' equals the CSV row position - confirm.
df = df.rename(columns={"Unnamed: 0": "id"})

# Preprocess and extract date components
df = df.dropna(subset=['review_text'])
df['cleaned_text'] = df['review_text'].apply(lambda text: re.sub(r'[^\w\s,]', '', text))
df = df[df['cleaned_text'].apply(lambda x: len(x.split()) >= 10)].reset_index(drop=True)
# errors='coerce' turns unparseable timestamps into NaT, so year/month/day can be NaN
df['review_date'] = pd.to_datetime(df['review_timestamp'], errors='coerce')
df['year'] = df['review_date'].dt.year
df['month'] = df['review_date'].dt.month
df['day'] = df['review_date'].dt.day  # Extract day of the month

# Directory paths. Each can be overridden through the environment / .env file;
# the defaults are the original hard-coded locations.
embedding_directory_path = os.getenv(
    "EMBEDDING_DIR",
    "/Users/muhamadsyukron/Main Folder/Mac 2023 Files/Mekari/chatbot_project/dataset/embedding",
)
backup_directory_path = os.getenv(
    "EMBEDDING_BACKUP_DIR",
    "/Users/muhamadsyukron/Main Folder/Mac 2023 Files/Mekari/chatbot_project/dataset/embedding_backup",
)

# Create the backup directory if it doesn't exist
os.makedirs(backup_directory_path, exist_ok=True)

def add_fields_to_embeddings(source_dir, target_dir, df):
    """Copy embedding batch files to ``target_dir`` with review metadata added.

    Every ``embeddings_batch_*.json`` file in ``source_dir`` is read line by
    line (each non-blank line is a JSON array of embedding records). For each
    record whose ``id`` appears in ``df``, the fields ``review_rating``,
    ``year``, ``month`` and ``day`` are set from the first matching row.
    The combined records are written to a same-named file in ``target_dir`` as
    a single JSON array followed by a newline. Source files are not modified.

    Records whose ``id`` is not in ``df`` are written unchanged. A metadata
    value that is missing (NaN, e.g. from an unparseable timestamp) is stored
    as ``None`` (JSON ``null``) instead of raising on the integer conversion.

    Args:
        source_dir: Directory containing the embedding batch files.
        target_dir: Existing directory that receives the updated files.
        df: DataFrame with ``id``, ``review_rating``, ``year``, ``month`` and
            ``day`` columns.
    """
    logging.info("Starting to update embedding JSON files with additional fields")

    metadata_fields = ('review_rating', 'year', 'month', 'day')

    # Build the id -> row-position lookup once, rather than scanning the whole
    # frame for every record. keep='first' takes the first matching row.
    unique_rows = df.drop_duplicates(subset='id', keep='first')
    id_index = pd.Index(unique_rows['id'])
    field_values = {field: unique_rows[field].to_numpy() for field in metadata_fields}

    # Sorted so the processing (and log) order is deterministic
    for file_name in sorted(os.listdir(source_dir)):
        if not (file_name.startswith("embeddings_batch_") and file_name.endswith(".json")):
            continue

        source_file_path = os.path.join(source_dir, file_name)
        target_file_path = os.path.join(target_dir, file_name)
        logging.info(f"Processing file: {file_name}")

        # Load the embedding data; a file may hold several JSON arrays, one per line
        embeddings = []
        with open(source_file_path, 'r') as f:
            for line in f:
                if line.strip():  # tolerate blank lines
                    embeddings.extend(json.loads(line))

        # Add new fields from the DataFrame
        if embeddings:
            # get_indexer returns -1 for ids that are absent from df
            positions = id_index.get_indexer([embedding['id'] for embedding in embeddings])
            for embedding, position in zip(embeddings, positions):
                if position < 0:
                    continue
                for field in metadata_fields:
                    value = field_values[field][position]
                    embedding[field] = None if pd.isna(value) else int(value)

        # Save updated embeddings to the target backup file
        with open(target_file_path, 'w') as f:
            json.dump(embeddings, f)
            f.write('\n')  # Newline to avoid JSON parsing errors
        logging.info(f"Updated file saved: {target_file_path}")

    logging.info("All embedding JSON files updated successfully in backup directory")

# Enrich every batch file found in the embedding directory with the metadata
# columns of df and write the results to the backup directory, leaving the
# source files untouched.
add_fields_to_embeddings(embedding_directory_path, backup_directory_path, df)


2024-10-31 10:07:20,518 - Dataset loaded successfully
2024-10-31 10:08:29,321 - Starting to update embedding JSON files with additional fields
2024-10-31 10:08:29,336 - Processing file: embeddings_batch_2.json
2024-10-31 10:08:31,732 - Updated file saved: /Users/muhamadsyukron/Main Folder/Mac 2023 Files/Mekari/chatbot_project/dataset/embedding_backup/embeddings_batch_2.json
2024-10-31 10:08:31,733 - Processing file: embeddings_batch_1.json
2024-10-31 10:08:33,963 - Updated file saved: /Users/muhamadsyukron/Main Folder/Mac 2023 Files/Mekari/chatbot_project/dataset/embedding_backup/embeddings_batch_1.json
2024-10-31 10:08:33,965 - All embedding JSON files updated successfully in backup directory
