In [1]:
# pip install pandas openai pinecone tqdm
import ast
import os, time

import pandas as pd
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

# API keys are read from the environment so that no secret lives in the notebook.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Serverless deployment target, passed to Pinecone when the index is created.
cloud, region = 'aws', 'us-east-1'
spec = ServerlessSpec(region=region, cloud=cloud)

### Embedding

In [None]:
# Load the raw catalogue and drop the columns left out of the embedding text.
df = pd.read_csv('data/netflix_titles.csv')
df = df.drop(columns=['date_added', 'duration'])

# Build one text per title from all remaining columns. Fields are separated by
# a space (joining on '' fuses neighbouring values such as the type and the
# title into a single token), and missing values are skipped instead of being
# embedded as the literal string 'nan'.
df['combined'] = df.apply(
    lambda row: ' '.join(str(value) for value in row if pd.notna(value)),
    axis=1,
)
df.head(3)

In [None]:
def embed_fn(text_list, batch_size=100, max_retries=3, verbose=True):
    """Embed texts with the ``text-embedding-3-small`` model, batch by batch.

    Args:
        text_list: Iterable of texts to embed (a list or a pandas Series).
        batch_size: Number of texts sent in one embeddings request.
        max_retries: Attempts made for a batch before it is given up on
            (values below 1 are treated as 1).
        verbose: When True, print one line per embedded row.

    Returns:
        A list with exactly one entry per input text, in input order. Each
        entry is the embedding returned by the API, or ``None`` when the
        batch containing that text still failed after ``max_retries``
        attempts. Keeping the placeholders means the result always lines up
        with the input rows instead of silently becoming shorter.
    """
    # Work on a plain list so slicing is positional and the request receives
    # a list regardless of the container type that was passed in.
    texts = list(text_list)
    attempts_allowed = max(1, max_retries)
    embedding = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        batch_embedding = None

        for attempt in range(1, attempts_allowed + 1):
            try:
                response = client.embeddings.create(input=batch, model='text-embedding-3-small')
                batch_embedding = [item.embedding for item in response.data]
                break
            except Exception as e:
                # Best effort: report the failure, back off, then retry.
                print(f'Error at batch {i//batch_size} (attempt {attempt}/{attempts_allowed}): {e}')
                time.sleep(5)

        if batch_embedding is None:
            # Every attempt failed: keep one placeholder per text so later
            # rows stay aligned with their inputs.
            batch_embedding = [None] * len(batch)
        else:
            if verbose:
                for j in range(len(batch_embedding)):
                    print(f'Embedded row {i+j}')
            # Short pause between successful requests.
            time.sleep(1)

        embedding.extend(batch_embedding)

    return embedding
df['embedding'] = embed_fn(df['combined'])
# Round every component to 15 decimal places; entries that are not a list
# (a missing embedding) are kept as None instead of raising.
df['embedding'] = df['embedding'].apply(
    lambda vector: [round(value, 15) for value in vector] if isinstance(vector, list) else None
)
# Persist the result: the upload section reads this file back from disk, and
# no other cell creates it.
df.to_csv('data/netflix_titles_embedding.csv', index=False)

In [None]:
# Length of the embedding stored under row label 0.
len(df.loc[0, 'embedding'])

## Uploading Embedded File

In [None]:
# Read the stored embeddings back from disk.
df = pd.read_csv('data/netflix_titles_embedding.csv')

index_name = 'netflix-titles'
ns_name = 'movie-tv-shows'

# Create the index only when it is not already listed for this account.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    pc.create_index(index_name, spec=spec, metric='cosine', dimension=1536)

# Handle used for the upserts.
pinecone_index = pc.Index(index_name)

In [None]:
# Use each row's index label, rendered as a string, as its vector id.
df['id'] = df.index.map(str)

def batch_data(df, batch_size):
    """Lazily yield consecutive positional slices of ``df``.

    Each yielded slice holds at most ``batch_size`` rows; the final slice
    holds whatever rows remain.
    """
    yield from (
        df.iloc[start:start + batch_size]
        for start in range(0, len(df), batch_size)
    )
print('Uploading vector data!')

batch_size = 100
# tqdm cannot size a generator, so pass the batch count (ceiling division)
# to get a real progress bar.
total_batches = -(-len(df) // batch_size)
for batch_df in tqdm(batch_data(df, batch_size), total=total_batches):
    # Rows without a stored embedding cannot be parsed or uploaded; skip them.
    batch_df = batch_df[batch_df['embedding'].notna()]
    if batch_df.empty:
        continue
    # The CSV stores each vector as the text of a Python list.
    # ast.literal_eval parses literals only, so nothing in the file can run
    # as code the way it could with eval().
    vectors = batch_df['embedding'].apply(ast.literal_eval)
    vector_list = list(zip(batch_df['id'], vectors))
    pinecone_index.upsert(vectors=vector_list, namespace=ns_name)

In [None]:
# Reload the raw titles from the same path the embedding section uses; the
# previous './Data/...' spelling differs in case and only resolves on
# case-insensitive filesystems.
dataset = pd.read_csv('data/netflix_titles.csv')
# String ids built from the row index, the same way the vector ids are built.
dataset['id'] = dataset.index.astype(str)

# NOTE(review): assumes the CSV has an 'actors' column; confirm against the
# file header (some copies of this dataset may name it 'cast').
metadata_columns = ['type', 'title', 'director', 'actors', 'description']

# NOTE: this rebinds `df` from a DataFrame to a plain dict that maps each id
# to a {column: value} record.
df = dict(zip(
    dataset['id'],
    dataset[metadata_columns].to_dict(orient='records'),
))

In [None]:
# `df` is a plain dict at this point (rebound in the previous cell), so
# `df.index` raised AttributeError. The string ids belong on the `dataset`
# DataFrame; re-running this assignment is idempotent.
dataset['id'] = dataset.index.astype(str)

In [None]:
# Preview a few id -> metadata entries instead of dumping the whole mapping.
dict(list(df.items())[:3])