# We import everything necessary

In [3]:
import pandas as pd
import os 
import config
from openai import OpenAI
import openai
from pinecone import Pinecone
from tqdm import tqdm



# We get the necessary data/ transcripts

In [5]:
#now we load the csv file with the transcripts
#specify the path to the csv file
csv_file_path = os.path.join('..', 'csv_files', 'output_videos.csv')
# Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)
df.columns

Index(['alt', 'caption', 'shortCode', 'timestamp', 'type', 'url',
       'videoDuration', 'videoUrl', 'transcription'],
      dtype='object')

# Write "Now we create the embeddings for the transcripts"

In [3]:
# We use OpenAi's text-embedding-3-small model to get the embeddings of the the transcripts
client = OpenAI(api_key=config.OPENAI_API_KEY)
# We will store the embeddings in an additional column in the DataFrame



In [38]:
# We create an "get_embedding" function that will get the embeddings of the text
def get_embedding(text):
    #Call the OpenAI API to get the embeddings
    try: 
        response = client.embeddings.create(
        input= text,
        model="text-embedding-3-small"
        )   
    except Exception as e:
        print(f"BadRequestError for text '{text}': {e}")
        return None

    return response.data[0].embedding

In [33]:
#test with one row
text = df['transcription'][0]
embedding = get_embedding(text)
print(embedding)

[0.0019569178111851215, -0.011629682965576649, -0.013530689291656017, 0.013104360550642014, -0.007611012551933527, -0.0370137058198452, -0.030863391235470772, 0.05638718977570534, -0.0008474153582938015, -0.02982901968061924, -0.04224146902561188, -0.006747872103005648, -0.018660610541701317, -0.002993036061525345, -0.017262810841202736, 0.01114045362919569, -0.04612734913825989, 0.00877817440778017, 0.035308390855789185, 0.03301599994301796, -0.0038579239044338465, -0.009057734161615372, 0.022127149626612663, 0.07928313314914703, 0.008729251101613045, 0.032484836876392365, -0.02829144150018692, 0.05915483087301254, 0.022923896089196205, 0.004972667898982763, 0.00990340206772089, -0.012922647409141064, -0.004039637744426727, 4.8649941163603216e-05, -0.020198188722133636, -0.007932506501674652, -0.0003398397529963404, -0.0029213987290859222, 0.02781618945300579, -0.028934428468346596, 0.0051508876495063305, 0.04987344890832901, 0.02356688305735588, -0.04134687781333923, 0.04025659710168

In [39]:
# We apply the get_embedding function to the transcript column of the DataFrame
#To see progress we will use the tqdm library
from tqdm import tqdm
tqdm.pandas()
df['embedding'] = df['transcription'].progress_apply(get_embedding)
#df['embedding'] = df['transcription'].apply(get_embedding)


  0%|          | 0/272 [00:00<?, ?it/s]

 31%|███▏      | 85/272 [00:25<00:49,  3.75it/s]

BadRequestError for text 'nan': Error code: 400 - {'error': {'message': "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 32%|███▏      | 88/272 [00:26<00:44,  4.13it/s]

BadRequestError for text 'nan': Error code: 400 - {'error': {'message': "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 38%|███▊      | 102/272 [00:30<00:56,  3.01it/s]

BadRequestError for text 'nan': Error code: 400 - {'error': {'message': "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 49%|████▉     | 133/272 [00:40<00:37,  3.75it/s]

BadRequestError for text 'nan': Error code: 400 - {'error': {'message': "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 69%|██████▉   | 187/272 [00:56<00:21,  3.99it/s]

BadRequestError for text 'nan': Error code: 400 - {'error': {'message': "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", 'type': 'invalid_request_error', 'param': None, 'code': None}}


100%|██████████| 272/272 [01:22<00:00,  3.30it/s]


In [40]:
#now we store the DataFrame with the embeddings in a new csv file
#specify the path to the csv file
embedding_csv_file_path = os.path.join('..', 'csv_files', 'output_videos_embeddings.csv')
# Save the DataFrame to a CSV file
df.to_csv(embedding_csv_file_path, index=False)

# Now we upsert the vectors/ embeddings with metadata to Pinecone

In [4]:
#Create the necessary connection to Pinecone
pc = Pinecone(api_key=config.PINECONE_API_KEY)
index = pc.Index("isaac")


In [5]:
#Load the DataFrame with the embeddings
embedding_csv_file_path = os.path.join('..', 'csv_files', 'output_videos_embeddings.csv')
embedding_df = pd.read_csv(embedding_csv_file_path)

In [6]:
#Create a function that we can apply to each row in the DataFrame to add the embeddings to the Pinecone index
def add_to_pinecone(row):
    # Get the video_id and embedding
    video_id = row['shortCode']
    embedding = row['embedding']
    # Create a dictionary with the metadata
    metadata = {
        "instagram_url": row['url'],
        "caption": row['caption'],
        "shortCode": row['shortCode'], 
        "transcript": row['transcription']
    }
    # Add the embedding to the Pinecone index
    index.upsert(
    vectors=[
        {
            "id": video_id,
            "values": embedding,
            "metadata": metadata
        }
    ],
    namespace= "first_try"
)

In [7]:
# Clean/ Preprocess the DataFrame with the embeddings: 
#drop rows with missing embeddings
embedding_df = embedding_df.dropna(subset=['embedding'])
embedding_df['embedding'].apply(lambda x: type(x) if isinstance(x, str) else (print(type(x)), print(x)))
# Transform each embedding to a list of floats using the eval method – this is necessary because pinecone only accepts lists of floats
embedding_df['embedding'] = embedding_df['embedding'].apply(eval)

In [8]:
# Apply the add_to_pinecone function to each row of the DataFrame and track the progress using tqdm
tqdm.pandas()
embedding_df.progress_apply(add_to_pinecone, axis=1)

100%|██████████| 267/267 [00:52<00:00,  5.06it/s]


0      None
1      None
2      None
3      None
4      None
       ... 
267    None
268    None
269    None
270    None
271    None
Length: 267, dtype: object