In [1]:
import pandas as pd

In [2]:
# Load the OpenStreetMap tourist-attraction export into a DataFrame.
# low_memory=False turns off pandas' chunked parsing of the file.
df = pd.read_csv(
    'OpenStreetMap_Tourist_Attractions_for_North_America.csv',
    low_memory=False,
)

In [3]:
# Display the loaded frame; the rendered output shows only the first and last rows.
df

Unnamed: 0,X,Y,OBJECTID,addr_city,addr_country,addr_housenumber,addr_postcode,addr_province,addr_state,addr_street,...,ref,source_transform,tourism,website,wheelchair,winter_road,wires,wood,zoo,osm_id2
0,-92.318096,14.616769,192,,,,,,,,...,,,viewpoint,,,,,,,5327709923
1,-92.355947,14.653835,251,,,,,,,,...,,,hotel,,,,,,,388651468
2,-92.240303,14.745767,527,,,,,,,,...,,,camp_site,,,,,,,7883004685
3,-92.433990,14.732903,611,,,,30800,,,Villa del Sol,...,,,camp_site,https://www.misionsurfmexico.com/,,,,,,7228473785
4,-92.284821,14.865885,958,,,,,,,,...,,,motel,,,,,,,4794499945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266400,-124.125931,40.685500,42976596,,,,,,,,...,,,information,,,,,,,11808672613
266401,-124.140263,40.691743,42976597,,,,,,,,...,,,information,,,,,,,11808672628
266402,-122.331648,47.608754,42976950,,,,,,,,...,,,information,,,,,,,11808581456
266403,-122.329952,47.609913,42976952,,,,,,,,...,,,artwork,,,,,,,11808720407


In [3]:
# Materialize the column labels as a plain Python list and print them on one line.
column_names = list(df.columns)
print(column_names)

['X', 'Y', 'OBJECTID', 'addr_city', 'addr_country', 'addr_housenumber', 'addr_postcode', 'addr_province', 'addr_state', 'addr_street', 'addr_unit', 'amenity', 'attraction', 'bicycle', 'board_type', 'building', 'building_levels', 'description', 'ele', 'historic', 'information', 'name', 'name_en', 'name_es', 'opening_hours', 'operator', 'phone', 'ref', 'source_transform', 'tourism', 'website', 'wheelchair', 'winter_road', 'wires', 'wood', 'zoo', 'osm_id2']


In [4]:
# Show only the rows that actually carry a city value.
df.loc[df['addr_city'].notna(), 'addr_city']

18                 Tapachula
21                 Tapachula
37        Tapachula, Chiapas
78                  Valdivia
84         Frontera Comalapa
                 ...        
266390              Richmond
266391             Rosenberg
266397       Rivière-du-Loup
266398       Rivière-du-Loup
266404               Lincoln
Name: addr_city, Length: 14573, dtype: object

In [5]:
# Drop accommodation-type entries so only the remaining tourism categories are kept.
is_lodging = df['tourism'].isin(['hotel', 'motel', 'hostel', 'guest_house', 'apartment'])
df1 = df.loc[~is_lodging]

In [6]:
# Keep just the three columns used downstream: category, attraction name and city.
df2 = df1.loc[:, ['tourism', 'name', 'addr_city']]

In [7]:
# Remove every row that is missing any of the three selected fields.
df2 = df2.dropna(axis=0, how='any')

In [8]:
# Display the cleaned three-column frame (tourism, name, addr_city).
df2

Unnamed: 0,tourism,name,addr_city
589,gallery,Beau Dégât [bo.de.ga],San Cristóbal de Las Casas
596,gallery,La Galería,San Cristóbal de Las Casas
617,gallery,La Tozi Galeria Hotel,San Cristóbal de las Casas
634,gallery,Etéreo Taller,San Cristobal de las Casas Chiapas
651,gallery,Galería MUY,San Cristóbal de las Casas
...,...,...,...
265558,gallery,Regina Piantedosi Gallery,Portsmouth
265669,caravan_site,Sandpipers Nudist Resort & RV Park,Edinburg
265881,museum,Museo del Taco,Tijuana
265979,gallery,Izzy's Raw Art Gallery,Detroit


In [10]:
# Number of distinct tourism categories left after cleaning (missing values are not counted).
df2['tourism'].nunique(dropna=True)

59

In [12]:
# Number of distinct city strings left after cleaning (missing values are not counted).
df2['addr_city'].nunique(dropna=True)

3286

In [66]:
from sentence_transformers import SentenceTransformer

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all attraction names in a single batched call instead of one model call per row.
# With a list input, encode returns one embedding row per name, in input order.
name_embeddings = model.encode(df2['name'].tolist())

# Build the frame in one step. Passing df2's index keeps each embedding and its
# metadata dict attached to the label of the source row it was derived from.
df4 = pd.DataFrame(
    {
        # list(...) splits the 2-D result into one 1-D vector per row.
        'vectors': list(name_embeddings),
        # One {'tourism': ..., 'addr_city': ...} dict per row.
        'metadata': df2[['tourism', 'addr_city']].to_dict('records'),
    },
    index=df2.index,
)


In [67]:
# Discard the row labels carried over from df2 and renumber the rows from 0.
# The upsert payload cell uses each row's label as the record id.
df4 = df4.reset_index(drop=True)


In [9]:
import os
from dotenv import load_dotenv
env_path = './Apikey.env'  # Adjust the path if your file is in a different location
# Load the variables defined in the env file into the process environment.
load_dotenv(dotenv_path=env_path)
pinecone_key = os.getenv('PINECONE_API_KEY')
# Fail here, with a message naming the file to check, instead of carrying a
# missing key forward into the client-construction cell.
if not pinecone_key:
    raise RuntimeError(
        f"PINECONE_API_KEY is not set; add it to {env_path} or export it in the environment."
    )

In [11]:
from pinecone import Pinecone

# Create the Pinecone client with the key obtained via os.getenv('PINECONE_API_KEY').
pc = Pinecone(api_key= pinecone_key)
# Handle to the "myscu" index; batch_upsert calls index.upsert on this object.
index = pc.Index("myscu")

In [72]:
# Build the upsert payload: one record per df4 row.
# The loop variable is deliberately not called `index`, which is the name of the
# Pinecone index handle used by the upsert cell.
vectors_to_upsert = [
    {
        # Pinecone record ids are strings, so convert the row label explicitly.
        "id": str(row_label),
        # Convert each component to a plain Python float so the record is
        # JSON-serializable whether the stored vector is an array or a list.
        "values": [float(component) for component in row.vectors],
        # Dict with the 'tourism' and 'addr_city' values of the source row.
        "metadata": row.metadata,
    }
    for row_label, row in df4.iterrows()
]


In [77]:
def batch_upsert(vectors, batch_size=50, namespace="travel_embedding", target_index=None):
    """Upsert ``vectors`` in consecutive batches of at most ``batch_size`` records.

    A batch whose upsert raises is reported and recorded, and the loop then moves on
    to the next batch, so one bad batch does not stop the remaining uploads.

    Args:
        vectors: Sequence of records to upsert; must support ``len`` and slicing.
        batch_size: Maximum number of records sent per ``upsert`` call. Must be > 0.
        namespace: Namespace passed to every ``upsert`` call.
        target_index: Object exposing ``upsert(vectors=..., namespace=...)``.
            When omitted, the notebook-level ``index`` is used.

    Returns:
        The 1-based numbers of the batches that failed, in order. Empty when every
        batch succeeded, so the caller can decide whether a retry is needed.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step would make range() yield nothing and silently upload no data.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if target_index is None:
        target_index = index  # notebook-level Pinecone index handle

    failed_batches = []
    # Split the data into batches
    starts = range(0, len(vectors), batch_size)
    for batch_number, start in enumerate(starts, start=1):
        batch = vectors[start:start + batch_size]
        try:
            target_index.upsert(vectors=batch, namespace=namespace)
        except Exception as e:
            # Best-effort upload: remember the failure and keep going.
            failed_batches.append(batch_number)
            print(f"Failed to upsert batch {batch_number}: {e}")
        else:
            print(f"Batch {batch_number} upserted successfully.")
    return failed_batches

# Upload the complete payload built above, 50 records per request.
batch_upsert(vectors=vectors_to_upsert, batch_size=50)

Batch 1 upserted successfully.
Batch 2 upserted successfully.
Batch 3 upserted successfully.
Batch 4 upserted successfully.
Batch 5 upserted successfully.
Batch 6 upserted successfully.
Batch 7 upserted successfully.
Batch 8 upserted successfully.
Batch 9 upserted successfully.
Batch 10 upserted successfully.
Batch 11 upserted successfully.
Batch 12 upserted successfully.
Batch 13 upserted successfully.
Batch 14 upserted successfully.
Batch 15 upserted successfully.
Batch 16 upserted successfully.
Batch 17 upserted successfully.
Batch 18 upserted successfully.
Batch 19 upserted successfully.
Batch 20 upserted successfully.
Batch 21 upserted successfully.
Batch 22 upserted successfully.
Batch 23 upserted successfully.
Batch 24 upserted successfully.
Batch 25 upserted successfully.
Batch 26 upserted successfully.
Batch 27 upserted successfully.
Batch 28 upserted successfully.
Batch 29 upserted successfully.
Batch 30 upserted successfully.
Batch 31 upserted successfully.
Batch 32 upserted