In [6]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from tqdm import tqdm


In [7]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# A blank entry such as `OPENAI_API_KEY=` yields "" rather than None, so check
# for an empty/whitespace-only value too; otherwise the problem would only
# surface later, at the first API request.
if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
    raise ValueError("❌ OPENAI_API_KEY not found. Check your .env file.")

print("✅ API key loaded")


✅ API key loaded


In [8]:
from openai import OpenAI

# One client object, authenticated with the key read from the environment,
# is shared by every embedding request made further down.
client = OpenAI(
    api_key=OPENAI_API_KEY,
)
print("✅ OpenAI client initialized")

✅ OpenAI client initialized


In [10]:
# Read the raw air-quality table; the path is relative to the working directory.
csv_path = "global air pollution dataset.csv"
df = pd.read_csv(csv_path)

# Report the (rows, columns) size, then preview the first few records.
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (23463, 12)


Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category
0,Russian Federation,Praskoveya,51,Moderate,1,Good,36,Good,0,Good,51,Moderate
1,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,41,Good
2,Italy,Priolo Gargallo,66,Moderate,1,Good,39,Good,2,Good,66,Moderate
3,Poland,Przasnysz,34,Good,1,Good,34,Good,0,Good,20,Good
4,France,Punaauia,22,Good,0,Good,22,Good,0,Good,6,Good


In [11]:
# Location identifiers plus the numeric AQI readings that feed the text
# description; every column not listed here is dropped.
columns_used = [
    "Country",
    "City",
    "AQI Value",
    "CO AQI Value",
    "Ozone AQI Value",
    "NO2 AQI Value",
    "PM2.5 AQI Value"
]

# Take an explicit copy: new columns are assigned to this frame later in the
# notebook, and writing into a column selection of another frame is what
# pandas' SettingWithCopyWarning guards against.
df = df[columns_used].copy()
df.head()


Unnamed: 0,Country,City,AQI Value,CO AQI Value,Ozone AQI Value,NO2 AQI Value,PM2.5 AQI Value
0,Russian Federation,Praskoveya,51,1,36,0,51
1,Brazil,Presidente Dutra,41,1,5,1,41
2,Italy,Priolo Gargallo,66,1,39,2,66
3,Poland,Przasnysz,34,1,34,0,20
4,France,Punaauia,22,0,22,0,6


In [12]:
def row_to_text(row):
    """Render one air-quality record as an English sentence.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide the keys "City", "Country", "AQI Value", "CO AQI Value",
        "Ozone AQI Value", "NO2 AQI Value" and "PM2.5 AQI Value".

    Returns
    -------
    str
        A sentence naming the location followed by the five AQI readings.
        When "Country" is missing (None/NaN) the " in <country>" part is
        omitted; when "City" is missing the sentence starts with
        "An unnamed city". This keeps literal "nan"/"None" tokens out of the
        generated text.
    """
    city = row["City"]
    country = row["Country"]

    # Build the location phrase from whichever identifiers are present.
    subject = "An unnamed city" if pd.isna(city) else f"City {city}"
    if not pd.isna(country):
        subject += f" in {country}"

    return (
        f"{subject} has an overall AQI of {row['AQI Value']}. "
        f"Carbon Monoxide AQI is {row['CO AQI Value']}, "
        f"Ozone AQI is {row['Ozone AQI Value']}, "
        f"Nitrogen Dioxide AQI is {row['NO2 AQI Value']}, "
        f"PM2.5 AQI is {row['PM2.5 AQI Value']}."
    )

# axis=1 passes each row to row_to_text; the resulting sentences are stored
# next to the numeric columns they were built from.
row_sentences = df.apply(row_to_text, axis=1)
df["text"] = row_sentences

# Preview the sentence generated for the first record.
df["text"].iloc[0]


'City Praskoveya in Russian Federation has an overall AQI of 51. Carbon Monoxide AQI is 1, Ozone AQI is 36, Nitrogen Dioxide AQI is 0, PM2.5 AQI is 51.'

In [13]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for a single input from the embeddings endpoint.

    Parameters
    ----------
    text : str
        The input passed as `input` to the embeddings request.
    model : str, optional
        Name of the embedding model to use. Defaults to
        "text-embedding-3-small", the model used throughout this notebook.

    Returns
    -------
    The `embedding` attribute of the first item in the response's `data`.

    Notes
    -----
    Uses the module-level `client`; any exception raised by the request is
    propagated to the caller unchanged.
    """
    response = client.embeddings.create(model=model, input=text)
    # Only the first returned item is read, matching the single input sent.
    return response.data[0].embedding


In [15]:
BATCH_SIZE = 100  # number of texts sent in each embeddings request
EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = []

texts = df["text"].tolist()

for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding batches"):
    batch = texts[i:i + BATCH_SIZE]

    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=batch
    )

    # Each returned item reports the position of its input within the request;
    # order by that explicitly instead of relying on the response ordering, so
    # every vector lands on the row whose text produced it.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    batch_embeddings = [item.embedding for item in ordered_items]

    # Fail at the offending batch rather than at the final column assignment,
    # which would only happen after every request has already been made.
    if len(batch_embeddings) != len(batch):
        raise RuntimeError(
            f"Embedding batch starting at row {i} returned "
            f"{len(batch_embeddings)} vectors for {len(batch)} inputs."
        )

    embeddings.extend(batch_embeddings)

# One vector per row, in the same order as `texts`.
df["embedding"] = embeddings


Embedding batches: 100%|██████████| 235/235 [09:21<00:00,  2.39s/it]


In [16]:
# Measure the length of the first stored vector and report it.
first_embedding = df["embedding"].iloc[0]
embedding_dim = len(first_embedding)
print("Embedding dimension:", embedding_dim)


Embedding dimension: 1536


In [17]:
# Persist the whole frame (source columns, text and embeddings) so the
# vectors can be reloaded without repeating the embedding requests.
pickle_path = "air_pollution_embeddings.pkl"
df.to_pickle(pickle_path)
print("✅ Embeddings saved to " + pickle_path)


✅ Embeddings saved to air_pollution_embeddings.pkl


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so each embedding is wrapped in a
# list to form a one-row matrix; the result is then a 1x1 matrix.
first_vector = [df["embedding"].iloc[0]]
second_vector = [df["embedding"].iloc[1]]
sim = cosine_similarity(first_vector, second_vector)

print("Cosine similarity between first two cities:", sim[0][0])


Cosine similarity between first two cities: 0.6398483472770359
