In [None]:
!pip install faiss-cpu 
!pip install sentence-transformers
!pip install scikit-learn
!pip install streamlit

In [1]:
!pip install streamlit-aggrid
!pip install reportlab
!pip install matplotlib

Collecting streamlit-aggrid
  Downloading streamlit_aggrid-1.1.3-py3-none-any.whl.metadata (8.6 kB)
Collecting python-decouple (from streamlit-aggrid)
  Downloading python_decouple-3.8-py3-none-any.whl.metadata (14 kB)
Downloading streamlit_aggrid-1.1.3-py3-none-any.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading python_decouple-3.8-py3-none-any.whl (9.9 kB)
Installing collected packages: python-decouple, streamlit-aggrid
Successfully installed python-decouple-3.8 streamlit-aggrid-1.1.3


In [1]:
import pandas as pd
import openai
import numpy as np
import time
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast on a missing *or blank* key. A blank value (e.g. `OPENAI_API_KEY=`)
# is not None, so it would otherwise reach the client and only surface later as
# a failed request on every row of the embedding loops below.
if api_key is None or not api_key.strip():
    raise ValueError("OPENAI_API_KEY not found in environment variables (missing or empty).")

# Initialize OpenAI client
client = openai.OpenAI(api_key=api_key)

# Load and clean data: keep only rows that have a company description, and
# renumber them 0..n-1 so row positions line up with the saved embedding arrays.
df = pd.read_csv("Sweden_merged_with_all_topics_and_growth_category.csv")
df = df[df["Final Company Description"].notna()].reset_index(drop=True)

# === Define embedding function ===
def get_openai_embedding(text, model="text-embedding-ada-002", max_retries=3, retry_delay=2.0):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    Failed requests are retried with exponential back-off.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.
        max_retries: Total number of attempts before giving up.
        retry_delay: Seconds to wait after the first failure; the wait doubles
            after each further failure.

    Returns:
        The embedding taken from the first item of the response, or ``None``
        if every attempt raised an exception. Callers must handle ``None``.
    """
    text = text.replace("\n", " ")
    for attempt in range(1, max_retries + 1):
        try:
            response = client.embeddings.create(input=[text], model=model)
            return response.data[0].embedding
        except Exception as e:
            if attempt == max_retries:
                # Nothing left to retry: report and stop without sleeping.
                print(f"Giving up after {max_retries} attempts: {e}")
                break
            print(f"Retrying due to error: {e}")
            # Back off 1x, 2x, 4x ... the base delay between attempts.
            time.sleep(retry_delay * 2 ** (attempt - 1))
    return None

# === Embedding 1: Final Company Description ===
# Vector length used for placeholder rows; 1536 is the embedding size of the
# default text-embedding-ada-002 model.
EMBEDDING_DIM = 1536

desc_embeddings = []
desc_failed_rows = []  # positions in df whose embedding request ultimately failed
for i, text in enumerate(df["Final Company Description"]):
    print(f"[1/2] Embedding Final Description {i + 1}/{len(df)}")
    embedding = get_openai_embedding(text)
    if embedding is None:
        # get_openai_embedding returns None when all retries fail. Appending
        # None would make the list ragged and break np.save after every API
        # call has already been made, so keep the row as a zero vector and
        # remember it for the report below.
        desc_failed_rows.append(i)
        embedding = np.zeros(EMBEDDING_DIM)
    desc_embeddings.append(embedding)

if desc_failed_rows:
    print(
        f"WARNING: {len(desc_failed_rows)} description embedding(s) failed and were "
        f"saved as zero vectors. Row positions: {desc_failed_rows}"
    )

# Save result as a regular 2-D float array (one row per df row, same order)
np.save("Sweden_embeddings_openai.npy", np.asarray(desc_embeddings, dtype=np.float64))

# === Embedding 2: Product + Activity fields ===
def build_product_activity_text(row):
    """Combine a row's product and activity fields into one labelled string.

    Reads "Product offerings" and "Key activities" from ``row`` via ``row.get``.
    Each value that is a string with non-whitespace content is stripped and
    prefixed with "Products: " or "Activities: " respectively; the resulting
    pieces are joined with " | ". Non-string or blank values are skipped, so
    the result is an empty string when neither field is usable.
    """
    labelled_columns = (
        ("Products: ", "Product offerings"),
        ("Activities: ", "Key activities"),
    )
    segments = []
    for prefix, column in labelled_columns:
        value = row.get(column, "")
        # Missing cells are not strings, so the type check filters them out.
        if not isinstance(value, str):
            continue
        cleaned = value.strip()
        if cleaned:
            segments.append(prefix + cleaned)
    return " | ".join(segments)

# Build the combined "Products: ... | Activities: ..." text for every row
df["ProductActivity Text"] = df.apply(build_product_activity_text, axis=1)

# Vector length used for placeholder rows; 1536 is the embedding size of the
# default text-embedding-ada-002 model.
EMBEDDING_DIM = 1536

pa_embeddings = []
pa_failed_rows = []  # positions in df whose embedding request ultimately failed
for i, text in enumerate(df["ProductActivity Text"]):
    print(f"[2/2] Embedding Product + Activity {i + 1}/{len(df)}")
    if text.strip():
        embedding = get_openai_embedding(text)
        if embedding is None:
            # get_openai_embedding returns None when all retries fail. Appending
            # None would make the list ragged and break np.save, so fall back to
            # the same zero-vector placeholder used for rows with no text.
            pa_failed_rows.append(i)
            embedding = np.zeros(EMBEDDING_DIM)
    else:
        # Rows with no product/activity text get a zero vector without an API call
        embedding = np.zeros(EMBEDDING_DIM)
    pa_embeddings.append(embedding)

if pa_failed_rows:
    print(
        f"WARNING: {len(pa_failed_rows)} product/activity embedding(s) failed and were "
        f"saved as zero vectors. Row positions: {pa_failed_rows}"
    )

# Save outputs: the filtered frame and its embeddings share the same row order
df.to_csv("Sweden_final_filtered.csv", index=False)
np.save("Sweden_product_activity_embeddings.npy", np.asarray(pa_embeddings, dtype=np.float64))

[1/2] Embedding Final Description 1/1500
[1/2] Embedding Final Description 2/1500
[1/2] Embedding Final Description 3/1500
[1/2] Embedding Final Description 4/1500
[1/2] Embedding Final Description 5/1500
[1/2] Embedding Final Description 6/1500
[1/2] Embedding Final Description 7/1500
[1/2] Embedding Final Description 8/1500
[1/2] Embedding Final Description 9/1500
[1/2] Embedding Final Description 10/1500
[1/2] Embedding Final Description 11/1500
[1/2] Embedding Final Description 12/1500
[1/2] Embedding Final Description 13/1500
[1/2] Embedding Final Description 14/1500
[1/2] Embedding Final Description 15/1500
[1/2] Embedding Final Description 16/1500
[1/2] Embedding Final Description 17/1500
[1/2] Embedding Final Description 18/1500
[1/2] Embedding Final Description 19/1500
[1/2] Embedding Final Description 20/1500
[1/2] Embedding Final Description 21/1500
[1/2] Embedding Final Description 22/1500
[1/2] Embedding Final Description 23/1500
[1/2] Embedding Final Description 24/1500
[