In [None]:
pip install scipy

In [3]:
import os
import requests
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from scipy.spatial.distance import cosine

In [4]:
# Load environment variables
load_dotenv()
api_key = os.getenv("API_KEY")
endpoint = os.getenv("ENDPOINT")

# Load the pre-generated embeddings
embeddings_df = pd.read_csv('Customer_Embeddings.csv')

# If the embeddings were read back as text (bracketed, comma-separated numbers),
# parse each cell into a NumPy array. The first row is inspected by position
# (.iloc) and only when the frame has rows, so an empty CSV no longer raises a
# KeyError from the label lookup `[0]`.
if len(embeddings_df) and isinstance(embeddings_df['Embeddings'].iloc[0], str):
    # Drop the surrounding brackets, then split on commas into floats
    embeddings_df['Embeddings'] = embeddings_df['Embeddings'].apply(
        lambda x: np.fromstring(x.strip("[]"), sep=",")
    )

# Define headers for embedding request
headers = {
    "api-key": api_key,
    "Content-Type": "application/json"
}


In [6]:
# Function to generate an embedding for the input demographics
def get_embedding(text, timeout=30):
    """Request an embedding vector for ``text`` from the embeddings deployment.

    Posts ``text`` to the ``text-embedding-ada-002`` deployment under the
    module-level ``endpoint``, authenticating with the module-level ``headers``.

    Args:
        text: The string to embed (e.g. a demographics description).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The value of ``data[0]["embedding"]`` from the JSON response, or
        ``None`` when the endpoint/API key is missing, the request fails, the
        API answers with a non-200 status, or the response body does not have
        the expected structure. Every failure is reported with ``print``.
    """
    # Same configuration guard (and message) as predict_with_gpt4, so a missing
    # .env entry is reported clearly instead of producing a malformed URL.
    if not endpoint or not headers.get("api-key"):
        print("Missing API endpoint or API key in environment variables.")
        return None

    data_payload = {
        "input": [text],
        "model": "text-embedding-ada-002"
    }
    # Tolerate an endpoint configured with a trailing slash.
    url = f"{endpoint.rstrip('/')}/openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-04-01-preview"

    # Network problems (DNS, refused connection, timeout) are reported the same
    # way as API errors rather than surfacing as an uncaught exception.
    try:
        response = requests.post(url, headers=headers, json=data_payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

    if response.status_code != 200:
        print(f"Embedding API Error: {response.status_code} - {response.text}")
        return None

    # A 200 response with a non-JSON or unexpectedly shaped body is still a failure.
    try:
        return response.json()['data'][0]['embedding']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        print(f"Embedding API returned an unexpected response: {e!r}")
        return None


In [7]:
# Function to find the most similar profiles based on cosine similarity
def find_similar_profiles(input_embedding, embeddings_df, top_n=3):
    """Rank stored embeddings by cosine similarity to ``input_embedding``.

    Args:
        input_embedding: The query vector to compare against each stored one.
        embeddings_df: DataFrame with a ``CustomerID`` column and an
            ``Embeddings`` column holding one vector per row. It is not
            modified; the scores are attached to a copy.
        top_n: Number of best-matching rows to return.

    Returns:
        A DataFrame with columns ``CustomerID`` and ``similarity`` containing
        the ``top_n`` highest-scoring rows (best first); an empty DataFrame
        with those columns when ``embeddings_df`` has no rows; or ``None`` if
        the similarity computation raised (the error is printed).
    """
    # Nothing to rank: return an empty result instead of failing in nlargest.
    if embeddings_df.empty:
        return pd.DataFrame(columns=['CustomerID', 'similarity'])

    # Calculate similarity for each stored embedding
    try:
        # Convert the query once instead of once per row inside cosine().
        query = np.asarray(input_embedding)
        similarity = embeddings_df['Embeddings'].apply(lambda emb: 1 - cosine(query, emb))
    except Exception as e:
        print(f"Error calculating similarity: {e}")
        return None

    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # 'similarity' column as a side effect of ranking.
    scored = embeddings_df.assign(similarity=similarity)

    # Sort by similarity and select top N most similar profiles
    return scored.nlargest(top_n, 'similarity')[['CustomerID', 'similarity']]

In [16]:
# Function to predict using GPT-4o-mini based on similar profiles
def predict_with_gpt4(similar_profiles, task, timeout=60):
    """Ask the ``gpt-4o-mini`` chat deployment for a forecast.

    Builds a prompt listing each similar profile's ``CustomerID`` and
    ``similarity`` score, then asks the model to forecast ``task``.

    Args:
        similar_profiles: DataFrame with ``CustomerID`` and ``similarity``
            columns.
        task: What to forecast; inserted into the prompt after the word
            "Forecast".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first completion choice, or ``None`` when the
        endpoint/API key is missing, the request fails, the API answers with a
        non-200 status, or the response body does not have the expected
        structure. Every failure is reported with ``print``.
    """
    # Format the similar profiles for the prompt. The two columns are iterated
    # directly rather than through iterrows(): iterrows() builds one Series per
    # row, which upcasts an integer CustomerID to float next to the float
    # similarity and would render it as e.g. "101.0".
    profiles_text = "\n".join(
        f"CustomerID: {customer_id}, Similarity Score: {score}"
        for customer_id, score in zip(similar_profiles['CustomerID'], similar_profiles['similarity'])
    )
    prompt = f"Based on the following similar customer profiles:\n{profiles_text}\nForecast {task} considering demographics and explain why this forecast is appropriate."

    # Refuse to call the API without configuration
    if not endpoint or not api_key:
        print("Missing API endpoint or API key in environment variables.")
        return None

    # Endpoint for the GPT-4o-mini deployment (tolerating a trailing slash)
    url = f"{endpoint.rstrip('/')}/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview"

    # Define headers
    request_headers = {
        "api-key": api_key,
        "Content-Type": "application/json"
    }

    # Define the request payload
    payload = {
        "messages": [
            {"role": "system", "content": "You are a financial forecasting assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 150,
        "temperature": 0.7
    }

    # Make the request to the API; a timeout keeps a stalled connection from
    # hanging the notebook cell indefinitely.
    try:
        response = requests.post(url, headers=request_headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

    # Check if the request was successful
    if response.status_code != 200:
        print(f"ChatCompletion API Error: {response.status_code} - {response.text}")
        return None

    # A 200 response with a non-JSON or unexpectedly shaped body is still a failure.
    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        print(f"ChatCompletion API returned an unexpected response: {e!r}")
        return None

In [None]:
# Main Workflow
if __name__ == "__main__":
    # Step 1: Generate embedding for input text
    test_text = "Age: 30, Salary: 5000, Job Title: Private Sector Employee"
    input_embedding = get_embedding(test_text)

    if input_embedding is not None:
        # Step 2: Rank the stored customer embeddings against the input embedding
        similar_profiles_df = find_similar_profiles(input_embedding, embeddings_df)

        if similar_profiles_df is not None and not similar_profiles_df.empty:
            # Step 3: Use the similar profiles to generate predictions.
            # Each task names only the quantity to forecast: the prompt built by
            # predict_with_gpt4 already begins with "Forecast", so a task such as
            # "forecast credit line" would read "Forecast forecast credit line".
            tasks = [
                "credit line",
                "credit risk",
                "credit score",
                "response to marketing campaign",
                "market opportunity"
            ]

            for task in tasks:
                # Section header, e.g. "Forecast credit line:"
                print(f"\nForecast {task}:")
                prediction = predict_with_gpt4(similar_profiles_df, task)
                if prediction:
                    print(prediction)
        else:
            print("No similar profiles found or similarity calculation failed.")
    else:
        print("Failed to generate embedding.")
