Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions operate/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dotenv import load_dotenv
from openai import OpenAI
from prompt_toolkit.shortcuts import input_dialog
import google.generativeai as genai


class Config:
Expand All @@ -18,15 +19,19 @@ class Config:
def __init__(self):
load_dotenv()
self.verbose = False
self.openai_api_key = os.getenv("OPENAI_API_KEY", "")
self.google_api_key = os.getenv("GOOGLE_API_KEY", "")

def initialize_openai(self):
client = OpenAI()
client.api_key = self.openai_api_key
client.api_key = os.getenv("OPENAI_API_KEY")
client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
return client

def initialize_google(self):
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"), transport="rest")
model = genai.GenerativeModel("gemini-pro-vision")

return model

def validation(self, model, voice_mode):
"""
Validate the input parameters for the dialog operation.
Expand All @@ -39,7 +44,13 @@ def validation(self, model, voice_mode):
)

def require_api_key(self, key_name, key_description, is_required):
if is_required and not getattr(self, key_name.lower()):
key_exists = bool(os.environ.get(key_name))
if self.verbose:
print("[Config] require_api_key")
print("[Config] key_name", key_name)
print("[Config] key_description", key_description)
print("[Config] key_exists", key_exists)
if is_required and not key_exists:
self.prompt_and_save_api_key(key_name, key_description)

def prompt_and_save_api_key(self, key_name, key_description):
Expand All @@ -55,11 +66,6 @@ def prompt_and_save_api_key(self, key_name, key_description):
load_dotenv() # Reload environment variables
# Update the instance attribute with the new key

if key_value:
self.save_api_key_to_env(key_name, key_value)
load_dotenv() # Reload environment variables
setattr(self, key_name.lower(), key_value)

@staticmethod
def save_api_key_to_env(key_name, key_value):
with open(".env", "a") as file:
Expand Down
26 changes: 19 additions & 7 deletions operate/models/apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from PIL import Image
from ultralytics import YOLO
import google.generativeai as genai

from operate.config import Config
from operate.exceptions import ModelNotRecognizedException
from operate.utils.screenshot import (
Expand All @@ -35,9 +35,13 @@

# Load configuration
VERBOSE = Config().verbose
config = Config()


async def get_next_action(model, messages, objective, session_id):
if VERBOSE:
print("[Self-Operating Computer][get_next_action]")
print("[Self-Operating Computer][get_next_action] model", model)
if model == "gpt-4":
return call_gpt_4_vision_preview(messages), None
if model == "gpt-4-with-som":
Expand All @@ -52,11 +56,10 @@ async def get_next_action(model, messages, objective, session_id):


def call_gpt_4_vision_preview(messages):
config = Config()
client = config.initialize_openai()
if VERBOSE:
print("[Self Operating Computer][get_next_action][call_gpt_4_v]")
time.sleep(1)
client = config.initialize_openai()
try:
screenshots_dir = "screenshots"
if not os.path.exists(screenshots_dir):
Expand Down Expand Up @@ -137,7 +140,10 @@ def call_gemini_pro_vision(messages, objective):
"""
Get the next action for Self-Operating Computer using Gemini Pro Vision
"""
config = Config()
if VERBOSE:
print(
"[Self Operating Computer][call_gemini_pro_vision]",
)
# sleep for a second
time.sleep(1)
try:
Expand All @@ -152,11 +158,18 @@ def call_gemini_pro_vision(messages, objective):
time.sleep(1)
prompt = get_system_prompt(objective)

model = genai.GenerativeModel("gemini-pro-vision")
model = config.initialize_google()
if VERBOSE:
print("[Self Operating Computer][call_gemini_pro_vision] model", model)

response = model.generate_content([prompt, Image.open(screenshot_filename)])

content = response.text[1:]
if VERBOSE:
print(
"[Self Operating Computer][call_gemini_pro_vision] response", response
)
print("[Self Operating Computer][call_gemini_pro_vision] content", content)

content = json.loads(content)
if VERBOSE:
Expand All @@ -176,9 +189,8 @@ def call_gemini_pro_vision(messages, objective):


async def call_gpt_4_vision_preview_labeled(messages, objective):
config = Config()
client = config.initialize_openai()
time.sleep(1)
client = config.initialize_openai()
try:
yolo_model = YOLO("./operate/models/weights/best.pt") # Load your trained model
screenshots_dir = "screenshots"
Expand Down
2 changes: 1 addition & 1 deletion operate/operate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def main(model, terminal_prompt, voice_mode=False):
Returns:
None
"""

mic = None
# Initialize `WhisperMic`, if `voice_mode` is True

Expand Down Expand Up @@ -109,7 +110,6 @@ def main(model, terminal_prompt, voice_mode=False):

while True:
if VERBOSE:
print("[Self Operating Computer]")
print("[Self Operating Computer] loop_count", loop_count)
try:
operations, session_id = asyncio.run(
Expand Down