## Runtime and Quality Benchmark - Cohere vs OpenAI

Below are a couple of tests comparing the performance of the Cohere and OpenAI embedding models. Main results:
 - Cohere offers both English-only and multilingual models, whereas OpenAI only offers multilingual models. The Cohere English-only model is comparable in runtime with the OpenAI default model (text-ada-002). The Cohere multilingual model is roughly 2 times faster than the OpenAI default model.
 - For English, the Cohere embeddings appear to yield better results than the OpenAI embeddings, in the sense that the cosine similarity delta between similar and dissimilar sentences is significantly higher. That is, take three sentences `s1`, `s2`, `s3`, where `s1` is similar to `s2` but dissimilar to `s3`. The delta between the cosine similarities for (`s1`, `s2`) and (`s1`, `s3`) is significantly higher for the Cohere embeddings than the same delta for the OpenAI embeddings.
 - For languages other than English, the quality appears to be comparable.

In [None]:
import time
from aiohttp import ClientSession

from embed.openai import OpenAI
from embed.cohere import Cohere


async def test_cohere(test_data):
    """Embed ``test_data`` through the Cohere client and return the result.

    A new ``ClientSession`` is opened for this single call and is closed
    again when the ``async with`` block exits.

    Args:
        test_data: Input forwarded unchanged to ``Cohere.embed``.

    Returns:
        Whatever ``Cohere.embed`` returns for ``test_data``.
    """
    async with ClientSession() as http_session:
        client = Cohere(
            # NOTE(review): "mykey" looks like a placeholder API key - confirm.
            key="mykey",
            session=http_session,
            # Only set to use the multilingual model; the default is English-only.
            model="embed-multilingual-v2.0",
        )
        return await client.embed(test_data)

async def test_openai(test_data):
    """Embed ``test_data`` through the OpenAI client and return the result.

    A new ``ClientSession`` is opened for this single call and is closed
    again when the ``async with`` block exits.

    Args:
        test_data: Input forwarded unchanged to ``OpenAI.embed``.

    Returns:
        Whatever ``OpenAI.embed`` returns for ``test_data``.
    """
    async with ClientSession() as http_session:
        client = OpenAI(
            # NOTE(review): "mykey" looks like a placeholder API key - confirm.
            key="mykey",
            session=http_session,
            model="text-similarity-ada-001",
        )
        return await client.embed(test_data)

async def timeit(func, args, runs: int):
    """Await ``func(args)`` ``runs`` times in sequence and print the mean runtime.

    Each run is timed with ``time.perf_counter``; the average over all runs,
    rounded to two decimals, is printed as ``Avg runtime: <seconds>``.

    Args:
        func: Async callable that accepts a single positional argument.
        args: The one argument passed to ``func`` on every run (it is passed
            as-is, not unpacked).
        runs: Number of timed iterations; must be at least 1.

    Raises:
        ValueError: If ``runs`` is smaller than 1, since no average can be
            computed from zero measurements.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (runs == 0)
    # or a meaningless "-0.0" average (runs < 0).
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    runtimes = []
    for _ in range(runs):
        start_time = time.perf_counter()
        await func(args)
        runtimes.append(time.perf_counter() - start_time)
    print("Avg runtime:", round(sum(runtimes) / len(runtimes), 2))


In [None]:
test_data = ["Hello World"] * 1000

await timeit(func=test_cohere, args=test_data, runs=10)
await timeit(func=test_openai, args=test_data, runs=10)

## Cross-lingual test cases

In [None]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(x, y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    Computed as the dot product of the two vectors divided by the product of
    their norms (``numpy.linalg.norm`` with default arguments).
    """
    numerator = dot(x, y)
    denominator = norm(x) * norm(y)
    return numerator / denominator


test_sentences = [
    "How are you doing?",
    "Hoe gaat het?",
    "Comment vas-tu?",
    "The grass is green",
    "Un gazon vert est joli"
]

cohere_embeddings = await test_cohere(test_sentences)
openai_embeddings = await test_openai(test_sentences)

cohere_emb_zipped = dict(zip(test_sentences, cohere_embeddings))
openai_emb_zipped = dict(zip(test_sentences, openai_embeddings))

cohere_cos_sim_1 = cosine_similarity(cohere_emb_zipped["How are you doing?"], cohere_emb_zipped["Hoe gaat het?"])
cohere_cos_sim_2 = cosine_similarity(cohere_emb_zipped["How are you doing?"], cohere_emb_zipped["Comment vas-tu?"])
cohere_cos_sim_3 = cosine_similarity(cohere_emb_zipped["Hoe gaat het?"], cohere_emb_zipped["Comment vas-tu?"])
cohere_cos_sim_4 = cosine_similarity(cohere_emb_zipped["How are you doing?"], cohere_emb_zipped["Un gazon vert est joli"])
cohere_cos_sim_5 = cosine_similarity(cohere_emb_zipped["How are you doing?"], cohere_emb_zipped["The grass is green"])

openai_cos_sim_1 = cosine_similarity(openai_emb_zipped["How are you doing?"], openai_emb_zipped["Hoe gaat het?"])
openai_cos_sim_2 = cosine_similarity(openai_emb_zipped["How are you doing?"], openai_emb_zipped["Comment vas-tu?"])
openai_cos_sim_3 = cosine_similarity(openai_emb_zipped["Hoe gaat het?"], openai_emb_zipped["Comment vas-tu?"])
openai_cos_sim_4 = cosine_similarity(openai_emb_zipped["How are you doing?"], openai_emb_zipped["Un gazon vert est joli"])
openai_cos_sim_5 = cosine_similarity(openai_emb_zipped["How are you doing?"], openai_emb_zipped["The grass is green"])

print(round(cohere_cos_sim_1, 3))
print(round(cohere_cos_sim_2, 3))
print(round(cohere_cos_sim_3, 3))
print(round(cohere_cos_sim_4, 3))
print(round(cohere_cos_sim_5, 3))
print("-----")
print(round(openai_cos_sim_1, 3))
print(round(openai_cos_sim_2, 3))
print(round(openai_cos_sim_3, 3))
print(round(openai_cos_sim_4, 3))
print(round(openai_cos_sim_5, 3))

## Same-language test cases

In [None]:
test_sentences = [
    "Washington, DC, is the capital of the USA",
    "Madrid is the capital of Spain",
    "The Labour Unions are in favor of labor rights"
]

cohere_embeddings_sl = await test_cohere(test_sentences)
openai_embeddings_sl = await test_openai(test_sentences)

cohere_emb_zipped_sl = dict(zip(test_sentences, cohere_embeddings_sl))
openai_emb_zipped_sl = dict(zip(test_sentences, openai_embeddings_sl))

cohere_cos_sim_sl_1 = cosine_similarity(cohere_emb_zipped_sl["Washington, DC, is the capital of the USA"], cohere_emb_zipped_sl["Madrid is the capital of Spain"])
cohere_cos_sim_sl_3 = cosine_similarity(cohere_emb_zipped_sl["Washington, DC, is the capital of the USA"], cohere_emb_zipped_sl["The Labour Unions are in favor of labor rights"])

openai_cos_sim_sl_1 = cosine_similarity(openai_emb_zipped_sl["Washington, DC, is the capital of the USA"], openai_emb_zipped_sl["Madrid is the capital of Spain"])
openai_cos_sim_sl_3 = cosine_similarity(openai_emb_zipped_sl["Washington, DC, is the capital of the USA"], openai_emb_zipped_sl["The Labour Unions are in favor of labor rights"])

print(round(cohere_cos_sim_sl_1, 3))
print(round(cohere_cos_sim_sl_3, 3))
print("-----")
print(round(openai_cos_sim_sl_1, 3))
print(round(openai_cos_sim_sl_3, 3))
