In [102]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from openai import OpenAI
import openai
import numpy as np
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

load_dotenv()

print(os.environ.get("OPENAI_API_KEY"))

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


In [96]:
# Étape 1 - Chargement des données
def fetch_data(path):
    try:
        return pd.read_json(path, lines=True)["description"].tolist()
    except:
        return []
    
data = fetch_data('meta.jsonl')

data

[["JUST LOOK, You can tell the difference. Make everyday more convenient, it is slim but has big rooms. If you are looking for a rich and luxurious appearance, look no further. These double shoulders are the perfect leather for creating attractive finished belts, straps and wallets. It doesn't only show the perfect weight for accessories where rugged durability is needed but also has a natural finish and coarse grain."],
 [],
 [],
 ['Case does not need to be removed for charging. Camera opening allows unobstructed use of camera and flash.',
  'DandyCase',
  'proudly presents the premium',
  '"PERFECT PATTERN"',
  "from the line of stylish cases that will make your friends jealous! Stand out from the rest and show off your Apple iPhone 6 Plus with these one of a kind cases. These cases are made of a durable, yet slightly flexible, TPU material that won't stretch or tear and the patterns wont chip, fade, or peel. DandyCase PERFECT PATTERN cases come with a lifetime warranty against damag

In [97]:
# Étape 2 - Prétraitement et segmentation des textes

# Nettoyer les données
def clean_data(data):
    flattened_data = [item for sublist in data if isinstance(sublist, list) for item in sublist]
    return [d.strip() for d in flattened_data if isinstance(d, str) and d.strip()]

# clean_data = clean_data(data)

# Diviser en segments
def split_data(data, chunk_size=512, chunk_overlap=128):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return [chunk for desc in data for chunk in splitter.split_text(desc)]

# split_data(clean_data)

In [98]:
# Étape 3 - Création d'un index vectoriel (embeddings)
def embed(chunks, model_name):
    model = SentenceTransformer(model_name)
    return np.array([model.encode(chunk) for chunk in chunks])

In [99]:
# Étape 4 - Création d’une base de données vectorielle
def build_db(chunks, vectors, name):
    client = chromadb.Client()
    db = client.get_or_create_collection(name)
    for idx, (text, vec) in enumerate(zip(chunks, vectors)):
        db.add(ids=[f"id_{idx}"], embeddings=[vec], documents=[text])
    return db

In [100]:
# Étape 4 - Création d’un système de récupération (retrieval system)
# Recherche et réponse
def query_response(db, query, model_name):
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
    )
    model = SentenceTransformer(model_name)
    result = db.query(query_embeddings=[model.encode(query)], n_results=3)
    print("Query result:", result) 

    docs = result["documents"]
    print("Docs content:", docs)
    print("Docs types:", [type(doc) for doc in docs])

    if not docs:
        return "No relevant information."
    
    flattened_docs = [item for sublist in docs for item in sublist] if all(isinstance(doc, list) for doc in docs) else docs

    prompt = f"Use the following to answer: {' '.join(flattened_docs)}. Question: {query}"
    print("Prompt:", prompt)
    
    reply = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}])
    return reply["choices"][0]["message"]["content"]

In [101]:
# Main
if __name__ == "__main__":
    print(os.environ.get("OPENAI_API_KEY"))
    data = fetch_data('meta.jsonl')
    if not data:
        exit("Failed to load data.")
    cleaned = clean_data(data)
    if not cleaned:
        exit("No valid data.")
    segments = split_data(cleaned)
    if not segments:
        exit("No segments created.")
    embeddings = embed(segments, 'all-MiniLM-L6-v2')
    db = build_db(segments, embeddings, "db_collection")
    print(query_response(db, "Describe a smartphone", 'all-MiniLM-L6-v2'))

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


Add of existing embedding ID: id_0
Insert of existing embedding ID: id_0
Add of existing embedding ID: id_1
Insert of existing embedding ID: id_1
Add of existing embedding ID: id_2
Insert of existing embedding ID: id_2
Add of existing embedding ID: id_3
Insert of existing embedding ID: id_3
Add of existing embedding ID: id_4
Insert of existing embedding ID: id_4
Add of existing embedding ID: id_5
Insert of existing embedding ID: id_5
Add of existing embedding ID: id_6
Insert of existing embedding ID: id_6
Add of existing embedding ID: id_7
Insert of existing embedding ID: id_7
Add of existing embedding ID: id_8
Insert of existing embedding ID: id_8
Add of existing embedding ID: id_9
Insert of existing embedding ID: id_9
Add of existing embedding ID: id_10
Insert of existing embedding ID: id_10
Add of existing embedding ID: id_11
Insert of existing embedding ID: id_11
Add of existing embedding ID: id_12
Insert of existing embedding ID: id_12
Add of existing embedding ID: id_13
Insert of

Query result: {'ids': [['id_320', 'id_2026', 'id_1381']], 'embeddings': None, 'documents': [['— 100% handmade to add personality and style to your phone — Slim and ultra thin design adding no bulky to your device — Front windows to see important information at a glance', 'Compatible Phone Model:', 'Q1: What phone model is it compatible with?']], 'uris': None, 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.7285916805267334, 0.7936947345733643, 0.8037242293357849]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
Docs content: [['— 100% handmade to add personality and style to your phone — Slim and ultra thin design adding no bulky to your device — Front windows to see important information at a glance', 'Compatible Phone Model:', 'Q1: What phone model is it compatible with?']]
Docs types: [<class 'list'>]
Prompt: Use the following to answer: — 100% handmade to add personality and style to 

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************mfAA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}