In [1]:
import psycopg2
from sentence_transformers import SentenceTransformer
import time
import json

# Transcript file to index. NOTE(review): the backslash separators make this
# path Windows-specific, and it is resolved relative to the current working
# directory.
conversation_file_path = "..\\data\\022_00000017.txt"

# Load the local embedding model once (no API calls needed); it is the model
# used by calculate_embeddings() below.
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_dimension = 384  # This model produces 384-dimensional vectors (value is not referenced elsewhere in the visible code)

# Connection string handed to psycopg2.connect().
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration outside the source file.
db_connection_str = "dbname=postgres user=postgres password=root host=localhost port=5433"

def _filter_transcript_lines(text: str):
    """Return the stripped, non-empty lines of *text* that do not start with ``<``."""
    # Strip first, then test: an indented markup line such as "  <tag>" must
    # be dropped just like one that starts in column 0.
    stripped_lines = (line.strip() for line in text.split("\n"))
    return [line for line in stripped_lines if line and not line.startswith("<")]


def create_conversation_list(file_path: str):
    """Read a transcript file and return its usable conversation lines.

    The file is decoded as UTF-8 first and, if that fails, as Latin-1.
    Every line is stripped of surrounding whitespace; blank lines and markup
    lines (first non-blank character is ``<``) are discarded.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The kept lines as a list of strings, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Latin-1 maps every possible byte to a character, so it never raises
    # UnicodeDecodeError. Any encoding listed after it could never be tried,
    # which is why the list stops here.
    encodings_to_try = ('utf-8', 'latin-1')

    for encoding in encodings_to_try:
        try:
            with open(file_path, "r", encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            continue

        filtered_list = _filter_transcript_lines(text)
        print(f"Successfully read {len(filtered_list)} lines with encoding: {encoding}")
        return filtered_list

    # Safety net: only reachable if the encoding list above is edited so that
    # it no longer ends with an encoding that accepts every byte.
    with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
        text = file.read()

    filtered_list = _filter_transcript_lines(text)
    print(f"Read {len(filtered_list)} lines with error handling")
    return filtered_list

def calculate_embeddings(corpus: str):
    """Encode *corpus* with the module-level ``model`` and return the result as a list."""
    # Runs locally through the loaded model - no API calls.
    vector = model.encode(corpus)
    return vector.tolist()

def save_embedding(corpus: str, embedding: list, cursor):
    """Insert one ``(corpus, embedding)`` row into ``embeddings_float`` through *cursor*.

    The embedding is passed as a plain list so it lands in the FLOAT array
    column rather than a VECTOR column.
    """
    insert_sql = 'INSERT INTO embeddings_float (corpus, embedding) VALUES (%s, %s)'
    row_values = (corpus, embedding)
    cursor.execute(insert_sql, row_values)

def cosine_similarity(a, b):
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector (any 1-D sequence of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        float: The cosine of the angle between ``a`` and ``b``. If either
        vector has zero length (norm 0) the angle is undefined and 0.0 is
        returned instead of NaN, so results stay sortable.
    """
    import numpy as np

    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying runtime warning).
        return 0.0
    return float(np.dot(a, b) / denominator)

def similar_corpus_float(input_corpus: str, db_connection_str: str, top_k: int = 3):
    """Return the ``top_k`` stored rows closest to *input_corpus* by cosine distance.

    All rows of ``embeddings_float`` are fetched and scored in Python.

    Args:
        input_corpus: Query text to embed and compare against stored rows.
        db_connection_str: Connection string passed to ``psycopg2.connect``.
        top_k: Maximum number of rows to return.

    Returns:
        A list of ``(id, corpus, embedding, distance)`` tuples sorted by
        ascending distance, where ``distance = 1 - cosine_similarity``
        (lower means more similar).
    """
    input_embedding = calculate_embeddings(input_corpus)

    conn = psycopg2.connect(db_connection_str)
    try:
        with conn.cursor() as cursor:
            # Get all embeddings from the database
            cursor.execute("SELECT ID, corpus, embedding FROM embeddings_float")
            all_results = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    # Convert similarity (higher = more similar) to distance (lower = more similar).
    scored_rows = [
        (row_id, corpus, db_embedding, 1 - cosine_similarity(input_embedding, db_embedding))
        for row_id, corpus, db_embedding in all_results
    ]

    # Sort by distance (ascending)
    scored_rows.sort(key=lambda row: row[3])
    return scored_rows[:top_k]

def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``."""
    import numpy as np

    difference = np.array(a) - np.array(b)
    return np.linalg.norm(difference)

def similar_corpus_euclidean(input_corpus: str, db_connection_str: str, top_k: int = 3):
    """Return the ``top_k`` stored rows closest to *input_corpus* by Euclidean distance.

    All rows of ``embeddings_float`` are fetched and scored in Python.

    Args:
        input_corpus: Query text to embed and compare against stored rows.
        db_connection_str: Connection string passed to ``psycopg2.connect``.
        top_k: Maximum number of rows to return.

    Returns:
        A list of ``(id, corpus, embedding, distance)`` tuples sorted by
        ascending distance (lower means more similar).
    """
    input_embedding = calculate_embeddings(input_corpus)

    conn = psycopg2.connect(db_connection_str)
    try:
        with conn.cursor() as cursor:
            # Get all embeddings from the database
            cursor.execute("SELECT ID, corpus, embedding FROM embeddings_float")
            all_results = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    # Calculate Euclidean distance for each entry
    scored_rows = [
        (row_id, corpus, db_embedding, euclidean_distance(input_embedding, db_embedding))
        for row_id, corpus, db_embedding in all_results
    ]

    # Sort by distance (ascending)
    scored_rows.sort(key=lambda row: row[3])
    return scored_rows[:top_k]

# Script entry: rebuild the embeddings_float table from the transcript file,
# then run one sample query with each distance measure and print the results.
try:
    # Read and embed everything before touching the database, so a file or
    # model failure leaves the existing table untouched and the transaction
    # below stays short.
    corpus_list = create_conversation_list(file_path=conversation_file_path)

    print(f"Processing {len(corpus_list)} documents...")

    embedded_corpus = []
    for i, corpus in enumerate(corpus_list):
        print(f"Processing {i+1}/{len(corpus_list)}: {corpus[:50]}...")
        embedded_corpus.append((corpus, calculate_embeddings(corpus=corpus)))

    conn = psycopg2.connect(db_connection_str)
    try:
        # ``with conn`` scopes ONE transaction: commit on success, rollback on
        # error. It does not close the connection, hence the finally below.
        with conn:
            with conn.cursor() as cur:
                # Create table with FLOAT array instead of VECTOR
                cur.execute("DROP TABLE IF EXISTS embeddings_float")

                cur.execute("""CREATE TABLE IF NOT EXISTS embeddings_float (
                        ID SERIAL PRIMARY KEY, 
                        corpus TEXT,
                        embedding FLOAT[]  -- Using FLOAT array instead of VECTOR
                        )""")

                for corpus, embedding in embedded_corpus:
                    save_embedding(corpus=corpus, embedding=embedding, cursor=cur)
    finally:
        conn.close()

    print("✅ All embeddings saved successfully in FLOAT array table!")

    # Test with different similarity methods
    test_query = "services bancaires"
    print(f"\nSearching for: '{test_query}'")

    print("\n=== Using Cosine Similarity ===")
    results_cosine = similar_corpus_float(test_query, db_connection_str, top_k=3)

    print("Resultats trouves (Cosine Similarity):")
    # ``row_id`` rather than ``id``: a top-level loop variable named ``id``
    # would overwrite the builtin for the rest of the session.
    for row_id, corpus, _, distance in results_cosine:
        similarity = 1 - distance  # Convert back to similarity score
        print(f"ID: {row_id}, Distance: {distance:.4f}, Similarity: {similarity:.4f}")
        print(f"Corpus: {corpus}")
        print("---")

    print("\n=== Using Euclidean Distance ===")
    results_euclidean = similar_corpus_euclidean(test_query, db_connection_str, top_k=3)

    print("Resultats trouves (Euclidean Distance):")
    for row_id, corpus, _, distance in results_euclidean:
        print(f"ID: {row_id}, Distance: {distance:.4f}")
        print(f"Corpus: {corpus}")
        print("---")

except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print(f"Error: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Successfully read 23 lines with encoding: latin-1
Processing 23 documents...
Processing 1/23: h: U B S bonjour...
Processing 2/23: c: oui bonjour est-ce que je pourrais avoir monsie...
Processing 3/23: h: c'est lui-même...
Processing 4/23: c: ah bonjour j'essayais de vous appeler e j'ai eu...
Processing 5/23: h: où ça...
Processing 6/23: c: à l'accueil...
Processing 7/23: h: oui...
Processing 8/23: c: donc voilà et c'est parce que moi je e c'est po...
Processing 9/23: h: ouais e...
Processing 10/23: c: en première année...
Processing 11/23: c: et e elle m'a donné un numéro de code...
Processing 12/23: h: ouais...
Processing 13/23: c: et quand je fais ce numéro de code on me dit co...
Processing 14/23: h: e attendez attendez parce que là je suis pas à ...
Processing 15/23: c: e oui 0 1 26 0 4...
Processing 16/23: h: 0 1 26 0 4 ouais...
Processing 17/23: c: et dans combien de temps...
Processing 18/23: h: e ben dans deux minutes trois minutes là...
Processing 19/23: c: deux minutes trois