In [None]:
!conda install -c conda-forge streamlit -y
!conda install -c conda-forge sentence-transformers -y
!conda install -c conda-forge chromadb -y
!conda install numpy -y
!conda install pytorch-gpu torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
!conda install conda-forge::tqdm -y

In [6]:
!nvidia-smi

Wed Jan  8 16:39:06 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                On  | 00000000:01:00.0 Off |                  Off |
| 33%   47C    P8              19W / 260W |    167MiB / 49152MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
import torch

# Check if CUDA is available
print("CUDA available:", torch.cuda.is_available())

# Print the name of the GPU if available
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
GPU: Quadro RTX 8000


In [2]:
import torch
import torchvision

print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)

Torch version: 2.3.1.post300
Torchvision version: 0.18.1a0


In [6]:
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from chromadb import PersistentClient
from chromadb.utils import embedding_functions
import os
import numpy as np
from tqdm import tqdm  # For progress tracking


# Initialize SentenceTransformer model
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device="cuda")

db_directory = "./chroma_db"
client = PersistentClient(path=db_directory)
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

collection_name = "password_embeddings"
collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_fn
)


In [2]:
def store_passwords(passwords, batch_size=100):
    """
    Stores embeddings of passwords into ChromaDB vector store in batches.
    :param passwords: List of passwords.
    :param batch_size: Number of passwords to process in one batch.
    """
    # Prepare batches
    for i in tqdm(range(0, len(passwords), batch_size), desc="Processing batches"):
        batch = passwords[i:i + batch_size]
        
        # Generate embeddings for the batch
        embeddings = model.encode(batch, batch_size=batch_size, convert_to_numpy=True)
        
        # Prepare data for the vector store
        ids = [str(i + j) for j in range(len(batch))]
        collection.add(ids=ids, documents=batch, embeddings=embeddings.tolist())
    
    print(f"All {len(passwords)} passwords added to vector store successfully.")


In [5]:
def find_similar_passwords(target_data, n_results):
    """
    Finds and sorts passwords in similarity order to the target data.
    :param target_data: List of strings containing personal data.
    :return: List of sorted passwords by similarity.
    """
    # Generate embeddings for the target data
    target_embeddings = model.encode(target_data).tolist()
    # Query vector store
    query_results = collection.query(query_embeddings=target_embeddings, n_results=n_results)

    # Combine results and sort
    all_results = zip(query_results["documents"], query_results["distances"])
    sorted_results = sorted(all_results, key=lambda x: x[1])  # Lower distance is more similar

    return [doc for doc, _ in sorted_results]

In [5]:
def load_all_passwords(directory_name="dictionaries"):
    """
    Loads all passwords from .dic files in the specified directory.
    :param directory_name: Directory where .dic files are stored.
    :return: A list of all passwords from all .dic files.
    """
    if not os.path.exists(directory_name):
        raise FileNotFoundError(f"The directory '{directory_name}' does not exist.")

    all_passwords = []
    for file_name in os.listdir(directory_name):
        if file_name.endswith(".dic"):
            file_path = os.path.join(directory_name, file_name)
            with open(file_path, "r") as file:
                passwords = file.read().splitlines()
                all_passwords.extend(passwords)
    return all_passwords

In [17]:
# Example Usage
if __name__ == "__main__":
    import os

    # Check if the vector store directory exists
    if not os.path.exists("./chroma_db"):
        print("First-time setup: Adding common passwords to vector store.")
        passwords = load_all_passwords()
        store_passwords(passwords)

    # User inputs target data
    print("Enter target data (e.g., first name, last name, birthday). Type 'done' to finish.")
    target_data = []
    while True:
        data = input("Data: ").strip()
        if data.lower() == 'done':
            break
        target_data.append(data)

    # Find similar passwords
    similar_passwords = find_similar_passwords(target_data, 348000)
    print("\nSimilar passwords sorted by similarity:")
    for pwd in similar_passwords:
        print(pwd)

Enter target data (e.g., first name, last name, birthday). Type 'done' to finish.


Data:  dave
Data:  done


OperationalError: too many SQL variables

In [9]:
num_records = collection.count()
print("Number of records:", num_records)

Number of records: 348499


In [None]:
num_records = chroma_collection.count()
print("Number of records:", num_records)

In [31]:
target_embeddings = model.encode(["dave"]).tolist()
# Query vector store
query_results = collection.query(query_embeddings=target_embeddings, n_results=5)

# Combine results and sort
all_results = zip(query_results["documents"], query_results["distances"])
for i in sorted_results:
    sorted_results = list(filter(lambda x: x[1] > 0.55, )) 

sorted_results = sorted(all_results, key=lambda x: x[1])

TypeError: '>' not supported between instances of 'str' and 'float'

In [28]:
sorted_results

[(['dave13', 'dave15', 'daveman66', 'dave31', 'dave17'],
  [0.5031169056892395,
   0.5753949880599976,
   0.5774793028831482,
   0.5857124328613281,
   0.6067200303077698])]