# Ant Colony Optimization (ACO) for Legal Document Retrieval

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example legal text corpus
legal_documents = [
    "Contract law covers obligations and duties under a contract.",
    "Tort law deals with civil wrongs, leading to liability and damages.",
    "Intellectual property law governs ownership and use of inventions.",
    "Criminal law deals with crimes and their legal punishment.",
    "Constitutional law provides the framework for laws and government powers.",
    "Administrative law deals with the activities of administrative agencies.",
    "Family law deals with legal matters related to family relationships.",
    "Environmental law governs the protection of the environment.",
    "Labor law governs the rights and duties of workers and employers.",
    "Tax law deals with the legal framework for taxation and tax disputes."
]

# Step 1: Calculate the similarity matrix using TF-IDF and Cosine Similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(legal_documents)

# Cosine similarity matrix (used as the inverse of distance matrix)
similarity_matrix = cosine_similarity(tfidf_matrix)

# To adapt to ACO (where high similarity means shorter distance), invert the matrix
distance_matrix = 1 - similarity_matrix
np.fill_diagonal(distance_matrix, np.inf)  # No self-loops

# Parameters for ACO
num_ants = 5        # Number of ants
num_iterations = 50  # Number of iterations
alpha = 1           # Pheromone importance
beta = 2            # Distance importance
evaporation_rate = 0.5  # Evaporation rate of pheromones
pheromone_deposit = 100  # Amount of pheromone deposited by each ant

# Initialize pheromone matrix
pheromone_matrix = np.ones(distance_matrix.shape) / len(distance_matrix)

# Function to choose next node
def choose_next_node(current_node, unvisited, pheromone_matrix, distance_matrix, alpha, beta):
    pheromone = pheromone_matrix[current_node, unvisited] ** alpha
    heuristic = (1 / distance_matrix[current_node, unvisited]) ** beta
    probabilities = pheromone * heuristic
    probabilities /= probabilities.sum()

    return np.random.choice(unvisited, p=probabilities)

# Function to update pheromones
def update_pheromones(pheromone_matrix, paths, path_lengths, evaporation_rate, pheromone_deposit):
    pheromone_matrix *= (1 - evaporation_rate)
    for path, path_length in zip(paths, path_lengths):
        for i in range(len(path) - 1):
            pheromone_matrix[path[i], path[i+1]] += pheromone_deposit / path_length

# Function to calculate path length
def calculate_path_length(path, distance_matrix):
    length = 0
    for i in range(len(path) - 1):
        length += distance_matrix[path[i], path[i+1]]
    return length

# ACO algorithm for text-based legal reference optimization
def ant_colony_optimization(num_iterations, num_ants, distance_matrix, alpha, beta, evaporation_rate, pheromone_deposit):
    num_nodes = distance_matrix.shape[0]
    best_path = None
    best_length = np.inf

    for iteration in range(num_iterations):
        paths = []
        path_lengths = []

        for ant in range(num_ants):
            unvisited = list(range(num_nodes))
            current_node = np.random.choice(unvisited)
            path = [current_node]
            unvisited.remove(current_node)

            while unvisited:
                next_node = choose_next_node(current_node, unvisited, pheromone_matrix, distance_matrix, alpha, beta)
                path.append(next_node)
                unvisited.remove(next_node)
                current_node = next_node

            path_length = calculate_path_length(path, distance_matrix)
            paths.append(path)
            path_lengths.append(path_length)

            if path_length < best_length:
                best_path = path
                best_length = path_length

        # Update pheromones
        update_pheromones(pheromone_matrix, paths, path_lengths, evaporation_rate, pheromone_deposit)

        print(f"Iteration {iteration+1}: Best path so far: {best_path} with length: {best_length}")

    return best_path, best_length

# Run the ACO algorithm
best_path, best_length = ant_colony_optimization(num_iterations, num_ants, distance_matrix, alpha, beta, evaporation_rate, pheromone_deposit)

print(f"Best path through legal documents: {best_path}")
print(f"Best path length (in terms of similarity): {best_length}")

for i in best_path:
    print(legal_documents[i])


Iteration 1: Best path so far: [7, 4, 8, 0, 3, 5, 9, 6, 1, 2] with length: 7.8426581814970415
Iteration 2: Best path so far: [7, 4, 8, 0, 3, 5, 9, 6, 1, 2] with length: 7.8426581814970415
Iteration 3: Best path so far: [7, 4, 8, 0, 3, 5, 9, 6, 1, 2] with length: 7.8426581814970415
Iteration 4: Best path so far: [3, 9, 4, 1, 5, 6, 0, 7, 8, 2] with length: 7.686849448615272
Iteration 5: Best path so far: [3, 9, 4, 1, 5, 6, 0, 7, 8, 2] with length: 7.686849448615272
Iteration 6: Best path so far: [0, 5, 6, 1, 3, 8, 2, 7, 4, 9] with length: 7.6118867686898755
Iteration 7: Best path so far: [0, 5, 6, 1, 3, 8, 2, 7, 4, 9] with length: 7.6118867686898755
Iteration 8: Best path so far: [0, 5, 6, 1, 3, 8, 2, 7, 4, 9] with length: 7.6118867686898755
Iteration 9: Best path so far: [0, 5, 6, 1, 3, 8, 2, 7, 4, 9] with length: 7.6118867686898755
Iteration 10: Best path so far: [0, 5, 6, 1, 3, 8, 2, 7, 4, 9] with length: 7.6118867686898755
Iteration 11: Best path so far: [0, 5, 6, 1, 3, 8, 2, 7, 4, 9

In [46]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Step 1: Legal document corpus
legal_documents = [
    "Contract law is a legally binding agreement between parties outlining obligations and rights. Breach can lead to damages.",
    "Tort law deals with civil wrongs causing harm or loss. Negligence, defamation, and trespass are common torts.",
    "Criminal law governs harmful conduct to society, such as felonies, misdemeanors, and infractions. Penalties include imprisonment and fines.",
    "Constitutional law sets up government frameworks, individual rights, and separation of powers.",
    "Environmental law regulates human impact on the environment, including pollution and resource conservation.",
    "Employment law covers employer-employee relations, including wages, discrimination, and wrongful termination.",
    "Family law governs marriage, divorce, custody, and adoption, including property division and child support.",
    "Tax law regulates income, corporate, and property taxes, including compliance with tax authorities.",
    "Real estate law governs property ownership, zoning, mortgages, and leases. Legal compliance is necessary during transfers.",
]

# Step 2: User's issue as input query
user_issue = "Somebody breached our contract and caused financial loss. What legal actions can we take?"

# Step 3: Fine-tuned TF-IDF Vectorizer for document-query similarity
# Creating a combined vectorizer for documents and query
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, stop_words='english')
# Fit and transform on both documents and query
combined_corpus = legal_documents + [user_issue]
combined_tfidf_matrix = vectorizer.fit_transform(combined_corpus)

# Separate TF-IDF matrices for documents and query
document_tfidf_matrix = combined_tfidf_matrix[:-1]  # All but the last element (documents)
query_tfidf_matrix = combined_tfidf_matrix[-1]  # The last element (query)

# Calculate cosine similarity between each document and the query
similarity_scores = cosine_similarity(document_tfidf_matrix, query_tfidf_matrix).flatten()

# Step 4: Create a distance matrix for ACO (1 - similarity scores)
distance_matrix = 1 - similarity_scores
num_documents = len(legal_documents)

# ACO Parameters 
num_ants = 5
num_iterations = 50
alpha = 1  # Pheromone importance
beta = 3  # Distance importance increased for more sensitivity to similarity
evaporation_rate = 0.2
pheromone_deposit = 50
top_k = 3  # Number of relevant documents to retrieve

# Initialize pheromone levels
pheromone_matrix = np.ones(num_documents) / num_documents

# Function to choose the next document based on probabilities
def choose_next_document(pheromones, distances, alpha, beta):
    pheromone_factor = pheromones ** alpha
    distance_factor = (1 / distances) ** beta
    probabilities = pheromone_factor * distance_factor
    probabilities /= probabilities.sum()

    return np.random.choice(range(len(legal_documents)), p=probabilities)

# Function to update pheromone levels
def update_pheromones(pheromones, documents_visited, evaporation_rate, pheromone_deposit):
    pheromones *= (1 - evaporation_rate)  # Evaporate pheromones
    for document in documents_visited:
        pheromones[document] += pheromone_deposit  # Add pheromone to visited documents

# ACO Algorithm for document retrieval
def ant_colony_optimization(num_iterations, num_ants, distance_matrix, pheromone_matrix, alpha, beta, evaporation_rate, pheromone_deposit, top_k):
    best_documents = set()
    best_similarities = []

    for iteration in range(num_iterations):
        ant_paths = []
        ant_similarities = []

        for ant in range(num_ants):
            current_path = []
            current_similarity = 0

            # Ant selects multiple documents (paths)
            for _ in range(top_k):  # Select top_k documents per ant
                current_document = choose_next_document(pheromone_matrix, distance_matrix, alpha, beta)
                current_similarity += similarity_scores[current_document]
                current_path.append(current_document)

            # Save the chosen documents and their total similarity
            ant_paths.append(current_path)
            ant_similarities.append(current_similarity)

        # Update pheromones for all documents visited by all ants
        for path in ant_paths:
            update_pheromones(pheromone_matrix, path, evaporation_rate, pheromone_deposit)

        # Keep track of the best paths (relevant documents)
        for i in range(len(ant_paths)):
            if len(best_similarities) < top_k or ant_similarities[i] > min(best_similarities):
                best_documents.update(ant_paths[i])
                best_similarities.append(ant_similarities[i])

        print(f"Iteration {iteration + 1}: Current best documents: {list(best_documents)}")

    # Sort and return the top K most relevant documents
    sorted_best_documents = sorted(best_documents, key=lambda x: similarity_scores[x], reverse=True)

    return sorted_best_documents[:top_k]

# Find indices of concept-related documents (e.g., related to "tort law")
important_keywords = ["tort", "negligence", "defamation", "trespass"]
concept_indices = [i for i, doc in enumerate(legal_documents) if any(keyword in doc.lower() for keyword in important_keywords)]

# Run the ACO algorithm to find the top K relevant documents including the concept-related ones
best_documents_indices = ant_colony_optimization(num_iterations, num_ants, distance_matrix, pheromone_matrix, alpha, beta, evaporation_rate, pheromone_deposit, top_k)

# Output the most relevant legal documents for the user's issue
print(f"\nTop {top_k} most relevant legal documents for your issue:")
for idx in best_documents_indices:
    print(f"Document {idx}: {legal_documents[idx]}")


Iteration 1: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 2: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 3: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 4: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 5: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 6: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 7: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 8: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 9: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 10: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 11: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 12: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 13: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 14: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 15: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 16: Current best documents: [0, 1, 4, 5, 6, 7, 8]
Iteration 17: Current best documents: [0, 1, 4, 5

# Particle Swarm Optimization (PSO) for Legal Document Retrieval

In [59]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Step 1: Legal document corpus
legal_documents = [
    "Contract law is a legally binding agreement between parties outlining obligations and rights. Breach can lead to damages.",
    "Tort law deals with civil wrongs causing harm or loss. Negligence, defamation, and trespass are common torts.",
    "Criminal law governs harmful conduct to society, such as felonies, misdemeanors, and infractions. Penalties include imprisonment and fines.",
    "Constitutional law sets up government frameworks, individual rights, and separation of powers.",
    "Environmental law regulates human impact on the environment, including pollution and resource conservation.",
    "Employment law covers employer-employee relations, including wages, discrimination, and wrongful termination.",
    "Family law governs marriage, divorce, custody, and adoption, including property division and child support.",
    "Tax law regulates income, corporate, and property taxes, including compliance with tax authorities.",
    "Real estate law governs property ownership, zoning, mortgages, and leases. Legal compliance is necessary during transfers.",
]

# Step 2: User's issue as input query
user_issue = "Somebody breached our contract and caused financial loss. What legal actions can we take?"

# Step 3: Fine-tuned TF-IDF Vectorizer for document-query similarity
# Creating a combined vectorizer for documents and query
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, stop_words='english')
# Fit and transform on both documents and query
combined_corpus = legal_documents + [user_issue]
combined_tfidf_matrix = vectorizer.fit_transform(combined_corpus)

# Separate TF-IDF matrices for documents and query
document_tfidf_matrix = combined_tfidf_matrix[:-1]  # All but the last element (documents)
query_tfidf_matrix = combined_tfidf_matrix[-1]  # The last element (query)

# Calculate cosine similarity between each document and the query
similarity_scores = cosine_similarity(document_tfidf_matrix, query_tfidf_matrix).flatten()

# Step 4: PSO Parameters
num_particles = 10  # Number of particles in the swarm
num_iterations = 50  # Number of iterations to run the optimization
top_k = 3  # Number of relevant documents to retrieve

# Initialize particles
# Each particle is a list of indices representing selected documents
particles = [random.sample(range(len(legal_documents)), top_k) for _ in range(num_particles)]

# Initialize velocities
# Velocity is represented as the indices to swap in the particle's document list
velocities = [random.sample(range(len(legal_documents)), top_k) for _ in range(num_particles)]

# Initialize best known positions and their scores
particle_best_positions = particles.copy()
particle_best_scores = [sum(similarity_scores[particle]) for particle in particles]

# Initialize global best
global_best_position = particle_best_positions[np.argmax(particle_best_scores)]
global_best_score = max(particle_best_scores)

# PSO Parameters
w = 0.5  # Inertia weight
c1 = 1.5  # Cognitive component
c2 = 2.0  # Social component

# Function to update velocity
def update_velocity(velocity, particle, best_position, global_best_position):
    new_velocity = velocity.copy()
    for i in range(len(velocity)):
        if random.random() < 0.5:
            new_velocity[i] = best_position[i]
        else:
            new_velocity[i] = global_best_position[i]
    return new_velocity

# Function to update particle position based on velocity
def update_position(particle, velocity):
    new_particle = particle.copy()
    for i in range(len(velocity)):
        if velocity[i] not in new_particle:
            new_particle[i] = velocity[i]
    return new_particle

# PSO Algorithm
for iteration in range(num_iterations):
    for i in range(num_particles):
        # Update velocity
        velocities[i] = update_velocity(velocities[i], particles[i], particle_best_positions[i], global_best_position)

        # Update position
        particles[i] = update_position(particles[i], velocities[i])

        # Calculate new fitness score
        current_score = sum(similarity_scores[particles[i]])

        # Update personal best
        if current_score > particle_best_scores[i]:
            particle_best_positions[i] = particles[i]
            particle_best_scores[i] = current_score

        # Update global best
        if current_score > global_best_score:
            global_best_position = particles[i]
            global_best_score = current_score

    print(f"Iteration {iteration + 1}: Global Best Score: {global_best_score} | Global Best Documents: {global_best_position}")

# Output the most relevant legal documents for the user's issue
print(f"\nTop {top_k} most relevant legal documents for your issue:")
for idx in global_best_position:
    print(f"Document {idx}: {legal_documents[idx]}")


Iteration 1: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 2: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 3: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 4: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 5: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 6: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 7: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 8: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 9: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 10: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 11: Global Best Score: 0.1217116226954705 | Global Best Documents: [0, 1, 8]
Iteration 12: Global Best Score: 0.1217116226954705 

# Firefly Algorithm for Legal Document Retrieval

In [71]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Step 1: Legal document corpus
legal_documents = [
    "Contract law is a legally binding agreement between parties outlining obligations and rights. Breach can lead to damages.",
    "Tort law deals with civil wrongs causing harm or loss. Negligence, defamation, and trespass are common torts.",
    "Criminal law governs harmful conduct to society, such as felonies, misdemeanors, and infractions. Penalties include imprisonment and fines.",
    "Constitutional law sets up government frameworks, individual rights, and separation of powers.",
    "Environmental law regulates human impact on the environment, including pollution and resource conservation.",
    "Employment law covers employer-employee relations, including wages, discrimination, and wrongful termination.",
    "Family law governs marriage, divorce, custody, and adoption, including property division and child support.",
    "Tax law regulates income, corporate, and property taxes, including compliance with tax authorities.",
    "Real estate law governs property ownership, zoning, mortgages, and leases. Legal compliance is necessary during transfers.",
]

# Step 2: User's issue as input query
user_issue = "Somebody breached our contract and caused financial loss. What legal actions can we take?"

# Step 3: Fine-tuned TF-IDF Vectorizer for document-query similarity
# Creating a combined vectorizer for documents and query
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, stop_words='english')
# Fit and transform on both documents and query
combined_corpus = legal_documents + [user_issue]
combined_tfidf_matrix = vectorizer.fit_transform(combined_corpus)

# Separate TF-IDF matrices for documents and query
document_tfidf_matrix = combined_tfidf_matrix[:-1]  # All but the last element (documents)
query_tfidf_matrix = combined_tfidf_matrix[-1]  # The last element (query)

# Calculate cosine similarity between each document and the query
similarity_scores = cosine_similarity(document_tfidf_matrix, query_tfidf_matrix).flatten()

# Step 4: Firefly Algorithm Parameters
num_fireflies = 10  # Number of fireflies in the swarm
num_iterations = 50  # Number of iterations to run the optimization
top_k = 3  # Number of relevant documents to retrieve

# Initialize fireflies with unique document indices
fireflies = []
while len(fireflies) < num_fireflies:
    firefly = random.sample(range(len(legal_documents)), top_k)
    if firefly not in fireflies:
        fireflies.append(firefly)

# Initialize light intensities based on the similarity scores
light_intensities = [sum(similarity_scores[firefly]) for firefly in fireflies]

# Light absorption coefficient
gamma = 1.0

# Randomization parameter
alpha = 0.2

# Function to calculate the attractiveness
def attractiveness(light_intensity, distance):
    return light_intensity * np.exp(-gamma * (distance ** 2))

# Function to update firefly positions with unique indices
def update_position(firefly, brighter_firefly, alpha):
    new_firefly = list(set(brighter_firefly))  # Start with the brighter firefly's unique indices
    while len(new_firefly) < len(firefly):  # Add random unique indices to maintain length
        candidate = random.choice(range(len(legal_documents)))
        if candidate not in new_firefly:
            new_firefly.append(candidate)
    return new_firefly

# Firefly Algorithm
for iteration in range(num_iterations):
    for i in range(num_fireflies):
        for j in range(num_fireflies):
            if light_intensities[j] > light_intensities[i]:
                # Calculate distance between fireflies i and j
                distance = np.linalg.norm(np.array(fireflies[i]) - np.array(fireflies[j]))

                # Calculate new position of firefly i with unique document indices
                fireflies[i] = update_position(fireflies[i], fireflies[j], alpha)

                # Calculate new fitness score
                new_score = sum(similarity_scores[fireflies[i]])

                # Update light intensity
                if new_score > light_intensities[i]:
                    light_intensities[i] = new_score

    # Print the best score and document set in this iteration
    best_index = np.argmax(light_intensities)
    print(f"Iteration {iteration + 1}: Best Score: {light_intensities[best_index]} | Best Documents: {fireflies[best_index]}")

# Output the most relevant legal documents for the user's issue
best_firefly_index = np.argmax(light_intensities)
best_documents = fireflies[best_firefly_index]
print(f"\nTop {top_k} most relevant legal documents for your issue:")
for idx in best_documents:
    print(f"Document {idx}: {legal_documents[idx]}")


Iteration 1: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 2: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 3: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 4: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 5: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 6: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 7: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 8: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 9: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 10: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 11: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 12: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 13: Best Score: 0.08087639058419946 | Best Documents: [0, 1, 7]
Iteration 14: Best Score: 0.08087639058419946 |