In [1]:
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the corpus: every line of the text file is treated as one document
with open('../corpus.txt', 'r') as file:
    # Iterate the file line by line, trimming surrounding whitespace
    # (including the trailing newline) from each entry
    corpus = [line.strip() for line in file]



# Example user issue that the retrieval pipeline is run against
query = "Somebody breached our contract and caused financial loss. What legal actions can we take?"

In [2]:
# Step 1: the loaded corpus is the collection of legal documents to search
legal_documents = corpus
num_documents = len(legal_documents)

# Step 2: the query text is the user's issue
user_issue = query

# Step 3: TF-IDF over unigrams and bigrams, fitted on the documents AND the
# query together so both live in the same vocabulary / IDF weighting
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.85, ngram_range=(1, 2))
combined_corpus = [*legal_documents, user_issue]
combined_tfidf_matrix = vectorizer.fit_transform(combined_corpus)

# The query was appended last: the first num_documents rows are the documents,
# the remaining row is the query
document_tfidf_matrix = combined_tfidf_matrix[:num_documents]
query_tfidf_matrix = combined_tfidf_matrix[num_documents]

# One cosine similarity per document, flattened to a 1-D array
similarity_scores = cosine_similarity(document_tfidf_matrix, query_tfidf_matrix).flatten()

# Step 4: turn similarity into a distance for ACO (more similar => closer)
distance_matrix = 1.0 - similarity_scores

# ACO Parameters
num_ants = 5  # Ants (candidate selections) built per iteration
num_iterations = 20  # Number of pheromone update rounds
alpha = 1  # Pheromone importance
beta = 3  # Distance importance increased for more sensitivity to similarity
evaporation_rate = 0.2  # Fraction of pheromone removed on each update
pheromone_deposit = 50  # Pheromone added to every visited document
top_k = 3  # Number of relevant documents to retrieve
random_seed = 42  # Fixed seed so the stochastic search is reproducible

# The ants sample documents through NumPy's global RNG (np.random.choice);
# seeding it here makes a fresh "Restart & Run All" return the same documents.
np.random.seed(random_seed)

# Initialize pheromone levels: every document starts with the same share
pheromone_matrix = np.ones(num_documents) / num_documents

# Function to choose the next document based on probabilities
def choose_next_document(pheromones, distances, alpha, beta):
    """Randomly pick one document index, favouring high pheromone and low distance.

    The probability of index ``i`` is proportional to
    ``pheromones[i] ** alpha * (1 / distances[i]) ** beta``.

    Args:
        pheromones: 1-D NumPy array of pheromone levels, one per document.
        distances: 1-D array of distances, one per document (same length).
        alpha: Exponent applied to the pheromone term.
        beta: Exponent applied to the inverse-distance term.

    Returns:
        The sampled index, as returned by ``np.random.choice``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when the inputs are
            empty or the weights do not form a valid probability vector.
    """
    # A document identical to the query has distance 0; dividing by it would
    # produce inf and then NaN probabilities. Clamp to a tiny positive value
    # so such a document simply becomes (almost) certain to be chosen.
    safe_distances = np.maximum(distances, 1e-12)
    pheromone_factor = pheromones ** alpha
    distance_factor = (1 / safe_distances) ** beta
    probabilities = pheromone_factor * distance_factor
    probabilities /= probabilities.sum()
    # Size the choice by the arguments rather than a notebook-level global so
    # the function works for any pheromone/distance pair.
    return np.random.choice(len(pheromones), p=probabilities)

# Function to update pheromone levels
def update_pheromones(pheromones, documents_visited, evaporation_rate, pheromone_deposit):
    """Evaporate all pheromone levels in place, then reinforce the visited documents.

    Args:
        pheromones: NumPy array of pheromone levels; modified in place.
        documents_visited: Iterable of document indices; an index that occurs
            several times receives the deposit once per occurrence.
        evaporation_rate: Fraction of pheromone removed from every document.
        pheromone_deposit: Amount added per visit.
    """
    retained_fraction = 1 - evaporation_rate
    pheromones *= retained_fraction
    # ufunc.at applies the addition unbuffered, so repeated indices accumulate
    visited_indices = np.fromiter(documents_visited, dtype=np.intp)
    np.add.at(pheromones, visited_indices, pheromone_deposit)

# ACO Algorithm for document retrieval
def ant_colony_optimization(num_iterations, num_ants, distance_matrix, pheromone_matrix, alpha, beta, evaporation_rate, pheromone_deposit, top_k):
    """Retrieve up to ``top_k`` document indices with an ant-colony style search.

    In every iteration each ant samples ``top_k`` documents (with replacement)
    via ``choose_next_document``. After all ants have finished, the pheromones
    are evaporated once and every visited document is reinforced. Documents
    from sufficiently good ant selections are collected, and the collected
    documents with the highest similarity are returned.

    Args:
        num_iterations: Number of sampling / pheromone-update rounds.
        num_ants: Number of ants per round.
        distance_matrix: 1-D array of per-document distances, interpreted as
            ``1 - similarity`` to the query.
        pheromone_matrix: 1-D NumPy array of pheromone levels; updated in
            place through ``update_pheromones``.
        alpha: Pheromone exponent forwarded to ``choose_next_document``.
        beta: Distance exponent forwarded to ``choose_next_document``.
        evaporation_rate: Fraction of pheromone removed per iteration.
        pheromone_deposit: Pheromone added per visit of a document.
        top_k: Documents sampled per ant and maximum number returned.

    Returns:
        List of at most ``top_k`` distinct document indices, ordered by
        decreasing similarity.
    """
    # Derive similarities from the distances passed in, so the function no
    # longer depends on a notebook-level `similarity_scores` variable.
    similarities = 1 - np.asarray(distance_matrix, dtype=float)

    best_documents = set()
    best_similarities = []
    for _ in range(num_iterations):
        ant_paths = []
        ant_similarities = []
        for _ in range(num_ants):
            current_path = []
            current_similarity = 0
            for _ in range(top_k):  # Select top_k documents per ant
                current_document = choose_next_document(pheromone_matrix, distance_matrix, alpha, beta)
                current_similarity += similarities[current_document]
                current_path.append(current_document)
            # Save the chosen documents and their total similarity
            ant_paths.append(current_path)
            ant_similarities.append(current_similarity)

        # Evaporate ONCE per iteration, then deposit for every document visited
        # by any ant. Updating per ant would evaporate num_ants times per round
        # and make the outcome depend on the order of the ants.
        visited_documents = [document for path in ant_paths for document in path]
        update_pheromones(pheromone_matrix, visited_documents, evaporation_rate, pheromone_deposit)

        # Keep the documents of every selection that beats the weakest total
        # recorded so far (the first top_k selections are always kept).
        for path, path_similarity in zip(ant_paths, ant_similarities):
            if len(best_similarities) < top_k or path_similarity > min(best_similarities):
                best_documents.update(path)
                best_similarities.append(path_similarity)

    # Sort and return the top K most relevant documents
    sorted_best_documents = sorted(best_documents, key=lambda doc: similarities[doc], reverse=True)
    return sorted_best_documents[:top_k]

# Retrieve the indices of the top K documents with the ant colony search
best_documents_indices = ant_colony_optimization(
    num_iterations=num_iterations,
    num_ants=num_ants,
    distance_matrix=distance_matrix,
    pheromone_matrix=pheromone_matrix,
    alpha=alpha,
    beta=beta,
    evaporation_rate=evaporation_rate,
    pheromone_deposit=pheromone_deposit,
    top_k=top_k,
)

# Show every retrieved document next to its index in the corpus
print(f"\nTop {top_k} most relevant legal documents for your issue:")
for idx in best_documents_indices:
    print(f"Document {idx}: {legal_documents[idx]}")

# Performance Metrics Calculation

# Documents whose cosine similarity reaches this value count as relevant
relevance_threshold = 0.15929

# Order the documents from most to least similar to the query
ranked_indices = np.flip(np.argsort(similarity_scores))  # Indices in descending order of similarity
ranked_scores = similarity_scores[ranked_indices]        # Scores in the same descending order

# NOTE(review): the labels are thresholded from the very scores that define
# the ranking, so every relevant document precedes every non-relevant one.
# The metrics below can therefore only tell whether ANY score reaches the
# threshold; they do not evaluate the ACO result. Confirm this is intended.
relevance_labels = [int(score >= relevance_threshold) for score in ranked_scores]

# Calculate Mean Reciprocal Rank (MRR)
def mean_reciprocal_rank(relevance_labels):
    """Return the reciprocal rank of the first relevant entry of one ranking.

    Args:
        relevance_labels: Iterable of labels in rank order; an entry equal
            to 1 marks a relevant document.

    Returns:
        ``1 / rank`` (1-based) of the first label equal to 1, or 0 when no
        label equals 1.
    """
    first_relevant_rank = next(
        (position
         for position, is_relevant in enumerate(relevance_labels, start=1)
         if is_relevant == 1),
        None,
    )
    if first_relevant_rank is None:
        return 0  # No relevant document found
    return 1 / first_relevant_rank

# Score the similarity-ordered relevance labels: the value is the reciprocal
# rank of the first relevant document (0 when no document is relevant).
mrr = mean_reciprocal_rank(relevance_labels)
print(f"\nMean Reciprocal Rank (MRR): {mrr:.2f}")

# Calculate Mean Average Precision (MAP)
def mean_average_precision(relevance_labels):
    """Return the average precision of one ranking.

    Precision is evaluated at every position that holds a relevant entry
    (label equal to 1) and the values are averaged over the relevant entries.

    Args:
        relevance_labels: Iterable of labels in rank order; an entry equal
            to 1 marks a relevant document.

    Returns:
        The average of the precision values at the relevant positions, or 0
        when no label equals 1.
    """
    hits = 0
    precision_total = 0
    for position, is_relevant in enumerate(relevance_labels, start=1):
        if is_relevant != 1:
            continue
        hits += 1
        # Precision at this position: relevant entries seen so far / position
        precision_total += hits / position
    if hits == 0:
        return 0
    return precision_total / hits

# Average the precision values taken at each relevant position of the
# similarity-ordered labels (0 when no document is relevant).
map_score = mean_average_precision(relevance_labels)
print(f"Mean Average Precision (MAP): {map_score:.2f}")



Top 3 most relevant legal documents for your issue:
Document 51: Section 2602, Contract Law: Regulates breach of contract, including damages and remedies. Subsection 2602.1 defines compensatory damages, punitive damages, and nominal damages. Subsection 2602.2 outlines remedies for breach of contract in commercial transactions.
Document 50: Section 2601, Contract Law: Defines contracts related to sales and services. Subsection 2601.1 outlines general contract formation and validity. Subsection 2601.2 specifies terms for service contracts and the rights of the parties involved.
Document 27: Section 1402, Consumer Protection Law: Discusses debt collection practices, financial disclosures, and lending standards. Subsection 1402.1 mandates clear communication of loan terms. Subsection 1402.2 provides protections against predatory lending.

Mean Reciprocal Rank (MRR): 0.00
Mean Average Precision (MAP): 0.00
