In [3]:
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Open the file in read mode
with open('../corpus.txt', 'r') as file:
    # Read the lines from the file and store them in a list
    corpus = file.readlines()

# Optionally, you can remove newline characters from each line
corpus = [line.strip() for line in corpus]



query = "Somebody breached our contract and caused financial loss. What legal actions can we take?"

In [4]:
# Sample Legal document corpus for demonstration
legal_documents = corpus  # Assume this is defined elsewhere

# User's issue as input query
user_issue = query

# TF-IDF Vectorizer for document-query similarity
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, max_features=500, stop_words='english')
combined_corpus = legal_documents + [user_issue]
combined_tfidf_matrix = vectorizer.fit_transform(combined_corpus)

document_tfidf_matrix = combined_tfidf_matrix[:-1]
query_tfidf_matrix = combined_tfidf_matrix[-1]

similarity_scores = cosine_similarity(document_tfidf_matrix, query_tfidf_matrix).flatten()

# Artificial Bee Colony Parameters
num_employed_bees = 5
num_onlooker_bees = 5
num_iterations = 20
top_k = 3

# Initialize food sources based on top-ranked documents
initial_candidates = np.argsort(similarity_scores)[-top_k:]
food_sources = [list(np.random.choice(initial_candidates, size=top_k, replace=False)) for _ in range(num_employed_bees)]

# Function to calculate fitness (similarity score sum)
def fitness(source):
    return sum(similarity_scores[source])

# ABC Optimization Process
for iteration in range(num_iterations):
    # Employed bees phase: improve their own food sources
    for i in range(num_employed_bees):
        candidate = food_sources[i].copy()
        candidate[random.randint(0, top_k - 1)] = random.choice(range(len(legal_documents)))
        if fitness(candidate) > fitness(food_sources[i]):
            food_sources[i] = candidate

    # Calculate probabilities for onlookers based on fitness
    fitness_scores = np.array([fitness(source) for source in food_sources])
    probabilities = fitness_scores / fitness_scores.sum()

    # Onlooker bees phase: select and improve food sources based on probabilities
    for _ in range(num_onlooker_bees):
        selected_index = np.random.choice(range(num_employed_bees), p=probabilities)
        candidate = food_sources[selected_index].copy()
        candidate[random.randint(0, top_k - 1)] = random.choice(range(len(legal_documents)))
        if fitness(candidate) > fitness(food_sources[selected_index]):
            food_sources[selected_index] = candidate

    # Scout bees phase: replace poor food sources with new random solutions if necessary
    for i in range(num_employed_bees):
        if random.random() < 0.1:  # scout probability
            food_sources[i] = list(np.random.choice(range(len(legal_documents)), size=top_k, replace=False))

# Output the best-performing food source
best_source = max(food_sources, key=fitness)
best_documents = [legal_documents[i] for i in best_source]
for i in best_documents:
    print(i, end='\n\n')

# Performance Metrics Calculation

# Define a relevance threshold to consider a document relevant
relevance_threshold = 0.158  # Adjust based on similarity score distribution

# Rank documents by similarity score
ranked_indices = np.argsort(similarity_scores)[::-1]  # Indices sorted in descending order
ranked_scores = similarity_scores[ranked_indices]

# Determine relevance based on the adjusted threshold
relevance_labels = [1 if score >= relevance_threshold else 0 for score in ranked_scores]

# Calculate Top-k Precision (P@k)
def precision_at_k(relevance_labels, k=top_k):
    top_k_relevance = relevance_labels[:k]
    return sum(top_k_relevance) / k if k > 0 else 0

# Calculate Precision at k
precision_k = precision_at_k(relevance_labels, k=top_k)

# Calculate Mean Reciprocal Rank (MRR)
def mean_reciprocal_rank(relevance_labels):
    for rank, label in enumerate(relevance_labels, start=1):
        if label == 1:  # First relevant document
            return 1 / rank
    return 0

# Calculate Mean Average Precision (MAP)
def mean_average_precision(relevance_labels):
    relevant_docs = 0
    cumulative_precision = 0
    for rank, label in enumerate(relevance_labels, start=1):
        if label == 1:
            relevant_docs += 1
            cumulative_precision += relevant_docs / rank
    return cumulative_precision / relevant_docs if relevant_docs > 0 else 0

# Calculate MRR and MAP
mrr = mean_reciprocal_rank(relevance_labels)
map_score = mean_average_precision(relevance_labels)

print(f"\nMean Reciprocal Rank (MRR): {mrr:.2f}")
print(f"Mean Average Precision (MAP): {map_score:.2f}")


Section 601, Employment Law: Protects workers' rights, covering minimum wage, overtime, and workplace safety. Subsection 601.1 sets standards for fair labor practices. Subsection 601.2 outlines legal recourse for workplace harassment and discrimination.

Section 2602, Contract Law: Regulates breach of contract, including damages and remedies. Subsection 2602.1 defines compensatory damages, punitive damages, and nominal damages. Subsection 2602.2 outlines remedies for breach of contract in commercial transactions.

Section 2602, Contract Law: Regulates breach of contract, including damages and remedies. Subsection 2602.1 defines compensatory damages, punitive damages, and nominal damages. Subsection 2602.2 outlines remedies for breach of contract in commercial transactions.


Mean Reciprocal Rank (MRR): 1.00
Mean Average Precision (MAP): 1.00
