In [1]:
!pip install --upgrade gensim

from gensim.downloader import load
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load pre-trained word2vec model (may take some time to download)
word_model = load("word2vec-google-news-300")



In [36]:
def law_cleaning(law):
    # Remove punctuations
    law = law.translate(str.maketrans('', '', string.punctuation + '\r\n\t'))

    # Remove special characters
    law = law.replace('ã', '')
    law = law.replace('Ã', '')

    # Remain only the alphabetic, numeric characters and whitespaces
    law = ''.join([i for i in law if i.isalnum() or i.isspace()])
    return law


def tokenize(text):
    tokens = nltk.word_tokenize(text.lower())
    return tokens


def remove_stopwords(tokens):
    stop_words = set(stopwords.words("english"))
    filtered = [token for token in tokens if token.lower() not in stop_words]
    return filtered


def stemmer(tokens):
    # Create the Porter stemmer object
    pstemmer = PorterStemmer()
    stemmed_words = [pstemmer.stem(token) for token in tokens]
    # Join the list of stemmed words into a single string separated by spaces
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

In [37]:
# Load your CSV files containing Law Text and User Queries
df_law = pd.read_csv("dataset.csv")
df_query = pd.read_csv("SQ.csv") # SampleQueries.csv

In [38]:
# Preprocess law data for efficiency
vectorized = TfidfVectorizer(stop_words=stopwords.words("english"))
doc_vectors_tfidf = vectorized.fit_transform(df_law["Law"])

law_vectors_word2vec = []
for law in df_law["Law"]:
    law_tokens = tokenize(law_cleaning(law))
    law_tokens = remove_stopwords(law_tokens)
    law_vector = np.mean([word_model[word] for word in law_tokens if word in word_model], axis=0)
    law_vectors_word2vec.append(law_vector)


# Loop through each user query in the dataframe
for index, row in df_query.iterrows():
  user_query = row["Queries"]

  # Preprocess user input
  cleaned_text = law_cleaning(user_query)
  tokens = tokenize(cleaned_text)
  tokens = remove_stopwords(tokens)
  processed_text = stemmer(tokens)

  # Calculate similarity using both TF-IDF and Word2Vec
  input_vector_tfidf = vectorized.transform([processed_text])
  similarities_tfidf = cosine_similarity(input_vector_tfidf, doc_vectors_tfidf)[0]

  input_vector_word2vec = np.mean([word_model[word] for word in tokens if word in word_model], axis=0)
  similarities_word2vec = cosine_similarity([input_vector_word2vec], law_vectors_word2vec)[0]

  # Combine similarities (simple average)
  combined_similarities = (similarities_tfidf + similarities_word2vec) / 2
  highest_similarity = max(combined_similarities)

  # Update 'Highest Similarity Value' column for the current query
  df_query.at[index, 'Highest Similarity Value'] = highest_similarity

In [39]:
# Save the updated DataFrame to a new CSV file
df_query.to_csv("UpdatedSampleQueries.csv", index=False)

In [40]:
# Filter in-scope queries (Ground Truth Label = 1)
in_scope_data = df_query[df_query['Ground Truth label'] == 1]

# Calculate average highest similarity value for in-scope queries
avg_similarity = in_scope_data['Highest Similarity Value'].mean()

print(f"Average Highest Similarity Value (In-Scope Queries): {avg_similarity}")

Average Highest Similarity Value (In-Scope Queries): 0.5960439917894649


In [41]:
# Filter in-scope queries (Ground Truth Label = 1)
in_scope_data = df_query[df_query['Ground Truth label'] == 0]

# Calculate average highest similarity value for in-scope queries
avg_similarity = in_scope_data['Highest Similarity Value'].mean()

print(f"Average Highest Similarity Value (Out-Scope Queries): {avg_similarity}")

Average Highest Similarity Value (Out-Scope Queries): 0.4260699359696492


In [42]:
# Calculate average highest similarity value for all queries
avg_similarity = df_query['Highest Similarity Value'].mean()

print(f"Average Highest Similarity Value (All Queries): {avg_similarity}")

Average Highest Similarity Value (All Queries): 0.5110569638795571


In [43]:
df_query.sample(10)

Unnamed: 0,Queries,Ground Truth label,Highest Similarity Value
57,Explain the entitlement for protection where a...,1,0.672091
321,Does Parliament possess the authority to opera...,1,0.49318
592,How are international telecommunications regul...,0,0.353057
101,Which specific matters related to interim awar...,1,0.768304
43,Explain the rights and entitlements of witness...,1,0.578429
791,How do international laws address the rights o...,0,0.383689
306,Upon delegation of any of its powers to a Comm...,1,0.718322
279,How is the Constitutional Council in Sri Lanka...,1,0.599794
55,Explain the provisions of the specified Act re...,1,0.500098
566,What happens in a Primary Court if the Judge i...,0,0.39074


In [44]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [45]:
# dataframe is named 'df_query' with columns:
#  - Queries (text)
#  - Ground Truth Label (0 - Out-of-Scope, 1 - In-Scope)
#  - Highest Similarity Value (float)
# Filter data for labeled queries
labeled_data = df_query[df_query['Ground Truth label'].notna()]

# Separate labels and similarity values
y_true = labeled_data['Ground Truth label']
# the ground truth label for each query (0 for out-of-scope, 1 for in-scope)
y_pred = labeled_data['Highest Similarity Value']
# the highest similarity scores obtained for each query after comparing them with the laws

In [46]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.47

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.47
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       321                        85
Out-of-Scope (Actual)                    21                       385
              precision    recall  f1-score   support

           0       0.94      0.79      0.86       406
           1       0.82      0.95      0.88       406

    accuracy                           0.87       812
   macro avg       0.88      0.87      0.87       812
weighted avg       0.88      0.87      0.87       812



In [47]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.48

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.48
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       341                        65
Out-of-Scope (Actual)                    30                       376
              precision    recall  f1-score   support

           0       0.92      0.84      0.88       406
           1       0.85      0.93      0.89       406

    accuracy                           0.88       812
   macro avg       0.89      0.88      0.88       812
weighted avg       0.89      0.88      0.88       812



In [48]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.49

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.49
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       351                        55
Out-of-Scope (Actual)                    39                       367
              precision    recall  f1-score   support

           0       0.90      0.86      0.88       406
           1       0.87      0.90      0.89       406

    accuracy                           0.88       812
   macro avg       0.88      0.88      0.88       812
weighted avg       0.88      0.88      0.88       812



In [51]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.498

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.498
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       358                        48
Out-of-Scope (Actual)                    46                       360
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       406
           1       0.88      0.89      0.88       406

    accuracy                           0.88       812
   macro avg       0.88      0.88      0.88       812
weighted avg       0.88      0.88      0.88       812



In [53]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.4988

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.4988
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       360                        46
Out-of-Scope (Actual)                    46                       360
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       406
           1       0.89      0.89      0.89       406

    accuracy                           0.89       812
   macro avg       0.89      0.89      0.89       812
weighted avg       0.89      0.89      0.89       812



In [50]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.499

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.499
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       362                        44
Out-of-Scope (Actual)                    48                       358
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       406
           1       0.89      0.88      0.89       406

    accuracy                           0.89       812
   macro avg       0.89      0.89      0.89       812
weighted avg       0.89      0.89      0.89       812



In [49]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.5

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print("Threshold: ",threshold)

print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

Threshold:  0.5
                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       362                        44
Out-of-Scope (Actual)                    49                       357
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       406
           1       0.89      0.88      0.88       406

    accuracy                           0.89       812
   macro avg       0.89      0.89      0.89       812
weighted avg       0.89      0.89      0.89       812

