In [1]:
!pip install --upgrade gensim

from gensim.downloader import load
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load pre-trained word2vec model (may take some time to download)
word_model = load("word2vec-google-news-300")



In [35]:
def law_cleaning(law):
    # Remove punctuations
    law = law.translate(str.maketrans('', '', string.punctuation + '\r\n\t'))

    # Remove special characters
    law = law.replace('ã', '')
    law = law.replace('Ã', '')

    # Remain only the alphabetic, numeric characters and whitespaces
    law = ''.join([i for i in law if i.isalnum() or i.isspace()])
    return law


def tokenize(text):
    tokens = nltk.word_tokenize(text.lower())
    return tokens


def remove_stopwords(tokens):
    stop_words = set(stopwords.words("english"))
    filtered = [token for token in tokens if token.lower() not in stop_words]
    return filtered


def stemmer(tokens):
    # Create the Porter stemmer object
    pstemmer = PorterStemmer()
    stemmed_words = [pstemmer.stem(token) for token in tokens]
    # Join the list of stemmed words into a single string separated by spaces
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

In [36]:
# Load your CSV files containing Law Text and User Queries
df_law = pd.read_csv("dataset.csv")
df_query = pd.read_csv("SampleQueries.csv")

In [37]:
# Preprocess law data for efficiency
vectorized = TfidfVectorizer(stop_words=stopwords.words("english"))
doc_vectors_tfidf = vectorized.fit_transform(df_law["Law"])

law_vectors_word2vec = []
for law in df_law["Law"]:
    law_tokens = tokenize(law_cleaning(law))
    law_tokens = remove_stopwords(law_tokens)
    law_vector = np.mean([word_model[word] for word in law_tokens if word in word_model], axis=0)
    law_vectors_word2vec.append(law_vector)


# Loop through each user query in the dataframe
for index, row in df_query.iterrows():
  user_query = row["Queries"]

  # Preprocess user input
  cleaned_text = law_cleaning(user_query)
  tokens = tokenize(cleaned_text)
  tokens = remove_stopwords(tokens)
  processed_text = stemmer(tokens)

  # Calculate similarity using both TF-IDF and Word2Vec
  input_vector_tfidf = vectorized.transform([processed_text])
  similarities_tfidf = cosine_similarity(input_vector_tfidf, doc_vectors_tfidf)[0]

  input_vector_word2vec = np.mean([word_model[word] for word in tokens if word in word_model], axis=0)
  similarities_word2vec = cosine_similarity([input_vector_word2vec], law_vectors_word2vec)[0]

  # Combine similarities (simple average)
  combined_similarities = (similarities_tfidf + similarities_word2vec) / 2
  highest_similarity = max(combined_similarities)

  # Update 'Highest Similarity Value' column for the current query
  df_query.at[index, 'Highest Similarity Value'] = highest_similarity

In [38]:
# Save the updated DataFrame to a new CSV file
df_query.to_csv("UpdatedSampleQueries.csv", index=False)

In [40]:
# Filter in-scope queries (Ground Truth Label = 1)
in_scope_data = df_query[df_query['Ground Truth label'] == 1]

# Calculate average highest similarity value for in-scope queries
avg_similarity = in_scope_data['Highest Similarity Value'].mean()

print(f"Average Highest Similarity Value (In-Scope Queries): {avg_similarity}")

Average Highest Similarity Value (In-Scope Queries): 0.5960439917894649


In [43]:
# Filter in-scope queries (Ground Truth Label = 1)
in_scope_data = df_query[df_query['Ground Truth label'] == 0]

# Calculate average highest similarity value for in-scope queries
avg_similarity = in_scope_data['Highest Similarity Value'].mean()

print(f"Average Highest Similarity Value (Out-Scope Queries): {avg_similarity}")

Average Highest Similarity Value (Out-Scope Queries): 0.4383325072548749


In [41]:
# Calculate average highest similarity value for all queries
avg_similarity = df_query['Highest Similarity Value'].mean()

print(f"Average Highest Similarity Value (All Queries): {avg_similarity}")

Average Highest Similarity Value (All Queries): 0.5171882495221699


In [42]:
df_query.sample(10)

Unnamed: 0,Queries,Ground Truth label,Highest Similarity Value,Unnamed: 3
468,How does the court proceed after determining t...,0,0.422933,
63,Describe the responsibilities concerning the p...,1,0.538181,
241,What constitutes the territory of the Republic...,1,0.546255,
73,What are the statutory disqualifications outli...,1,0.46699,
327,Is Parliament endowed with the authority to en...,1,0.583574,
190,In cases of inconsistency or conflict with oth...,1,0.658508,
592,"According to Regulation 41, where are the rate...",0,0.329553,
165,Explain the procedures outlined in the recent ...,1,0.555503,
261,How does the law regulate the use of Sinhala a...,1,0.581492,
676,How are trials conducted in Primary Courts,0,0.550708,


In [19]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [20]:
# dataframe is named 'df_query' with columns:
#  - Queries (text)
#  - Ground Truth Label (0 - Out-of-Scope, 1 - In-Scope)
#  - Highest Similarity Value (float)
# Filter data for labeled queries
labeled_data = df_query[df_query['Ground Truth label'].notna()]

# Separate labels and similarity values
y_true = labeled_data['Ground Truth label']
# the ground truth label for each query (0 for out-of-scope, 1 for in-scope)
y_pred = labeled_data['Highest Similarity Value']
# the highest similarity scores obtained for each query after comparing them with the laws

In [44]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.47

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       289                       117
Out-of-Scope (Actual)                    21                       385
              precision    recall  f1-score   support

           0       0.93      0.71      0.81       406
           1       0.77      0.95      0.85       406

    accuracy                           0.83       812
   macro avg       0.85      0.83      0.83       812
weighted avg       0.85      0.83      0.83       812



In [45]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.48

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       309                        97
Out-of-Scope (Actual)                    30                       376
              precision    recall  f1-score   support

           0       0.91      0.76      0.83       406
           1       0.79      0.93      0.86       406

    accuracy                           0.84       812
   macro avg       0.85      0.84      0.84       812
weighted avg       0.85      0.84      0.84       812



In [46]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.49

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       324                        82
Out-of-Scope (Actual)                    39                       367
              precision    recall  f1-score   support

           0       0.89      0.80      0.84       406
           1       0.82      0.90      0.86       406

    accuracy                           0.85       812
   macro avg       0.85      0.85      0.85       812
weighted avg       0.85      0.85      0.85       812



In [47]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.5

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       338                        68
Out-of-Scope (Actual)                    49                       357
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       406
           1       0.84      0.88      0.86       406

    accuracy                           0.86       812
   macro avg       0.86      0.86      0.86       812
weighted avg       0.86      0.86      0.86       812



In [48]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.509

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       346                        60
Out-of-Scope (Actual)                    55                       351
              precision    recall  f1-score   support

           0       0.86      0.85      0.86       406
           1       0.85      0.86      0.86       406

    accuracy                           0.86       812
   macro avg       0.86      0.86      0.86       812
weighted avg       0.86      0.86      0.86       812



In [49]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.51

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       350                        56
Out-of-Scope (Actual)                    56                       350
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       406
           1       0.86      0.86      0.86       406

    accuracy                           0.86       812
   macro avg       0.86      0.86      0.86       812
weighted avg       0.86      0.86      0.86       812



In [50]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.511

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       350                        56
Out-of-Scope (Actual)                    59                       347
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       406
           1       0.86      0.85      0.86       406

    accuracy                           0.86       812
   macro avg       0.86      0.86      0.86       812
weighted avg       0.86      0.86      0.86       812



In [51]:
# Choose a threshold based on your risk tolerance and application needs.
threshold = 0.52

# Classify queries based on the threshold
labeled_data['Predicted Label'] = labeled_data['Highest Similarity Value'].apply(lambda x: 1 if x >= threshold else 0)

# Evaluate performance metrics
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix_df = pd.DataFrame(confusion_matrix(y_true, labeled_data['Predicted Label']),
                                 index=['In-Scope (Actual)', 'Out-of-Scope (Actual)'],
                                 columns=['In-Scope (Predicted)', 'Out-of-Scope (Predicted)'])
print(confusion_matrix_df)

print(classification_report(y_true, labeled_data['Predicted Label']))

                       In-Scope (Predicted)  Out-of-Scope (Predicted)
In-Scope (Actual)                       358                        48
Out-of-Scope (Actual)                    69                       337
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       406
           1       0.88      0.83      0.85       406

    accuracy                           0.86       812
   macro avg       0.86      0.86      0.86       812
weighted avg       0.86      0.86      0.86       812

