# Evaluation

In [60]:
import pandas as pd
import numpy as np  
gold_data = pd.read_csv('gold_train_data_entitywise.csv')
print(f'Shape of gold_data is {gold_data.shape}')
ws_data = pd.read_csv('llm_generated_output_in_entity_type_format.csv')
# Drop rows where 'entity_text' column is null
ws_data = ws_data.dropna(subset=['entity_text'])
print(f'Shape of ws_train_data is {ws_data.shape}')
# find the duplicate uuids count in ws_data and gold_data, means those uuids exists in both gold_data and ws_data and save it in duplicate_uuids_count
duplicate_uuids_count = len(set(ws_data['uuid']).intersection(set(gold_data['uuid'])))
print(f'Duplicate uuids count: {duplicate_uuids_count}')

Shape of gold_data is (866, 3)
Shape of ws_train_data is (82, 3)
Duplicate uuids count: 11


Accuracy calculation

# A. Analysis on labeled Data

### 1. No of unique samples to be labeled:   306
Because this is equal to input size. Also can be calculated/verified form the sentence level group by output that can be calculated using uuid
n unique becasue in our input we assigned unique id to each sentence/sample

In [61]:
total_data_to_label = gold_data.uuid.nunique()
print(f'unique uuids in gold_data is {total_data_to_label}')

unique uuids in gold_data is 306


In [None]:
# Open a file for writing
with open('EvaluationResult.txt', 'w') as file:
    # Write content to the file
    file.write(f'unique uuids in gold_data is {total_data_to_label}\n\n')

### 2. Total no of unique samples labeled as non-zero by ws (Any ws_entity_type assigned other than 0):

In [62]:
# find the no of unique sentences in df where ws_entity_type is not equal to 0. It means that sentence has atleast one entity
data_labeled_by_ws = ws_data['uuid'].nunique()
print(f'Total data labeled means any non-zero ws_entity_type assigned: {data_labeled_by_ws}')

Total data labeled means any non-zero ws_entity_type assigned: 11


In [None]:
# Open a file for appending
with open('EvaluationResult.txt', 'a') as file:
    # Append content to the file
    file.write(f'Total data labeled means any non-zero ws_entity_type assigned: {data_labeled_by_ws}\n\n')

### 3. Entity wise no of samples to be labeled. (e.g, where gt_entity_type = 1|2|3)

In [63]:
# total number of unique sentences that were expected to be labeled as Document Name entity
print(f'entity wise count in gold_data is \n{gold_data.gt_entity_type.value_counts()}')

entity wise count in gold_data is 
gt_entity_type
1    306
2    306
3    254
Name: count, dtype: int64


### 4. Entity wise total unique sentences labeled by WS. (e.g, where ws_entity_type = 1|2|3)

In [64]:
# show now the value counts of ws_entity_type in labeled_data
print(f'Labeled data ws_entity_type value counts: \n{ws_data.entity_type.value_counts()}')

Labeled data ws_entity_type value counts: 
entity_type
2    27
1    11
4    11
5    11
6    11
3    11
Name: count, dtype: int64


### 5. Entity wise total samples correctly labeled by weak supervision

In [65]:
# aggregate the data on the basis of uuid and ws_entity_type 
ws_train_data_agg = ws_data.groupby(['uuid','entity_type']).agg({
    'entity_text': lambda x: ' '.join(x), 
}).reset_index()
gold_data_agg = gold_data.groupby(['uuid','gt_entity_type']).agg({
    'gt_entity_text': lambda x: ' '.join(x), 
}).reset_index()

In [66]:
# print unique samples values counts in ws_train_data_agg 
print(f'ws_train_data_agg unique samples values counts: \n{ws_train_data_agg.entity_type.value_counts()}')
# also print for gold data
print(f'gold_data_agg unique samples values counts: \n{gold_data_agg.gt_entity_type.value_counts()}')

ws_train_data_agg unique samples values counts: 
entity_type
1    11
2    11
3    11
4    11
5    11
6    11
Name: count, dtype: int64
gold_data_agg unique samples values counts: 
gt_entity_type
1    306
2    306
3    254
Name: count, dtype: int64


In [67]:

# Filter the ws_data_agg for class 1
correct_doc_name_entity = ws_train_data_agg[
    (ws_train_data_agg['entity_type'] == 1) & (ws_train_data_agg['uuid'].isin(gold_data_agg['uuid']))]

# Merge correct_doc_name_entity with gold_data_agg on uuid and select desired columns
correct_doc_name_entity = correct_doc_name_entity.merge(gold_data_agg[['uuid','gt_entity_type', 'gt_entity_text']], on='uuid', how='left')

# Filter for class 1 again to ensure only relevant data is included
correct_doc_name_entity = correct_doc_name_entity[(correct_doc_name_entity['entity_type'] == 1)
                                                  & (correct_doc_name_entity['gt_entity_type'] == 1)]

# Select the desired columns
correct_doc_name_entity = correct_doc_name_entity[['uuid', 'entity_type', 'entity_text', 'gt_entity_type', 'gt_entity_text']]
print(f'No of correct document name entities extracted by ws: {correct_doc_name_entity.shape[0]}')


No of correct document name entities extracted by ws: 11


In [68]:
# Filter the ws_data_agg for class 2
correct_party_name_entity = ws_train_data_agg[
    (ws_train_data_agg['entity_type'] == 2) & (ws_train_data_agg['uuid'].isin(gold_data_agg['uuid']))]

# Merge correct_party_name_entity with gold_data_agg on uuid and select desired columns
correct_party_name_entity = correct_party_name_entity.merge(gold_data_agg[['uuid','gt_entity_type', 'gt_entity_text']], on='uuid', how='left')

# Filter for class 2 again to ensure only relevant data is included
correct_party_name_entity = correct_party_name_entity[(correct_party_name_entity['entity_type'] == 2)
                                                  & (correct_party_name_entity['gt_entity_type'] == 2)]

# Select the desired columns
correct_party_name_entity = correct_party_name_entity[['uuid', 'entity_type', 'entity_text', 'gt_entity_type', 'gt_entity_text']]
print(f'No of correct party name entities extracted by ws: {correct_party_name_entity.shape[0]}')

No of correct party name entities extracted by ws: 11


In [69]:
# Filter the ws_data_agg for class 3
correct_gov_law_entity = ws_train_data_agg[
    (ws_train_data_agg['entity_type'] == 3) & (ws_train_data_agg['uuid'].isin(gold_data_agg['uuid']))]

# Merge correct_gov_law_entity with gold_data_agg on uuid and select desired columns
correct_gov_law_entity = correct_gov_law_entity.merge(gold_data_agg[['uuid','gt_entity_type', 'gt_entity_text']], on='uuid', how='left')

# Filter for class 3 again to ensure only relevant data is included
correct_gov_law_entity = correct_gov_law_entity[(correct_gov_law_entity['entity_type'] == 3)
                                                  & (correct_gov_law_entity['gt_entity_type'] == 3)]

# Select the desired columns
correct_gov_law_entity = correct_gov_law_entity[['uuid', 'entity_type', 'entity_text', 'gt_entity_type', 'gt_entity_text']]
print(f'No of correct government law entities extracted by ws: {correct_gov_law_entity.shape[0]}')

No of correct government law entities extracted by ws: 11


### Unique text samples that are correctly labeled by ws

In [70]:
# Filter the ws_data_agg for class 1
unique_correct_doc_name_entity = ws_train_data_agg[(ws_train_data_agg['entity_type'] == 1) 
                                       & (ws_train_data_agg['uuid'].isin(gold_data_agg['uuid']))
                                       & (gold_data_agg['gt_entity_type'] == 1)]
print(f'No of correct labeled doc name entity: {correct_doc_name_entity.shape[0]}')
unique_correct_party_name_entity = ws_train_data_agg[(ws_train_data_agg['entity_type'] == 2) 
                                       & (ws_train_data_agg['uuid'].isin(gold_data_agg['uuid']))
                                       & (gold_data_agg['gt_entity_type'] == 2)]
print(f'No of correct labeled party name entity: {correct_party_name_entity.shape[0]}')
unique_correct_gov_law_entity = ws_train_data_agg[(ws_train_data_agg['entity_type'] == 3) 
                                       & (ws_train_data_agg['uuid'].isin(gold_data_agg['uuid']))
                                       & (gold_data_agg['gt_entity_type'] == 3)]
print(f'No of correct labeled gov law entity: {correct_gov_law_entity.shape[0]}')

No of correct labeled doc name entity: 11
No of correct labeled party name entity: 11
No of correct labeled gov law entity: 11


  unique_correct_doc_name_entity = ws_train_data_agg[(ws_train_data_agg['entity_type'] == 1)
  unique_correct_party_name_entity = ws_train_data_agg[(ws_train_data_agg['entity_type'] == 2)
  unique_correct_gov_law_entity = ws_train_data_agg[(ws_train_data_agg['entity_type'] == 3)


In [71]:
# Filter the ws_data_agg for class 1
correct_doc_name_entity.head()

Unnamed: 0,uuid,entity_type,entity_text,gt_entity_type,gt_entity_text
0,0facc77f-1d2a-4645-92d8-5abb5f88b6ce,1,CO-BRANDING AGREEMENT,1,CO-BRANDING AGREEMENT
3,17af879e-9455-4dbc-b2bc-b2b1b849e805,1,VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT,1,VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT
6,1bdc2e9d-4ce6-4614-afc2-a568a7a1d167,1,JOINT DEVELOPMENT AGREEMENT,1,JOINT DEVELOPMENT AGREEMENT
9,202e98a5-2b2c-48ed-ae9f-b9a27dedf809,1,e-business Hosting Agreement IBM,1,e-business Hosting Agreement
12,451238df-68cc-476e-9e0a-9182e44cdc60,1,ENDORSEMENT AGREEMENT,1,ENDORSEMENT AGREEMENT


## B. Evaluation on Correctly Labeled Data
How much text of entities correctly predicted? Similarity score calculation

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import spacy

# create a class for similarity score calculation using tfidf and using embedding
class similarity_evaluation:
  def __init__(self):
    self.tfidf_vectorizer = TfidfVectorizer()
    # Load the spaCy model with word vectors (e.g., 'en_core_web_sm')
    self.nlp = spacy.load("en_core_web_lg")

  def calculate_TfIdf(self, string1,string2):
    # Fit and transform the vectorizer on the two strings
    tfidf_matrix = self.tfidf_vectorizer.fit_transform([string1, string2])
    # Calculate the cosine similarity between the two vectors
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    # The similarity score is in cosine_sim[0][1]
    similarity = cosine_sim[0][1]
    return similarity

  def similarity_evaluation_using_tfidf(self, df):
    # receive the dataframe and find similarity between ws_entity_text and gt_entity_text and create a new column of similarity score and append it to the dataframe
    # and store the similarity score of each sample in that column and return the updated dataframe
    tfidf_similarity_score = []
    for index, row in df.iterrows():
      tfidf_similarity_score.append(self.calculate_TfIdf(row['entity_text'], row['gt_entity_text']))
    df['tfidf_similarity_score'] = tfidf_similarity_score
    return df
  
  def similarity_evaluation_using_spacy_embedding(self, df):
    # receive the dataframe and find similarity between ws_entity_text and gt_entity_text and create a new column of similarity score and append it to the dataframe
    # and store the similarity score of each sample in that column and return the updated dataframe
    spacy_similarity_score = []
    for index, row in df.iterrows():
      spacy_similarity_score.append(self.nlp(row['entity_text']).similarity(self.nlp(row['gt_entity_text'])))
    df['embedding_similarity_score'] = spacy_similarity_score
    return df

### Entities Text Evaluation using Tf-Idf Cosine Similarity

In [73]:
evaluate_similarity = similarity_evaluation()
# find similarity of document name entity text extracted and actual
document_name_df_tfidf = evaluate_similarity.similarity_evaluation_using_tfidf(correct_doc_name_entity)
# now find the mean of similarity score column 
print("Similarity percentage of document_name entity text extracted and actual:", document_name_df_tfidf['tfidf_similarity_score'].mean())
# save this to a csv file document_name_df
# document_name_df.to_csv('../output/evaluation results/document_name_results.csv', index=False)
party_name_df_tfidf = evaluate_similarity.similarity_evaluation_using_tfidf(correct_party_name_entity)
print("Similarity percentage of parties entity text extracted and actual:", party_name_df_tfidf['tfidf_similarity_score'].mean())
# save this to a csv file party_name_df
# party_name_df.to_csv('../output/evaluation results/party_name_results.csv', index=False)
governing_law_df_tfidf = evaluate_similarity.similarity_evaluation_using_tfidf(correct_gov_law_entity)
print("Similarity percentage of governing law entity text extracted and actual:", governing_law_df_tfidf['tfidf_similarity_score'].mean())
# save this to a csv file governing_law_df
# governing_law_df.to_csv('../output/evaluation results/governing_law_results.csv', index=False)

Similarity percentage of document_name entity text extracted and actual: 0.9796831391340469
Similarity percentage of parties entity text extracted and actual: 0.7647441771407664
Similarity percentage of governing law entity text extracted and actual: 0.9617944246852423


### Entities text evaluation using Spacy Embedding Similarity

In [74]:
# find similarity of document name entity text extracted and actual using spacy embeddings
document_name_df = evaluate_similarity.similarity_evaluation_using_spacy_embedding(correct_doc_name_entity)
# now find the mean of similarity score column
print("Similarity percentage of document_name entity text extracted and actual:", document_name_df['embedding_similarity_score'].mean())
# save this to a csv file document_name_df
# document_name_df.to_csv('../output/evaluation results/document_name_results.csv', index=False)
party_name_df = evaluate_similarity.similarity_evaluation_using_spacy_embedding(correct_party_name_entity)
print("Similarity percentage of parties entity text extracted and actual:", party_name_df['embedding_similarity_score'].mean())
# save this to a csv file party_name_df
# party_name_df.to_csv('../output/evaluation results/party_name_results.csv', index=False)
governing_law_df = evaluate_similarity.similarity_evaluation_using_spacy_embedding(correct_gov_law_entity)
print("Similarity percentage of governing law entity text extracted and actual:", governing_law_df['embedding_similarity_score'].mean())
# save this to a csv file governing_law_df
# governing_law_df.to_csv('../output/evaluation results/governing_law_results.csv', index=False)

Similarity percentage of document_name entity text extracted and actual: 0.9966746763159936
Similarity percentage of parties entity text extracted and actual: 0.61252659546404
Similarity percentage of governing law entity text extracted and actual: 0.9780230927824337
