In [40]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
 

In [41]:
test_questions = pd.read_csv('./questions-bert-similarity-with-match.csv')

count_ones = (test_questions['is_match'] == 1).sum()
total_count = len(test_questions['is_match'])
bert_percentage = (count_ones / total_count) * 100

print("Percentage of 1s: {:.2f}%".format(bert_percentage))

test_questions['predicted'] =  test_questions.apply(lambda x: x['is_duplicate'] if x['is_match'] == 1 else 1 - x['is_duplicate'], axis=1)

# calculate precision, recall, f1
precision, recall, f1, _ = precision_recall_fscore_support(test_questions['predicted'], test_questions['is_duplicate'], average='binary')

print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1: {:.2f}%".format(f1 * 100))
accuracy = accuracy_score(test_questions['predicted'], test_questions['is_duplicate'].round())

print("Accuracy: {:.2f}%".format(accuracy * 100))

Percentage of 1s: 77.21%
Precision: 76.70%
Recall: 66.63%
F1: 71.31%
Accuracy: 77.21%


In [42]:
partial_ratio_df = pd.read_csv('./questions-partial-ratio-with-match.csv')
count_ones = (partial_ratio_df['is_match'] == 1).sum()
total_count = len(partial_ratio_df['is_match'])
partial_ratio_percentage = (count_ones / total_count) * 100

print("Percentage of 1s: {:.2f}%".format(partial_ratio_percentage))

# if is match is 0 then predicted is the opposite the value of is_duplicate, if is match is 1 then predicted is the same as is_duplicate
partial_ratio_df['predicted'] =  partial_ratio_df.apply(lambda x: x['is_duplicate'] if x['is_match'] == 1 else 1 - x['is_duplicate'], axis=1)

# calculate precision, recall, f1

precision, recall, f1, _ = precision_recall_fscore_support(partial_ratio_df['predicted'], partial_ratio_df['is_duplicate'], average='binary')

print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1: {:.2f}%".format(f1 * 100))

accuracy = accuracy_score(partial_ratio_df['predicted'], partial_ratio_df['is_duplicate'].round())

print("Accuracy: {:.2f}%".format(accuracy * 100))

Percentage of 1s: 64.17%
Precision: 30.03%
Recall: 52.61%
F1: 38.23%
Accuracy: 64.17%


In [43]:
token_sort_df = pd.read_csv('./questions-token-sort-with-match.csv')

count_ones = (token_sort_df['is_match'] == 1).sum() 
total_count = len(token_sort_df['is_match'])
token_sort_percentage = (count_ones / total_count) * 100

print("Percentage of 1s: {:.2f}%".format(token_sort_percentage))

# if is match is 0 then predicted is the opposite the value of is_duplicate, if is match is 1 then predicted is the same as is_duplicate
token_sort_df['predicted'] =  token_sort_df.apply(lambda x: x['is_duplicate'] if x['is_match'] == 1 else 1 - x['is_duplicate'], axis=1)

# calculate precision, recall, f1

precision, recall, f1, _ = precision_recall_fscore_support(token_sort_df['predicted'], token_sort_df['is_duplicate'], average='binary')

print("Precision: {:.2f}%".format(precision * 100))

print("Recall: {:.2f}%".format(recall * 100))

print("F1: {:.2f}%".format(f1 * 100))

accuracy = accuracy_score(token_sort_df['predicted'], token_sort_df['is_duplicate'].round())

print("Accuracy: {:.2f}%".format(accuracy * 100))

Percentage of 1s: 65.42%
Precision: 29.10%
Recall: 56.12%
F1: 38.33%
Accuracy: 65.42%


In [44]:
spacy_df = pd.read_csv('./questions-spacy-with-match.csv')

count_ones = (spacy_df['is_match'] == 1).sum()
total_count = len(spacy_df['is_match'])
spacy_percentage = (count_ones / total_count) * 100

print("Percentage of 1s: {:.2f}%".format(spacy_percentage))

# if is match is 0 then predicted is the opposite the value of is_duplicate, if is match is 1 then predicted is the same as is_duplicate
spacy_df['predicted'] =  spacy_df.apply(lambda x: x['is_duplicate'] if x['is_match'] == 1 else 1 - x['is_duplicate'], axis=1)

# calculate precision, recall, f1

precision, recall, f1, _ = precision_recall_fscore_support(spacy_df['predicted'], spacy_df['is_duplicate'], average='binary')

print("Precision: {:.2f}%".format(precision * 100))

print("Recall: {:.2f}%".format(recall * 100))

print("F1: {:.2f}%".format(f1 * 100))

accuracy = accuracy_score(spacy_df['predicted'], spacy_df['is_duplicate'])

display(spacy_df)

print("Accuracy: {:.2f}%".format(accuracy * 100))

Percentage of 1s: 45.82%
Precision: 96.35%
Recall: 40.24%
F1: 56.77%


Unnamed: 0.1,Unnamed: 0,question1,question2,is_duplicate,is_match,predicted
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0,1
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0,1
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0,1
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0,1
...,...,...,...,...,...,...
404346,404346,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0,0,1
404347,404347,Do you believe there is life after death?,Is it true that there is life after death?,1,1,1
404348,404348,What is one coin?,What's this coin?,0,0,1
404349,404349,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,1,0


Accuracy: 45.82%


In [45]:
# make me a table to showcase the percentages with the different methods

import pandas as pd

data = {'Method': ['BERT', 'Partial Ratio', 'Token Sort', 'Spacy'],
        'Percentage': [bert_percentage, partial_ratio_percentage, token_sort_percentage, spacy_percentage]}
df = pd.DataFrame(data, columns = ['Method', 'Percentage'])

print(df)

          Method  Percentage
0           BERT   77.211878
1  Partial Ratio   64.174443
2     Token Sort   65.418164
3          Spacy   45.820339
