# Knowledge Based Measures on OOP Dataset

## Import Packages

In [None]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
# Fetch every NLTK resource the notebook relies on: the WordNet corpus, the
# Punkt tokenizer models, the POS tagger model, and the information-content
# files for WordNet.
for resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger', 'wordnet_ic'):
    nltk.download(resource)

# Information-content table loaded from 'ic-brown.dat'; passed to the
# jcn_similarity / lin_similarity calls further down.
brown_ic = wordnet_ic.ic('ic-brown.dat')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.


## Functions common for KBM

In [None]:
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to a single-letter WordNet POS tag.

    Tags starting with 'N', 'V', 'J' or 'R' map to 'n', 'v', 'a' and 'r'
    respectively. Any other tag (including the empty string) yields None.
    """
    prefix_to_wn = (
        ('N', 'n'),
        ('V', 'v'),
        ('J', 'a'),
        ('R', 'r'),
    )
    for prefix, wn_pos in prefix_to_wn:
        if tag.startswith(prefix):
            return wn_pos
    return None
 
def tagged_to_synset(word, tag):
    """Return the first WordNet synset for a POS-tagged word, or None.

    Args:
        word: The token text.
        tag: The Penn Treebank tag assigned to the token.

    Returns:
        The first synset returned by ``wn.synsets(word, wn_tag)``, or None
        when the tag has no WordNet equivalent or WordNet has no synset for
        the word under that part of speech.
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None

    # Check for an empty result explicitly instead of indexing under a bare
    # `except:`; the bare clause also swallowed KeyboardInterrupt and hid
    # real failures (e.g. a missing corpus) as "no synset".
    synsets = wn.synsets(word, wn_tag)
    return synsets[0] if synsets else None
  

Similarity("Cats are beautiful animals.", "Dogs are awesome.") = 3.3333333333333335e+299
Similarity("Dogs are awesome.", "Cats are beautiful animals.") = 5e+299
Similarity("Cats are beautiful animals.", "Some gorgeous creatures are felines.") = 6.666666666666667e+299
Similarity("Some gorgeous creatures are felines.", "Cats are beautiful animals.") = 6.666666666666667e+299
Similarity("Cats are beautiful animals.", "Dolphins are swimming mammals.") = 3.3333333333333335e+299
Similarity("Dolphins are swimming mammals.", "Cats are beautiful animals.") = 2.5e+299
Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1e+300
Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1e+300


## Jiang-Conrath Functions

In [None]:
def jc_sentence_similarity(sentence1, sentence2):
    """Compute a directed Jiang-Conrath sentence similarity using WordNet.

    Each sentence is tokenized and POS-tagged, and every token is mapped to
    its first WordNet synset (tokens without one are dropped). For every
    synset of ``sentence1`` the best ``jcn_similarity`` score against the
    synsets of ``sentence2`` is taken, and those best scores are averaged.

    The measure is directional: swapping the arguments can change the result.

    Args:
        sentence1: Text whose words are matched against ``sentence2``.
        sentence2: Text providing the candidate matches.

    Returns:
        The mean of the per-word best scores, or 0.0 when no word of
        ``sentence1`` can be compared with any word of ``sentence2``
        (previously this case raised ZeroDivisionError).
    """
    # Tokenize and tag (kept in new names so the inputs are not rebound)
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    # Map the tagged words to synsets and drop the words that have none
    synsets1 = [ss for ss in (tagged_to_synset(*tw) for tw in tagged1) if ss]
    synsets2 = [ss for ss in (tagged_to_synset(*tw) for tw in tagged2) if ss]

    score, count = 0.0, 0

    # For each word in the first sentence, keep its best match in the second
    for syn1 in synsets1:
        candidates = []
        for syn2 in synsets2:
            try:
                simi_score = syn1.jcn_similarity(syn2, brown_ic)
            except Exception:
                # Best effort: a pair that cannot be scored is skipped.
                # `Exception` (not a bare except) lets KeyboardInterrupt
                # through so a long run can still be stopped.
                continue
            if simi_score is not None:
                candidates.append(simi_score)
        if candidates:
            # NOTE(review): the recorded outputs show ~1e+300 for identical
            # sentences, so jcn_similarity appears to return a huge sentinel
            # for identical synsets that dominates this average; consider
            # capping or normalizing it.
            score += max(candidates)
            count += 1

    # Nothing comparable: report zero similarity instead of dividing by zero
    if count == 0:
        return 0.0

    # Average the per-word best scores
    return score / count

In [None]:
def jc_symmetric_sentence_similarity(sentence1, sentence2):
    """Average the Jiang-Conrath sentence similarity over both directions.

    Evaluates ``jc_sentence_similarity`` for (sentence1, sentence2) and for
    (sentence2, sentence1) and returns the mean of the two values.
    """
    forward = jc_sentence_similarity(sentence1, sentence2)
    backward = jc_sentence_similarity(sentence2, sentence1)
    return (forward + backward) / 2

SymmetricSimilarity("Cats are beautiful animals.", "Dogs are awesome.") = 4.166666666666667e+299
SymmetricSimilarity("Dogs are awesome.", "Cats are beautiful animals.") = 4.166666666666667e+299
SymmetricSimilarity("Cats are beautiful animals.", "Some gorgeous creatures are felines.") = 6.666666666666667e+299
SymmetricSimilarity("Some gorgeous creatures are felines.", "Cats are beautiful animals.") = 6.666666666666667e+299
SymmetricSimilarity("Cats are beautiful animals.", "Dolphins are swimming mammals.") = 2.916666666666667e+299
SymmetricSimilarity("Dolphins are swimming mammals.", "Cats are beautiful animals.") = 2.916666666666667e+299
SymmetricSimilarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1e+300
SymmetricSimilarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1e+300


## Lin Functions

In [None]:
def lin_sentence_similarity(sentence1, sentence2):
    """Compute a directed Lin sentence similarity using WordNet.

    Each sentence is tokenized and POS-tagged, and every token is mapped to
    its first WordNet synset (tokens without one are dropped). For every
    synset of ``sentence1`` the best ``lin_similarity`` score against the
    synsets of ``sentence2`` is taken, and those best scores are averaged.

    The measure is directional: swapping the arguments can change the result.

    Args:
        sentence1: Text whose words are matched against ``sentence2``.
        sentence2: Text providing the candidate matches.

    Returns:
        The mean of the per-word best scores, or 0.0 when no word of
        ``sentence1`` can be compared with any word of ``sentence2``
        (previously this case raised ZeroDivisionError).
    """
    # Tokenize and tag (kept in new names so the inputs are not rebound)
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    # Map the tagged words to synsets and drop the words that have none
    synsets1 = [ss for ss in (tagged_to_synset(*tw) for tw in tagged1) if ss]
    synsets2 = [ss for ss in (tagged_to_synset(*tw) for tw in tagged2) if ss]

    score, count = 0.0, 0

    # For each word in the first sentence, keep its best match in the second
    for syn1 in synsets1:
        candidates = []
        for syn2 in synsets2:
            try:
                simi_score = syn1.lin_similarity(syn2, brown_ic)
            except Exception:
                # Best effort: a pair that cannot be scored is skipped.
                # `Exception` (not a bare except) lets KeyboardInterrupt
                # through so a long run can still be stopped.
                continue
            if simi_score is not None:
                candidates.append(simi_score)
        if candidates:
            score += max(candidates)
            count += 1

    # Nothing comparable: report zero similarity instead of dividing by zero
    if count == 0:
        return 0.0

    # Average the per-word best scores
    return score / count


Similarity("Cats are beautiful animals.", "Dogs are awesome.") = 0.861934542665209
Similarity("Dogs are awesome.", "Cats are beautiful animals.") = 0.9384004921866986
Similarity("Cats are beautiful animals.", "Some gorgeous creatures are felines.") = 0.9928986752928762
Similarity("Some gorgeous creatures are felines.", "Cats are beautiful animals.") = 0.9928986752928762
Similarity("Cats are beautiful animals.", "Dolphins are swimming mammals.") = 0.9013249930561423
Similarity("Dolphins are swimming mammals.", "Cats are beautiful animals.") = 0.6278107429022008
Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1.0
Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1.0


In [None]:
def lin_symmetric_sentence_similarity(sentence1, sentence2):
    """Average the Lin sentence similarity over both directions.

    Evaluates ``lin_sentence_similarity`` for (sentence1, sentence2) and for
    (sentence2, sentence1) and returns the mean of the two values.
    """
    forward = lin_sentence_similarity(sentence1, sentence2)
    backward = lin_sentence_similarity(sentence2, sentence1)
    return (forward + backward) / 2


SymmetricSimilarity("Cats are beautiful animals.", "Dogs are awesome.") = 0.9001675174259538
SymmetricSimilarity("Dogs are awesome.", "Cats are beautiful animals.") = 0.9001675174259538
SymmetricSimilarity("Cats are beautiful animals.", "Some gorgeous creatures are felines.") = 0.9928986752928762
SymmetricSimilarity("Some gorgeous creatures are felines.", "Cats are beautiful animals.") = 0.9928986752928762
SymmetricSimilarity("Cats are beautiful animals.", "Dolphins are swimming mammals.") = 0.7645678679791716
SymmetricSimilarity("Dolphins are swimming mammals.", "Cats are beautiful animals.") = 0.7645678679791716
SymmetricSimilarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1.0
SymmetricSimilarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1.0


## Applying the KBM functions on the Dataset

In [None]:
# Location of the question-pair dataset that is read with pandas below.
DATASET_CSV = (
    '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/'
    'descriptive_evaluation_project/Knowledge Based Measures/'
    'train_main.csv'
)

In [None]:
import pandas as pd
oop_df = pd.read_csv(DATASET_CSV)

# Take an explicit copy of the two question columns. Without `.copy()` the
# selection stays tied to `oop_df`, and writing score columns into it later
# triggers pandas' SettingWithCopyWarning.
output = oop_df[['question1', 'question2']].copy()

In [None]:
questions_cols = ['question1', 'question2']

# Pair up the two question columns row by row.
question_pairs = list(zip(output[questions_cols[0]], output[questions_cols[1]]))

# Build each score column in full and attach it with `assign`. The previous
# chained indexing (`output[col][i] = ...`) wrote into columns that were never
# created (KeyError on a fresh kernel) and triggered SettingWithCopyWarning.
output = output.assign(**{
    'Jiang-Conarth': [
        jc_symmetric_sentence_similarity(q1, q2) for q1, q2 in question_pairs
    ],
    'Lin': [
        lin_symmetric_sentence_similarity(q1, q2) for q1, q2 in question_pairs
    ],
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# Display the question pairs together with their 'Jiang-Conarth' and 'Lin' scores.
output

Unnamed: 0,question1,question2,Jiang-Conarth,Lin
0,Dynamic (run time) polymorphism is the polymor...,Constructor is piece of code which we are usin...,3.202765e+299,0.415613
1,A virtual function or virtual method in an OOP...,"Within a single program, output of a function ...",3.214286e+299,0.491242
2,new: Allocates memory for the object on the fr...,Object is an instance of a class. An object in...,2.795699e+299,0.497885
3,Overloading occurs when two or more methods in...,OOPs allows us to hide implementation details ...,8.222303e-02,0.092380
4,An interface is most certainly not a blueprint...,An interface is better than a abstract class w...,4.586835e+299,0.582632
...,...,...,...,...
211533,OOPs allows us to hide implementation details ...,"Exception is an abnormal condition. In Java, a...",1.294426e-01,0.217773
211534,"When Java encounters an exception, it throws i...","An interface is a completely ""abstract class"" ...",9.756684e-02,0.101838
211535,Copy constructor is called when a new object i...,Overriding occurs when two methods have the sa...,1.678322e+299,0.349763
211536,Dynamic Polymorphism(Late Binding/ Runtime Pol...,The interface consists of the signatures of me...,7.045455e+298,0.211246


In [None]:
# Persist the scored question pairs as a UTF-8 encoded CSV file.
output.to_csv(
    '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/'
    'descriptive_evaluation_project/Knowledge Based Measures/'
    'oop_output.csv',
    encoding='utf-8',
)