In [1]:
#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [16]:
df_train= pd.read_csv('/kaggle/input/va-assignment2/SemEval-PIT2015-master/data/SemEval-PIT2015-github/SemEval-PIT2015-github/data/train.data', sep='\t', header=None)

In [17]:
df_test= pd.read_csv('/kaggle/input/va-assignment2/SemEval-PIT2015-master/data/SemEval-PIT2015-github/SemEval-PIT2015-github/data/test.data', sep='\t', header=None)

In [18]:
df_dev= pd.read_csv('/kaggle/input/va-assignment2/SemEval-PIT2015-master/data/SemEval-PIT2015-github/SemEval-PIT2015-github/data/dev.data', sep='\t', header=None)

In [19]:
header_col = ['ID', 'Topic_name', 'S1', 'S2', 'Label', 'S1Tag', 'S2Tag']
df_train.columns = header_col
df_test.columns = header_col
df_dev.columns = header_col

# **Changing Label to Binary Classification**

In [104]:
#Changing the Labels of Dev data and test data into binary form
'''For Dev Data
paraphrases: (3, 2) (4, 1) (5, 0)
non-paraphrases: (1, 4) (0, 5)
debatable: (2, 3)  which you may discard if training binary classifier

For Test Data
paraphrases: 4 or 5
non-paraphrases: 0 or 1 or 2  
debatable: 3   which we discarded in Paraphrase Identification evaluation
'''
def binary_label(df, lable_name):
    binary_label = []
    debateble = []
    for i in range(len(df)):
        if len(str(df[lable_name][i])) != 1:
            Yval = df[lable_name][i][1]
            Nval = df[lable_name][i][4]
            
            if int(Yval) > 2:
                binary_label.append(1)
            elif int(Yval) == 2:
                debateble.append(i)
            else:
                binary_label.append(0)
        else:
            score = int(df[lable_name][i])
            if score > 3:
                binary_label.append(1)
            elif score == 3:
                debateble.append(i)
            else:
                binary_label.append(0)
            
    df_f = df.drop(df.index[debateble])
    df_f[lable_name] = binary_label
    return df_f

In [24]:
df_f_train = binary_label(df_train, 'Label')
df_f_test = binary_label(df_test, 'Label')
df_f_dev = binary_label(df_dev, 'Label')

In [25]:
df_f_train['Label'].value_counts()
df_f_test['Label'].value_counts()
df_f_dev['Label'].value_counts()

0    2672
1    1470
Name: Label, dtype: int64

# **Data Cleaning and Preprocessing**

In [26]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Stem the words
    words = [stemmer.stem(word) for word in words]
    # Join the words back into a string
    text = ' '.join(words)
    return text

In [28]:
df_f_train['S1'] = df_f_train['S1'].apply(preprocess_text)
df_f_train['S2'] = df_f_train['S2'].apply(preprocess_text)
df_f_test['S1'] = df_f_test['S1'].apply(preprocess_text)
df_f_test['S2'] = df_f_test['S2'].apply(preprocess_text)
df_f_dev['S1'] = df_f_dev['S1'].apply(preprocess_text)
df_f_dev['S2'] = df_f_dev['S2'].apply(preprocess_text)

In [29]:
df_f_train = df_f_train.reset_index()
df_f_test = df_f_test.reset_index()
df_f_dev = df_f_dev.reset_index()

In [30]:
#Function for calculating performance measures
def PerformanceMeasure(y_true,y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1)

# **Baseline: Cosine Similarity between Sentence 1 and Sentence 2**

In [31]:
#Applying Baseline Algorithm: Evaluating Cosine Similarity between the 2 sentences

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim_vectors(v1, v2):
    v1 = v1.reshape(1,-1)
    v2 = v2.reshape(1,-1)
    return cosine_similarity(v1, v2)[0][0]

def calculate_Cosi(df):
    Cosine_score = []
    for index in range(len(df)):
        s1 = df['S1'][index]
        s2 = df['S2'][index]
        s_all = [s1, s2]
        vectorizer = CountVectorizer().fit_transform(s_all)
        vectors = vectorizer.toarray()
        csim = cosine_similarity(vectors)
        Cosine_score.append(cosine_sim_vectors(vectors[0], vectors[1]))
    return Cosine_score

In [32]:
CosSimi_train = calculate_Cosi(df_f_train)
CosSimi_test = calculate_Cosi(df_f_test)
CosSimi_dev = calculate_Cosi(df_f_dev)

In [33]:
df_f_train['Baseline_Cosine_Similarity']=CosSimi_train
df_f_test['Baseline_Cosine_Similarity']=CosSimi_test
df_f_dev['Baseline_Cosine_Similarity']=CosSimi_dev

In [34]:
df_f_dev['Baseline_Cosine_Similarity'].mean()

0.3803346758769155

In [35]:
# Converting cosine similiarities to binary based on threshold
def cosine_baseline_threshold(df):
    X=df['Baseline_Cosine_Similarity']
    after_threshold=[]
    for i in range(len(X)):
        if X[i] >= 0.38:
            after_threshold.append(1)
        else:
            after_threshold.append(0)
    df['Baseline_Threshold_CS']=after_threshold
    return df

In [36]:
df_f_train=cosine_baseline_threshold(df_f_train)
df_f_test=cosine_baseline_threshold(df_f_test)
df_f_dev=cosine_baseline_threshold(df_f_dev)

In [37]:
#Evaluating Performance measures
print('Train:')
PerformanceMeasure(df_f_train['Label'],df_f_train['Baseline_Threshold_CS'])
print('\nTest:')
PerformanceMeasure(df_f_test['Label'],df_f_test['Baseline_Threshold_CS'])
print('\nDev:')
PerformanceMeasure(df_f_dev['Label'],df_f_dev['Baseline_Threshold_CS'])

Train:
Accuracy: 0.6705117085862966
Precision: 0.515864068287969
Recall: 0.8015515515515516
F1 score: 0.6277315041646251

Test:
Accuracy: 0.7231503579952268
Precision: 0.40836012861736337
Recall: 0.7257142857142858
F1 score: 0.5226337448559673

Dev:
Accuracy: 0.6619990342829551
Precision: 0.5189393939393939
Recall: 0.6523809523809524
F1 score: 0.5780590717299577


# **Algorithm A: TF-IDF with Cosine Similarity**

In [94]:
#Importing necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer

In [95]:
#Converting Sentence 1 and Sentence 2 into list form
def Sent1Sent2_list(df):
    sent1 = []
    sent2 = []
    for i in range(len(df)):
        sent1.append(df['S1'][i])
        sent2.append(df['S2'][i])
    return sent1, sent2

In [96]:
# Embedding each sentence into a vector using TF_IDF and then computing cosine similarity of the two vectors
def TF_IDF_cosine_similarity(sent1,sent2):
#   TF_IDF_cosine_similarity_threshold =0.7
    sentence_list = [sent1,sent2]
    vect = TfidfVectorizer(stop_words="english") 
    tfidf = vect.fit_transform(sentence_list)
    pairwise_similarity = tfidf * tfidf.T 
    cosine_similarity = pairwise_similarity.toarray()[0][1]
    return cosine_similarity

In [97]:
#Computing cosine similarity score for TFIDF
def tf_idf_cosine_score(df):
    tf_idf_cosine=[]
    sent1,sent2=Sent1Sent2_list(df)
    for i in range(len(sent1)):
        tf_idf_cosine.append(TF_IDF_cosine_similarity(sent1[i],sent2[i]))
    return tf_idf_cosine

In [98]:
Train_tf_idf_cosine = tf_idf_cosine_score(df_f_train)
Test_tf_idf_cosine = tf_idf_cosine_score(df_f_test)
Dev_tf_idf_cosine = tf_idf_cosine_score(df_f_dev)

In [99]:
df_f_train['TF_IDF_Cosine_Similarity']=Train_tf_idf_cosine
df_f_test['TF_IDF_Cosine_Similarity']=Test_tf_idf_cosine
df_f_dev['TF_IDF_Cosine_Similarity']=Dev_tf_idf_cosine

In [100]:
df_f_dev['TF_IDF_Cosine_Similarity'].mean()

0.2670334779791318

In [101]:
# Converting cosine similiarities to binary based on threshold
def cosine_TDIDF_threshold(df):
    X=df['TF_IDF_Cosine_Similarity']
    after_threshold=[]
    for i in range(len(X)):
        if X[i] >= 0.28:
            after_threshold.append(1)
        else:
            after_threshold.append(0)
    df['TFIDF_Threshold_CS']=after_threshold
    return df

In [102]:
df_f_train=cosine_TDIDF_threshold(df_f_train)
df_f_test=cosine_TDIDF_threshold(df_f_test)
df_f_dev=cosine_TDIDF_threshold(df_f_dev)

In [103]:
#Evaluting Performance measures
print('Train:')
PerformanceMeasure(df_f_train['Label'],df_f_train['TFIDF_Threshold_CS'])
print('\nTest:')
PerformanceMeasure(df_f_test['Label'],df_f_test['TFIDF_Threshold_CS'])
print('\nDev:')
PerformanceMeasure(df_f_dev['Label'],df_f_dev['TFIDF_Threshold_CS'])

Train:
Accuracy: 0.6935819601040764
Precision: 0.5408793925481193
Recall: 0.7665165165165165
F1 score: 0.6342271456672534

Test:
Accuracy: 0.733890214797136
Precision: 0.41836734693877553
Recall: 0.7028571428571428
F1 score: 0.5245202558635395

Dev:
Accuracy: 0.6752776436504104
Precision: 0.5387476751394916
Recall: 0.591156462585034
F1 score: 0.563736620175154


# **Algorithm B - SBert with Cosine Similarity Algorithm**

In [38]:
#Installing necessary library
!pip install -U sentence-transformers

[0m

In [39]:
#Importing Ncessary Libraries
from sentence_transformers import SentenceTransformer
import torch

In [40]:
#Loading the sbert model
sbert_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [41]:
#Converting Sentence 1 and Sentence 2 into list form
def Sent1Sent2_list(df):
    sent1 = []
    sent2 = []
    for i in range(len(df['S1'])):
        sent1.append(df['S1'][i])
        sent2.append(df['S2'][i])
    return sent1,sent2

In [42]:
#Extracting Embeddings from sbert model
def compute_sbert_embedding(df):
    sent1,sent2=Sent1Sent2_list(df)
    sentence_embeddings1 = sbert_model.encode(sent1,show_progress_bar=True)
    sentence_embeddings2 = sbert_model.encode(sent2,show_progress_bar=True)
    return sentence_embeddings1,sentence_embeddings2

In [43]:
#Function for calculating cosine score
def cosine(u,v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [44]:
#Compute cosine-similarities using embeddings
def Sbert_cosine_similarity(df):
    result=[]
    sentence_embeddings1,sentence_embeddings2=compute_sbert_embedding(df)
    for i in range(len(sentence_embeddings1)):
        Sbert_Cosine_Similarity = cosine(sentence_embeddings1[i],sentence_embeddings2[i])
        result.append(Sbert_Cosine_Similarity)
    return result

In [45]:
Sbert_CosSimi_train = Sbert_cosine_similarity(df_f_train)
Sbert_CosSimi_test = Sbert_cosine_similarity(df_f_test)
Sbert_CosSimi_dev = Sbert_cosine_similarity(df_f_dev)

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

In [46]:
df_f_train['Sbert_Cosine_Similarity']=Sbert_CosSimi_train
df_f_test['Sbert_Cosine_Similarity']=Sbert_CosSimi_test
df_f_dev['Sbert_Cosine_Similarity']=Sbert_CosSimi_dev

In [47]:
df_f_dev['Sbert_Cosine_Similarity'].mean()

0.54846436

In [48]:
# Converting cosine similiarities to binary based on threshold
def cosine_sbert_threshold(df):
    X=df['Sbert_Cosine_Similarity']
    after_threshold=[]
    for i in range(len(X)):
        if X[i] >= 0.57:
            after_threshold.append(1)
        else:
            after_threshold.append(0)
    df['Sbert_Threshold_CS']=after_threshold
    return df

In [49]:
cosine_sbert_threshold(df_f_train)
cosine_sbert_threshold(df_f_test)
cosine_sbert_threshold(df_f_dev)

Unnamed: 0,level_0,index,ID,Topic_name,S1,S2,Label,S1Tag,S2Tag,Baseline_Cosine_Similarity,Baseline_Threshold_CS,Sbert_Cosine_Similarity,Sbert_Threshold_CS
0,0,0,17,A Walk To Remember,walk rememb definit true love,walk rememb im town im upset,0,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,0.316228,0,0.411617,0
1,1,1,17,A Walk To Remember,walk rememb definit true love,walk rememb cutest thing,1,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,0.447214,1,0.550081,0
2,2,2,17,A Walk To Remember,walk rememb definit true love,walk rememb abc famili your welcom,0,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,0.365148,0,0.411227,0
3,3,3,17,A Walk To Remember,walk rememb definit true love,walk rememb amaz inspir,1,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,A/O/DT/B-NP/O walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,0.447214,1,0.463302,0
4,4,4,17,A Walk To Remember,walk rememb definit true love,guy fave part walk rememb,0,A/O/DT/B-NP/O Walk/O/NN/I-NP/O to/O/TO/B-VP/O ...,BUT/O/CC/O/O GUYS/O/VBP/B-VP/B-EVENT ITS/O/PRP...,0.400000,1,0.498624,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4137,4722,4722,1902,iTunes,first podcast drop itun,wonder spend much itun,0,My/O/PRP$/B-NP/O first/O/JJ/I-NP/O podcast/O/N...,No/O/DT/B-NP/O wonder/O/NN/I-NP/O I/O/PRP/B-NP...,0.250000,0,0.313820,0
4138,4723,4723,1902,iTunes,first podcast drop itun,dead gone ep itun,0,My/O/PRP$/B-NP/O first/O/JJ/I-NP/O podcast/O/N...,Our/O/PRP/B-NP/O Dead/O/JJ/I-NP/O and/O/CC/I-N...,0.250000,0,0.477972,0
4139,4724,4724,1902,iTunes,first podcast drop itun,amslingshot 7 itun chart,0,My/O/PRP$/B-NP/O first/O/JJ/I-NP/O podcast/O/N...,amslingshots/O/NNP/B-NP/O is/O/VBZ/B-VP/O 7/O/...,0.288675,0,0.162879,0
4140,4725,4725,1902,iTunes,first podcast drop itun,itun delet voic memo,0,My/O/PRP$/B-NP/O first/O/JJ/I-NP/O podcast/O/N...,iTunes/B-company/NNP/B-NP/O deleted/O/VBD/B-VP...,0.250000,0,0.242315,0


In [51]:
#Evaluating Performance measures
print('Train:')
PerformanceMeasure(df_f_train['Label'],df_f_train['Sbert_Threshold_CS'])
print('\nTest:')
PerformanceMeasure(df_f_test['Label'],df_f_test['Sbert_Threshold_CS'])
print('\nDev:')
PerformanceMeasure(df_f_dev['Label'],df_f_dev['Sbert_Threshold_CS'])

Train:
Accuracy: 0.7120555073720729
Precision: 0.5582557738710789
Recall: 0.8105605605605606
F1 score: 0.6611553378240457

Test:
Accuracy: 0.7744630071599046
Precision: 0.4734848484848485
Recall: 0.7142857142857143
F1 score: 0.5694760820045558

Dev:
Accuracy: 0.6803476581361662
Precision: 0.5401982378854625
Recall: 0.6673469387755102
F1 score: 0.5970785149117469


# **------------------------------------------------------------------------------------------------------------------------------------------------------**