In [5]:
#Loading the required libraries
import pandas as pd
import numpy as np
import numpy
import csv
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import simhash
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.tokenize import word_tokenize

In [6]:
# Loading the training and testing data
train = pd.read_csv(r'Data/msr_paraphrase_train.txt', sep = '\t', quoting=csv.QUOTE_NONE)
test = pd.read_csv(r'Data/msr_paraphrase_test.txt', sep = '\t', quoting=csv.QUOTE_NONE)

In [7]:
def gen_accuracy(y_pred, y_actual):
    """Function to calculate the accuracy of a model, returns the accuracy
        Args: 
            y_pred: predicted values
            y_actual: actual values"""
    
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_actual[i]:
            count = count+1
    return (count/len(y_pred))*100

In [8]:
def write_csv(result, id_col, path):
    result_final = pd.DataFrame(data = list(zip(id_col, result)), columns = ['id', 'prediction'])
    result_final.to_csv(path, index = False)

## Simhash

In [9]:
def ham_dist(text1, text2):
    return simhash.Simhash(text1, f = 8).distance(simhash.Simhash(text2, f = 8))

In [10]:
train['sim_dist'] = train.apply(lambda x: ham_dist(x['#1 String'], x['#2 String']), axis = 1)

In [11]:
train.groupby('Quality').agg('mean')['sim_dist']

Quality
0    2.673469
1    2.188885
Name: sim_dist, dtype: float64

In [12]:
test['sim_dist'] = test.apply(lambda x: ham_dist(x['#1 String'], x['#2 String']), axis = 1)

In [13]:
result_df = []
for i in test['sim_dist']:
    if i > 2.1:
        result_df.append(0)
    else:
        result_df.append(1)

In [14]:
count = 0
for i in range(len(result_df)):
    if result_df[i] == test['Quality'].to_numpy()[i]:
        count = count +1

In [15]:
count/len(result_df)

0.5814492753623188

In [16]:
print('f1_socre:' + str(f1_score(test['Quality'], result_df, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['Quality'], result_df, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['Quality'], result_df, average = 'weighted')))

f1_socre:0.5912023395952938
recall_socre:0.5814492753623188
precision_socre:0.6104681610315508


In [19]:
write_csv(result_df, test['#1 ID'], 'results/NLP/MSR/simhash.csv')

## NER similarity

In [20]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    if union== 0:
        return 0
    else:
        return float(intersection) / union
    
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
            if type(i) == Tree:
                    current_chunk.append(" ".join([token for token, pos in i.leaves()]))
            elif current_chunk:
                    named_entity = " ".join(current_chunk)
                    if named_entity not in continuous_chunk:
                            continuous_chunk.append(named_entity)
                            current_chunk = []
            else:
                    continue
    return continuous_chunk

In [21]:
train['ne1'] = train.apply(lambda x:get_continuous_chunks(x['#1 String']), axis = 1)
train['ne2'] = train.apply(lambda x: get_continuous_chunks(x['#2 String']), axis = 1)

In [22]:
train['jac_sim'] = train.apply(lambda x: jaccard_similarity(x['ne1'], x['ne2']), axis = 1)

In [23]:
train.groupby('Quality').agg('mean')['jac_sim']

Quality
0    0.381154
1    0.415262
Name: jac_sim, dtype: float64

In [24]:
test['ne1'] = test.apply(lambda x:get_continuous_chunks(x['#1 String']), axis = 1)
test['ne2'] = test.apply(lambda x: get_continuous_chunks(x['#2 String']), axis = 1)

In [25]:
test['jac_sim'] = test.apply(lambda x: jaccard_similarity(x['ne1'], x['ne2']), axis = 1)

In [26]:
result_ner = []
for i in test['jac_sim']:
    if i > .41:
        result_ner.append(1)
    else:
        result_ner.append(0)

In [27]:
count = 0
for i in range(len(result_ner)):
    if result_ner[i] == test['Quality'].to_numpy()[i]:
        count = count +1

In [28]:
count/len(result_ner)

0.4985507246376812

In [29]:
print('f1_socre:' + str(f1_score(test['Quality'], result_ner, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['Quality'], result_ner, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['Quality'], result_ner, average = 'weighted')))

f1_socre:0.510405536492493
recall_socre:0.4985507246376812
precision_socre:0.5743931083748017


In [30]:
write_csv(result_ner, test['#1 ID'], 'results/NLP/MSR/ner.csv')

## Cosine Distance

In [31]:
#training and testing
sent = list(train['#1 String']) + list(train['#2 String']) + list(test['#1 String']) + list(test['#2 String'])

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)

vectorizer.fit(sent)

#train_sent1 = train['#1 String'] + " " + train['#2 String']
#test_sent = test['#1 String']+ " " + test['#2 String']

# sent_vect_train1 = vectorizer.transform(train['#1 String'])
# sent_vect_train2 = vectorizer.transform(train['#2 String'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=30000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [32]:
def cosine_sim(text1, text2):
    
    tfidf = vectorizer.transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]


In [33]:
train['cosine_sim'] = train.apply(lambda x: cosine_sim(x['#1 String'], x['#2 String']), axis= 1)

In [34]:
train.groupby('Quality').agg('mean')['cosine_sim']

Quality
0    0.556032
1    0.706691
Name: cosine_sim, dtype: float64

In [35]:
test['cosine_sim'] = test.apply(lambda x: cosine_sim(x['#1 String'], x['#2 String']), axis= 1)

In [36]:
result_cosine = []
for i in test['cosine_sim'].to_numpy():
    if i> .7:
        result_cosine.append(1)
    else:
        result_cosine.append(0)

In [37]:
count = 0
for i in range(len(result_cosine)):
    if result_cosine[i] == test['Quality'].to_numpy()[i]:
        count = count+1

In [38]:
count/len(result_cosine)

0.6457971014492754

In [39]:
print('f1_socre:' + str(f1_score(test['Quality'], result_cosine, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['Quality'], result_cosine, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['Quality'], result_cosine, average = 'weighted')))

f1_socre:0.6543766516974899
recall_socre:0.6457971014492754
precision_socre:0.7274937668865716


In [40]:
write_csv(result_cosine, test['#1 ID'], 'results/NLP/MSR/cosine.csv')