In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords


In [59]:
# making the dataset
def load_paraphrase_split(path):
    """Read one tab-separated paraphrase split and return the cleaned frame.

    The file is read without a header (column names are supplied here), the
    row labelled 0 is dropped, the two id columns are removed, a combined
    'text' column is built from 'text1' and 'text2', rows containing any
    missing value are discarded, and 'class' is converted to integers.

    Args:
        path: Location of the tab-separated file.

    Returns:
        DataFrame with columns 'class', 'text1', 'text2', 'text'.
    """
    # NOTE(review): read_csv's default quote handling can swallow lines when a
    # sentence contains an unbalanced double quote; if the row counts look
    # short, quoting=csv.QUOTE_NONE may be needed -- confirm against the raw file.
    raw = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=['class', 'id1', 'id2', 'text1', 'text2'],
    )
    cleaned = (
        raw
        .drop([0])  # presumably the file's own header line, read as data because header=None
        .drop(columns=['id1', 'id2'])
        # a missing text1/text2 makes 'text' NaN, so the row is removed by dropna()
        .assign(text=lambda df: df['text1'] + ' ' + df['text2'])
        .dropna()
    )
    cleaned['class'] = cleaned['class'].astype(int)
    return cleaned


# train
train_data = load_paraphrase_split('data/msr_paraphrase_corpus/msr_paraphrase_train.txt')

# test
test_data = load_paraphrase_split('data/msr_paraphrase_corpus/msr_paraphrase_test.txt')

# train rows first, then test rows
final_data = pd.concat([train_data, test_data])


In [15]:
def tok_helper(word):
    """Normalise a single token by removing '.' and ',' and lower-casing it.

    Args:
        word: Token string.

    Returns:
        The token with every period and comma removed, in lower case. The
        result is an empty string when the token consisted only of those
        characters.
    """
    # Strings are immutable, so each call returns a new string; a bare
    # `word.lower()` statement whose result is discarded has no effect.
    return word.replace(".", "").replace(",", "").lower()

def lemma_tokenizer(text):
    """Tokenize text, drop stopwords, and return normalised, lemmatized tokens.

    The text is split with WordPunctTokenizer. Tokens found in NLTK's English
    stopword list are skipped; every remaining token is passed through
    tok_helper and then WordNetLemmatizer.lemmatize.

    Note that the stopword test is applied to the raw token, before
    tok_helper lower-cases it, and that tok_helper turns tokens made only of
    '.'/',' into empty strings, which are kept in the output.

    Args:
        text: Input string.

    Returns:
        List of processed token strings, in their original order.
    """
    wpt = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    # Fetch the stopword list once per call instead of once per token, and
    # keep it in a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))
    return [
        lemmatizer.lemmatize(tok_helper(w))
        for w in wpt.tokenize(text)
        if w not in stop_words
    ]


# classifier.fit

In [45]:
# Word Overlap
def predict_overlap(t1, t2, threshold=0.5):
    """Predict whether two texts are paraphrases from their token overlap.

    Both texts are run through lemma_tokenizer and reduced to sets; the score
    is the size of the intersection divided by the size of the union.

    Args:
        t1: First text.
        t2: Second text.
        threshold: Score that must be strictly exceeded for a positive
            prediction. Defaults to 0.5.

    Returns:
        True when the overlap score is greater than `threshold`, otherwise
        False. Two texts that both produce no tokens yield False.
    """
    t1_tok = set(lemma_tokenizer(t1))
    t2_tok = set(lemma_tokenizer(t2))
    union = t1_tok | t2_tok
    if not union:
        # Neither text produced a token: the ratio is undefined, and dividing
        # would raise ZeroDivisionError.
        return False
    return len(t1_tok & t2_tok) / len(union) > threshold

# Predict every test pair, in row order.
over_pred = [
    predict_overlap(first, second)
    for first, second in zip(test_data['text1'], test_data['text2'])
]

# Boolean predictions compare equal to the integer 'class' labels
# element-wise (True == 1, False == 0).
matches = np.asarray(over_pred) == test_data['class'].values
c = int(matches.sum())
acc = c / len(over_pred)
print(acc)

0.6630103595368677


In [41]:
# LogReg
# The vocabulary is built from the train and test text together.
vectorizer = CountVectorizer(tokenizer=lemma_tokenizer)
final_vector = vectorizer.fit_transform(final_data['text'])

# final_data is the train rows followed by the test rows, so the split point
# is the length of the training frame rather than a hard-coded row count.
n_train = len(train_data)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(final_vector[:n_train], train_data['class'].values)

classifier.score(final_vector[n_train:], test_data['class'])

0.6489945155393053

In [57]:
# N-gram overlap
n_gram_count = CountVectorizer(tokenizer=lemma_tokenizer, ngram_range=(1,3))
def predict_overlap_n(t1, t2, threshold=0.5):
    """Predict whether two texts are paraphrases from their n-gram overlap.

    Each text is turned into its set of 1- to 3-grams using the analyzer of
    the module-level `n_gram_count` vectorizer; the score is the size of the
    intersection of the two sets divided by the size of their union.

    Args:
        t1: First text.
        t2: Second text.
        threshold: Score that must be strictly exceeded for a positive
            prediction. Defaults to 0.5.

    Returns:
        True when the overlap score is greater than `threshold`, otherwise
        False. Two texts that both produce no n-grams yield False.
    """
    # The analyzer applies the vectorizer's preprocessing, tokenizer and
    # n-gram expansion directly, so the shared vectorizer does not have to be
    # re-fitted (and mutated) for every sentence, and a text with no tokens
    # gives an empty set instead of an "empty vocabulary" error from fit().
    analyzer = n_gram_count.build_analyzer()
    t1_tok = set(analyzer(t1))
    t2_tok = set(analyzer(t2))
    union = t1_tok | t2_tok
    if not union:
        # Ratio is undefined when neither text has any n-gram.
        return False
    return len(t1_tok & t2_tok) / len(union) > threshold

# Predict every test pair with the n-gram overlap rule, in row order.
over_pred = [
    predict_overlap_n(first, second)
    for first, second in zip(test_data['text1'], test_data['text2'])
]

# Boolean predictions compare equal to the integer 'class' labels
# element-wise (True == 1, False == 0).
matches = np.asarray(over_pred) == test_data['class'].values
c = int(matches.sum())
acc = c / len(over_pred)
print(acc)



0.4673979280926264


In [58]:
from sklearn import metrics

# The test rows sit directly after the train rows in final_vector, so the
# slice start is derived from the training frame instead of a literal.
n_train = len(train_data)
preds = classifier.predict(final_vector[n_train:])
report = metrics.classification_report(test_data['class'], preds, target_names=['0','1'])
# The report is a multi-line string; print it so the table is rendered with
# real line breaks instead of a quoted repr containing "\n".
print(report)

'              precision    recall  f1-score   support\n\n           0       0.47      0.39      0.43       549\n           1       0.72      0.78      0.75      1092\n\n    accuracy                           0.65      1641\n   macro avg       0.59      0.59      0.59      1641\nweighted avg       0.64      0.65      0.64      1641\n'