In [2]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


In [26]:
# Load the training split (tab-separated). The file's first line is consumed as
# the header (header=0) and replaced by the column names below, instead of being
# read as a data row and dropped afterwards; that keeps it from forcing every
# column to string dtype.
# NOTE(review): read_csv's default quote handling can merge physical lines when a
# field contains an unbalanced double quote -- verify the resulting row count
# against the corpus documentation (quoting=csv.QUOTE_NONE disables it). Not
# changed here because it would alter the row counts the later cells were run with.
train_data = pd.read_csv(
    'data/msr_paraphrase_corpus/msr_paraphrase_train.txt',
    sep="\t",
    header=0,
    names=['class', 'id1', 'id2', 'text1', 'text2'],
)

# Join both sentences of a pair into one document, drop the id columns, and
# discard rows with any missing field (a missing sentence makes 'text' NaN too).
train_data = (
    train_data
    .assign(text=train_data['text1'] + ' ' + train_data['text2'])
    .drop(columns=['id1', 'id2'])
    .dropna()
)

# Labels are used as integer class ids by the classifier cells.
train_data['class'] = train_data['class'].astype(int)


In [36]:
# test
# Load the test split exactly like the training split: the file's first line is
# consumed as the header (header=0) and replaced by the column names below,
# rather than being read as a data row and dropped afterwards.
# NOTE(review): read_csv's default quote handling can merge physical lines when a
# field contains an unbalanced double quote -- verify the resulting row count
# against the corpus documentation (quoting=csv.QUOTE_NONE disables it). Not
# changed here because it would alter the row counts the later cells were run with.
test_data = pd.read_csv(
    'data/msr_paraphrase_corpus/msr_paraphrase_test.txt',
    sep="\t",
    header=0,
    names=['class', 'id1', 'id2', 'text1', 'text2'],
)

# Join both sentences of a pair into one document, drop the id columns, and
# discard rows with any missing field (a missing sentence makes 'text' NaN too).
test_data = (
    test_data
    .assign(text=test_data['text1'] + ' ' + test_data['text2'])
    .drop(columns=['id1', 'id2'])
    .dropna()
)

# Labels are used as integer class ids by the classifier cells.
test_data['class'] = test_data['class'].astype(int)

# Training rows first, then test rows: later cells split the vectorized matrix
# positionally, so this order must be preserved.
final_data = pd.concat([train_data, test_data])

In [46]:
def tok_helper(word):
    """Return `word` with every '.' and ',' character removed.

    A token made up solely of those characters comes back as the empty string.
    """
    # One translation pass deletes both punctuation characters.
    return word.translate(str.maketrans("", "", ".,"))

def lemma_tokenizer(text):
    """Tokenize `text`, drop English stopwords, and lemmatize what remains.

    Tokens come from NLTK's WordPunctTokenizer. A token is kept when it is not
    in NLTK's English stopword list; kept tokens have '.' and ',' stripped by
    `tok_helper` and are then passed through WordNetLemmatizer.lemmatize.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order. Tokens that consisted only
        of '.'/',' characters are returned as empty strings (not filtered out).
    """
    wpt = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    # Fetch the stopword list once per call and keep it as a set: evaluating
    # stopwords.words('english') inside the comprehension condition re-ran it
    # for every token and searched the result as a list.
    stop_words = set(stopwords.words('english'))
    return [
        lemmatizer.lemmatize(tok_helper(w))
        for w in wpt.tokenize(text)
        # The stopword test is applied to the raw token, before punctuation stripping.
        if w not in stop_words
    ]

# Bag-of-words counts over the combined train+test documents, tokenized with
# lemma_tokenizer.
# NOTE(review): the vocabulary is fitted on final_data, which includes the test
# documents; fit on the training text only if a strict train/test separation of
# the feature space is required.
vectorizer = CountVectorizer(tokenizer=lemma_tokenizer)
final_vector = vectorizer.fit_transform(final_data['text'])

# final_data holds the training rows first, so the first len(train_data) rows of
# final_vector are the training matrix. Deriving the split point from the frame
# keeps features and labels aligned if the row count ever changes (it was the
# hard-coded literal 3941).
classifier = LogisticRegression(max_iter=1000)
classifier.fit(final_vector[:len(train_data)], train_data['class'].values)

LogisticRegression(max_iter=1000)

In [47]:
# (n_documents, vocabulary_size) of the combined train+test count matrix.
final_vector.shape

(5582, 14348)

In [50]:
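# Disabled scratch lines: inspecting the test portion of the jointly vectorized
# matrix (rows 3941 onward), and an alternative that would vectorize the test
# split on its own with the already-fitted vectorizer.
# test_vector = vectorizer.transform(test_data['text'])
# final_vector[3941:]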

<1641x14348 sparse matrix of type '<class 'numpy.int64'>'
	with 30943 stored elements in Compressed Sparse Row format>

In [54]:
# The test rows follow the training rows in final_vector, so slice from
# len(train_data) onward rather than from a hard-coded row number.
preds = classifier.predict(final_vector[len(train_data):])

In [63]:
# Mean accuracy on the test rows, i.e. everything after the first
# len(train_data) rows of final_vector (was the hard-coded literal 3941).
classifier.score(final_vector[len(train_data):], test_data['class'])

0.6489945155393053

In [70]:
# Per-class precision / recall / F1 on the test split. `metrics` is imported in
# the notebook's top import cell so that every dependency is visible in one place.
report = metrics.classification_report(
    test_data['class'],
    preds,
    target_names=['0', '1'],
)

In [73]:
# classification_report returns one multi-line string; print it so the table is
# rendered with real line breaks instead of the escaped repr ('...\n...').
print(report)

'              precision    recall  f1-score   support\n\n           0       0.47      0.39      0.43       549\n           1       0.72      0.78      0.75      1092\n\n    accuracy                           0.65      1641\n   macro avg       0.59      0.59      0.59      1641\nweighted avg       0.64      0.65      0.64      1641\n'