In [1]:
#Loading the required libraries
import pandas as pd
import numpy as np
import numpy
import csv
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import models.ml_models as ml_models
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Loading the training and testing data
# Both files are read as tab-separated text. quoting=csv.QUOTE_NONE makes the
# parser treat quote characters as ordinary text rather than field quoting
# (presumably because the sentences contain unbalanced quotes — TODO confirm).
train = pd.read_csv(r'Data/msr_paraphrase_train.txt', sep = '\t', quoting=csv.QUOTE_NONE)
test = pd.read_csv(r'Data/msr_paraphrase_test.txt', sep = '\t', quoting=csv.QUOTE_NONE)

In [3]:
def gen_accuracy(y_pred, y_actual):
    """Return the percentage of predictions that equal the actual labels.

    Values are compared by position, so pandas Series with a non-default
    index (e.g. after filtering or shuffling) are handled correctly.

    Args:
        y_pred: predicted values (list, NumPy array or pandas Series)
        y_actual: actual values, same number of elements as ``y_pred``

    Returns:
        float: accuracy as a percentage between 0 and 100.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    # Convert to flat arrays first: ``series[i]`` is a label lookup on a
    # pandas Series, which fails or misaligns when the index is not 0..n-1.
    pred = np.asarray(y_pred).ravel()
    actual = np.asarray(y_actual).ravel()

    if pred.size != actual.size:
        raise ValueError(
            "y_pred and y_actual must have the same length "
            "(got {} and {})".format(pred.size, actual.size)
        )
    if pred.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    count = int(np.count_nonzero(pred == actual))
    return (count / pred.size) * 100

In [12]:
def write_csv(result, id_col, path):
    """Write predictions to a two-column CSV file (``id``, ``prediction``).

    Args:
        result: iterable of predicted values
        id_col: iterable of row identifiers, paired positionally with
            ``result`` (pairs stop at the shorter of the two, as with ``zip``)
        path: destination file path, or any other target accepted by
            ``DataFrame.to_csv``
    """
    import os  # local import: `os` is not imported at the top of the notebook

    # Create the destination folder when it is missing; to_csv does not
    # create directories and fails if the folder does not exist.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    result_final = pd.DataFrame(data = list(zip(id_col, result)), columns = ['id', 'prediction'])
    result_final.to_csv(path, index = False)

In [6]:
# TF-IDF features for training and testing
# The vocabulary and IDF weights are learned from every sentence of both splits.
# NOTE(review): this includes the test sentences, so test-set statistics
# influence the features — confirm this is intended.
sent = [
    sentence
    for frame in (train, test)
    for column in ('#1 String', '#2 String')
    for sentence in frame[column]
]

vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=30000,
).fit(sent)

# Each sentence pair becomes one document: both sentences joined by a space.
train_sent = train['#1 String'] + " " + train['#2 String']
test_sent = test['#1 String'] + " " + test['#2 String']

sent_vect_train, sent_vect_test = (
    vectorizer.transform(docs) for docs in (train_sent, test_sent)
)

In [7]:
# Re-import the project module so that edits made to models/ml_models.py after
# the first import are picked up without restarting the kernel.
from importlib import reload
reload(ml_models)

<module 'models.ml_models' from 'C:\\Users\\hkpat\\Masters\\NLP\\NLI_project\\models\\ml_models.py'>

In [8]:
# Build the project's model wrapper from the TF-IDF training matrix and the
# `Quality` labels; the cells below ask it for predictions on the test matrix.
# NOTE(review): how and when each model is fitted is defined in
# models/ml_models.py, which is not shown here — verify there.
ml = ml_models.ml_models(sent_vect_train, train['Quality']) 

### Logistic regression

In [9]:
# Predictions for the test TF-IDF matrix from the wrapper's logistic_regression
# method (presumably fitted on the training data passed to ml_models — confirm).
result_lr = ml.logistic_regression(sent_vect_test)

In [10]:
# Percentage of test pairs whose logistic-regression prediction equals `Quality`
gen_accuracy(y_pred=result_lr, y_actual=test['Quality'])

67.76811594202898

In [11]:
# Support-weighted F1, recall and precision of the logistic-regression predictions.
# With average='weighted', recall equals plain accuracy for single-label predictions.
print(f"f1_socre:{f1_score(y_true=test['Quality'], y_pred=result_lr, average='weighted')!s}")
print(f"recall_socre:{recall_score(y_true=test['Quality'], y_pred=result_lr, average='weighted')!s}")
print(f"precision_socre:{precision_score(y_true=test['Quality'], y_pred=result_lr, average='weighted')!s}")

f1_socre:0.6493869193049334
recall_socre:0.6776811594202898
precision_socre:0.6520193236714975


In [13]:
# Save the logistic-regression predictions, keyed by the first sentence's ID
write_csv(
    result=result_lr,
    id_col=test['#1 ID'],
    path='results/machine_learning/MSR/lr.csv',
)

### SVM

In [14]:
# SVC: predictions for the test TF-IDF matrix from the wrapper's svm method
# (presumably fitted on the training data passed to ml_models — confirm).
result_svc = ml.svm(sent_vect_test)

In [15]:
# Percentage of test pairs whose SVM prediction equals `Quality`
gen_accuracy(y_pred=result_svc, y_actual=test['Quality'])

70.66666666666667

In [16]:
# Support-weighted F1, recall and precision of the SVM predictions.
# With average='weighted', recall equals plain accuracy for single-label predictions.
print(f"f1_socre:{f1_score(y_true=test['Quality'], y_pred=result_svc, average='weighted')!s}")
print(f"recall_socre:{recall_score(y_true=test['Quality'], y_pred=result_svc, average='weighted')!s}")
print(f"precision_socre:{precision_score(y_true=test['Quality'], y_pred=result_svc, average='weighted')!s}")

f1_socre:0.6648984227723238
recall_socre:0.7066666666666667
precision_socre:0.6937687991021324


In [17]:
# Save the SVM predictions, keyed by the first sentence's ID
write_csv(
    result=result_svc,
    id_col=test['#1 ID'],
    path='results/machine_learning/MSR/svm.csv',
)

### Random Forest

In [18]:
# RF: predictions for the test TF-IDF matrix from the wrapper's random_forest
# method (presumably fitted on the training data passed to ml_models — confirm).
result_rf = ml.random_forest(sent_vect_test)

In [19]:
# Percentage of test pairs whose random-forest prediction equals `Quality`
gen_accuracy(y_pred=result_rf, y_actual=test['Quality'])

70.78260869565217

In [21]:
# Support-weighted F1, recall and precision of the random-forest predictions.
# With average='weighted', recall equals plain accuracy for single-label predictions.
print(f"f1_socre:{f1_score(y_true=test['Quality'], y_pred=result_rf, average='weighted')!s}")
print(f"recall_socre:{recall_score(y_true=test['Quality'], y_pred=result_rf, average='weighted')!s}")
print(f"precision_socre:{precision_score(y_true=test['Quality'], y_pred=result_rf, average='weighted')!s}")

f1_socre:0.6386685803770947
recall_socre:0.7078260869565217
precision_socre:0.7307167481146715


In [22]:
# Save the random-forest predictions, keyed by the first sentence's ID
write_csv(
    result=result_rf,
    id_col=test['#1 ID'],
    path='results/machine_learning/MSR/rf.csv',
)

### XGBoost

In [23]:
# Predictions for the test TF-IDF matrix from the wrapper's xgbclassifier
# method (presumably fitted on the training data passed to ml_models — confirm).
result_xg = ml.xgbclassifier(sent_vect_test)

In [24]:
# Percentage of test pairs whose XGBoost prediction equals `Quality`
gen_accuracy(y_pred=result_xg, y_actual=test['Quality'])

69.33333333333334

In [25]:
# Support-weighted F1, recall and precision of the XGBoost predictions.
# With average='weighted', recall equals plain accuracy for single-label predictions.
print(f"f1_socre:{f1_score(y_true=test['Quality'], y_pred=result_xg, average='weighted')!s}")
print(f"recall_socre:{recall_score(y_true=test['Quality'], y_pred=result_xg, average='weighted')!s}")
print(f"precision_socre:{precision_score(y_true=test['Quality'], y_pred=result_xg, average='weighted')!s}")

f1_socre:0.609957824149157
recall_socre:0.6933333333333334
precision_socre:0.7124641050478957


In [26]:
# Save the XGBoost predictions, keyed by the first sentence's ID
write_csv(
    result=result_xg,
    id_col=test['#1 ID'],
    path='results/machine_learning/MSR/xgboost.csv',
)

### AdaBoost

In [27]:
# Predictions for the test TF-IDF matrix from the wrapper's adaboost method
# (presumably fitted on the training data passed to ml_models — confirm).
result_ada = ml.adaboost(sent_vect_test)

In [28]:
# Percentage of test pairs whose AdaBoost prediction equals `Quality`
gen_accuracy(y_pred=result_ada, y_actual=test['Quality'])

68.81159420289855

In [29]:
# Support-weighted F1, recall and precision of the AdaBoost predictions.
# With average='weighted', recall equals plain accuracy for single-label predictions.
print(f"f1_socre:{f1_score(y_true=test['Quality'], y_pred=result_ada, average='weighted')!s}")
print(f"recall_socre:{recall_score(y_true=test['Quality'], y_pred=result_ada, average='weighted')!s}")
print(f"precision_socre:{precision_score(y_true=test['Quality'], y_pred=result_ada, average='weighted')!s}")

f1_socre:0.6001074719302315
recall_socre:0.6881159420289855
precision_socre:0.701456735911171


In [30]:
# Save the AdaBoost predictions, keyed by the first sentence's ID
write_csv(
    result=result_ada,
    id_col=test['#1 ID'],
    path='results/machine_learning/MSR/adaboost.csv',
)

### Voting Classifier

In [31]:
# Predictions for the test TF-IDF matrix from the wrapper's voting_classifier
# method (presumably fitted on the training data passed to ml_models — confirm).
result_voting = ml.voting_classifier(sent_vect_test)



In [32]:
# Percentage of test pairs whose voting-classifier prediction equals `Quality`
gen_accuracy(y_pred=result_voting, y_actual=test['Quality'])

70.20289855072464

In [33]:
# Support-weighted F1, recall and precision of the voting-classifier predictions.
# With average='weighted', recall equals plain accuracy for single-label predictions.
print(f"f1_socre:{f1_score(y_true=test['Quality'], y_pred=result_voting, average='weighted')!s}")
print(f"recall_socre:{recall_score(y_true=test['Quality'], y_pred=result_voting, average='weighted')!s}")
print(f"precision_socre:{precision_score(y_true=test['Quality'], y_pred=result_voting, average='weighted')!s}")

f1_socre:0.6271961906851677
recall_socre:0.7020289855072464
precision_socre:0.7246883056560518


In [34]:
# Save the voting-classifier predictions, keyed by the first sentence's ID
write_csv(
    result=result_voting,
    id_col=test['#1 ID'],
    path='results/machine_learning/MSR/voting_classifier.csv',
)