In [1]:
#Loading the required libraries
import pandas as pd
import numpy as np
import numpy
import csv
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import models.ml_models as ml_models
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
#Loading the data
df = pd.read_csv('Data/train.csv')
df['question1'] = df['question1'].apply(str)
df['question2'] = df['question2'].apply(str)
df.dropna(inplace = True)
#df = df[:30000]

In [3]:
#training and testing
from sklearn.model_selection import train_test_split
seed = 123
train, test = train_test_split(df)
q = list(train['question1']) + list(train['question2']) + list(test['question1']) + list(test['question2'])

In [4]:
def gen_accuracy(y_pred, y_actual):
    """Function to calculate the accuracy of a model, returns the accuracy
        Args: 
            y_pred: predicted values
            y_actual: actual values"""
    
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_actual[i]:
            count = count+1
    return (count/len(y_pred))*100

In [5]:
#training and testing
sent = list(train['question1']) + list(train['question2']) + list(test['question1']) + list(test['question2'])

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)

vectorizer.fit(sent)

train_sent = train['question2'] + " " + train['question1']
test_sent = test['question1']+ " " + test['question2']

sent_vect_train = vectorizer.transform(train_sent)
sent_vect_test = vectorizer.transform(test_sent)

In [8]:
from importlib import reload
reload(ml_models)

<module 'models.ml_models' from 'C:\\Users\\hkpat\\Masters\\NLP\\NLI_project\\models\\ml_models.py'>

In [9]:
ml = ml_models.ml_models(sent_vect_train, train['is_duplicate']) 

### Logistic regression

In [10]:
result_lr = ml.logistic_regression(sent_vect_test)

In [11]:
result_lr

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [12]:
gen_accuracy(result_lr, test['is_duplicate'].to_numpy())

76.91371582915319

In [13]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_lr, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_lr, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_lr, average = 'weighted')))

f1_socre:0.7663579349632655
recall_socre:0.7691371582915318
precision_socre:0.7658356373356363


### SVM

In [32]:
#SVC
result_svc = ml.svm(sent_vect_test)

In [33]:
gen_accuracy(result_svc, test['Quality'])

70.66666666666667

In [36]:
print('f1_socre:' + str(f1_score(test['Quality'], result_svc, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['Quality'], result_svc, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['Quality'], result_svc, average = 'weighted')))

f1_socre:0.6648984227723238
recall_socre:0.7066666666666667
precision_socre:0.6937687991021324


### Random Forest

In [20]:
#RF
result_rf = ml.random_forest(sent_vect_test, max_depth=100, n_estimators=400)

In [23]:
gen_accuracy(result_rf, test['is_duplicate'].to_numpy())

81.33527252579819

In [25]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_rf, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_rf, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_rf, average = 'weighted')))

f1_socre:0.8074898342401919
recall_socre:0.8133527252579819
precision_socre:0.8136139803947952


### XG Boost

In [26]:
result_xg = ml.xgbclassifier(sent_vect_test, learning_rate = 1)

In [28]:
gen_accuracy(result_xg, test['is_duplicate'].to_numpy())

76.89392815094041

In [29]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_xg, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_xg, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_xg, average = 'weighted')))

f1_socre:0.7647417270803679
recall_socre:0.768939281509404
precision_socre:0.764988861129378


### Adaboost

In [30]:
result_ada = ml.adaboost(sent_vect_test, n_estimators=400, learning_rate=1)

In [32]:
gen_accuracy(result_ada, test['is_duplicate'].to_numpy())

74.12563196897291

In [33]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_ada, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_ada, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_ada, average = 'weighted')))

f1_socre:0.7325606018746793
recall_socre:0.7412563196897292
precision_socre:0.7355763708732987


### Voting Classifier

In [4]:
result_voting = ml.voting_classifier(sent_vect_test)

In [5]:
gen_accuracy(result_voting, test['is_duplicate'].to_numpy())

In [6]:
print('f1_socre:' + str(f1_score(test['is_duplicate'], result_voting, average = 'weighted')))
print('recall_socre:' + str(recall_score(test['is_duplicate'], result_voting, average = 'weighted')))
print('precision_socre:' + str(precision_score(test['is_duplicate'], result_voting, average = 'weighted')))