# Parameter Tuning MSR Paraphrase

<p>The best parameters for the machine learning alogorithms for the msr paraphrase data. The following machine learning algorithms models will be tuned:
<ul> <li> Logisitc Regression</li>
    <li> Support vector classification </li>
    <li> Random Forest</li>
    <li> XGB Classifier</li>
    <li> Adaboost Classifier </li>
    <li> Voting Classifier </li>

In [16]:
#Loading the required libraries
import pandas as pd
import numpy as np
import numpy
import csv
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

<p> Loading the msr paraphrase data. It contains texts which are taken from online news websites

In [6]:
# Loading the training and testing data
train = pd.read_csv(r'../Data/msr_paraphrase_train.txt', sep = '\t', quoting=csv.QUOTE_NONE)
test = pd.read_csv(r'../Data/msr_paraphrase_test.txt', sep = '\t', quoting=csv.QUOTE_NONE)

In [7]:
def gen_accuracy(y_pred, y_actual):
    """Function to calculate the accuracy of a model, returns the accuracy
        Args: 
            y_pred: predicted values
            y_actual: actual values"""
    
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_actual[i]:
            count = count+1
    return (count/len(y_pred))*100

## Data preprocessing

<p>

In [8]:
#training and testing
sent = list(train['#1 String']) + list(train['#2 String']) + list(test['#1 String']) + list(test['#2 String'])

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)

vectorizer.fit(sent)

train_sent = train['#1 String'] + " " + train['#2 String']
test_sent = test['#1 String']+ " " + test['#2 String']

sent_vect_train = vectorizer.transform(train_sent)
sent_vect_test = vectorizer.transform(test_sent)

## Machine learning Models

### Logistic Regression

<p> The parameters that are tested:
<ul> <li> solvers </li>
    <li> C</li>

In [9]:
solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
C = [0.001,0.01,0.1,1,10,100]
random_state = 0
max_iter = 1000
penality = 'l2'
#penality = 'elasticnet'
acc_saga = []
acc_lgbf = []

In [11]:
#Saga
accuracy = {}
for solver in solvers:
    acc = []
    for i in C:
        lr = LogisticRegression(penalty = 'l2', 
                                C =i, 
                                random_state=random_state,
                                max_iter=max_iter, solver = solver)
        
        lr.fit(sent_vect_train, train['Quality'])
    
        lr.fit(sent_vect_train, train['Quality'])
        
        result_lr = lr.predict(sent_vect_test)
        
        acc.append(gen_accuracy(result_lr, list(test['Quality'].to_numpy())))
        
    accuracy[solver] = acc

In [49]:
accuracy

{'liblinear': [66.4927536231884,
  66.4927536231884,
  68.11594202898551,
  68.57971014492755,
  67.76811594202898,
  66.26086956521739],
 'newton-cg': [66.4927536231884,
  66.4927536231884,
  68.05797101449275,
  68.57971014492755,
  67.76811594202898,
  66.26086956521739],
 'lbfgs': [66.4927536231884,
  66.4927536231884,
  68.05797101449275,
  68.57971014492755,
  67.76811594202898,
  66.26086956521739],
 'sag': [66.4927536231884,
  66.4927536231884,
  68.05797101449275,
  68.57971014492755,
  67.76811594202898,
  66.43478260869566],
 'saga': [66.4927536231884,
  66.4927536231884,
  68.11594202898551,
  68.57971014492755,
  67.76811594202898,
  66.31884057971014]}

### Support Vector Classifier

In [51]:
C = [0.001,0.01,0.1,1,10,100]
kernels = ['linear', 'rbf', 'sigmoid']

In [53]:
#Saga
accuracy = {}
for kernel in kernels:
    acc = []
    for i in C:
        svc = SVC(C = i, kernel = kernel )
        
        svc.fit(sent_vect_train, train['Quality'])
        
        result_svm = svc.predict(sent_vect_test)
        
        acc.append(gen_accuracy(result_svm, list(test['Quality'].to_numpy())))
        
    accuracy[kernel] = acc

In [54]:
accuracy

{'linear': [66.4927536231884,
  66.4927536231884,
  68.0,
  68.69565217391305,
  64.0,
  63.36231884057971],
 'rbf': [66.4927536231884,
  66.4927536231884,
  67.42028985507247,
  69.44927536231884,
  70.31884057971014,
  70.66666666666667],
 'sigmoid': [66.4927536231884,
  66.4927536231884,
  68.0,
  68.98550724637681,
  63.94202898550725,
  59.59420289855073]}

### Random Forest
<p> <font size =3> Parameters considered:
<ul> <li> n_estimators </li>
    <li> max_depth </li>

In [55]:
n_estimators = [100, 200, 400, 600, 800, 1000]
max_depth = [10, 50, 100, 200]

In [59]:
accuracy = []
for depth in max_depth:
    acc = []
    for i in n_estimators:
        rf = RandomForestClassifier(n_estimators=i, 
                                   bootstrap = True,
                                           max_features = 'sqrt',
                                    max_depth = depth)
        
        rf.fit(sent_vect_train, train['Quality'])
        
        result_svm = rf.predict(sent_vect_test)
        
        acc.append(gen_accuracy(result_svm, list(test['Quality'].to_numpy())))
        
    accuracy.append(acc)

In [60]:
accuracy

[[67.88405797101449,
  67.88405797101449,
  68.11594202898551,
  68.05797101449275,
  68.17391304347827,
  68.17391304347827],
 [69.3913043478261,
  69.44927536231884,
  69.3913043478261,
  69.04347826086956,
  69.21739130434783,
  68.98550724637681],
 [69.97101449275362,
  69.85507246376812,
  70.02898550724638,
  70.02898550724638,
  69.79710144927537,
  70.02898550724638],
 [70.89855072463767,
  70.78260869565217,
  70.72463768115942,
  71.01449275362319,
  70.72463768115942,
  70.78260869565217]]

### XGB

In [17]:
learning_rate = [.0001, .001, .01, .1]

learning_rate    = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, .001, .01, 1 ]
max_depth        = [ 3, 4, 5, 6, 8, 10, 12, 15]
min_child_weight = [ 1, 3, 5, 7 ]
gamma            = [ 0.0, 0.1, 0.2 , 0.3, 0.4 ]
colsample_bytree = [ 0.3, 0.4, 0.5 , 0.7 ]
n_estimators = [100, 200, 300, 400, 500]

In [23]:
accuracy =[]
for j in learning_rate:
    xgb = XGBClassifier(n_estimators=100, 
                           bootstrap = True,
                           max_features = 'sqrt',
                           max_depth = 3,
                           learning_rate = j)

    xgb.fit(sent_vect_train, train['Quality'])

    result_svm = xgb.predict(sent_vect_test)

    accuracy.append(gen_accuracy(result_svm, list(test['Quality'].to_numpy())))
#accuracy.append(acc)

In [24]:
accuracy

[69.91304347826087,
 69.73913043478261,
 70.3768115942029,
 70.14492753623188,
 69.79710144927537,
 69.5072463768116,
 68.57971014492755,
 68.92753623188406,
 68.28985507246377]

In [67]:
# xgb = XGBClassifier(n_estimators=100, 
#                                bootstrap = True,
#                                max_features = 'sqrt',
#                                max_depth = 15,
#                                min_child_weight = 7,
#                                 learning_rate = .3,
#                                            gamma = .1,
#                                            colsample_bytree = .7)
# xgb.fit(sent_vect_train, train['Quality'])

XGBClassifier(base_score=0.5, booster=None, bootstrap=True, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.3, max_delta_step=0, max_depth=15,
              max_features='sqrt', min_child_weight=7, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [70]:
result = xgb.predict(sent_vect_test)

In [71]:
gen_accuracy(result, test['Quality'].to_numpy())

68.98550724637681

### Adaboost

In [81]:
learning_rate = [.0001, .001, .01, .1, 1, 10]
n_estimators = [100, 200, 300, 400]

In [86]:
accuracy = []
for n_est in n_estimators:
    acc = []
    for i in learning_rate:
        ada = AdaBoostClassifier(n_estimators = n_est, learning_rate = i)
        ada.fit(sent_vect_train, train['Quality'])
        result_ada = ada.predict(sent_vect_test)
        acc.append(gen_accuracy(result_ada, test['Quality'].to_numpy()))
    accuracy.append(acc)

In [87]:
accuracy

[[68.23188405797102,
  68.23188405797102,
  68.6376811594203,
  68.81159420289855,
  66.89855072463769,
  32.231884057971016],
 [68.23188405797102,
  68.23188405797102,
  68.92753623188406,
  68.98550724637681,
  67.3623188405797,
  32.231884057971016],
 [68.23188405797102,
  68.34782608695652,
  68.8695652173913,
  69.6231884057971,
  65.85507246376811,
  32.231884057971016],
 [68.23188405797102,
  68.34782608695652,
  68.81159420289855,
  69.33333333333334,
  65.85507246376811,
  32.231884057971016]]

### Voting classifier

In [None]:
xgb = XGBClassifier(max_depth=3,learning_rate=0.01,n_estimators=312)                      

ada_boost = AdaBoostClassifier()

random_forest = RandomForestClassifier(n_estimators=100, 
                       bootstrap = True,
                       max_features = 'sqrt')
lr = LogisticRegression(solver = 'saga', penalty = 'elasticnet', l1_ratio=.5, random_state=0)

rf = random_forest = RandomForestClassifier(n_estimators=100, 
                       bootstrap = True,
                       max_features = 'sqrt')

svm= SVC(probability=True)


model = VotingClassifier(estimators=[('logistic', lr), 
                                     ('ada', ada_boost),
                                     ('random_forest', rf),
                                     ('xgb', xgb),
                                     ('svm', svm)], 
               voting='soft', weights=[1,1,3,1,1]).fit(sent_vect_train,train['Quality'])
