In [11]:
#Loading the required libraries
import pandas as pd
import numpy as np
import numpy
import csv
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [12]:
# Loading the training and testing data
data = pd.read_csv('../Data/train.csv')
data.dropna(inplace = True)
#data = data[:30000]
train, test = train_test_split(data, random_state = 123)


In [13]:
def gen_accuracy(y_pred, y_actual):
    """Function to calculate the accuracy of a model, returns the accuracy
        Args: 
            y_pred: predicted values
            y_actual: actual values"""
    
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_actual[i]:
            count = count+1
    return (count/len(y_pred))*100

## Data preprocessing

In [14]:
#training and testing
q = list(train['question1']) + list(train['question2']) + list(test['question1']) + list(test['question2'])

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)

vectorizer.fit(q)

train_q = train['question1'] + " " + train['question2']
test_q = test['question1']+ " " + test['question2']

sent_q_train = vectorizer.transform(train_q)
sent_q_test = vectorizer.transform(test_q)

In [15]:
#using word embedding

## Logistic regression

In [15]:
solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
C = [0.001,0.01,0.1,1,10,100]
random_state = 0
max_iter = 1000
penality = 'l2'
#penality = 'elasticnet'
acc_saga = []
acc_lgbf = []

In [18]:
#Saga
accuracy = {}
for solver in solvers:
    acc = []
    for i in C:
        lr = LogisticRegression(penalty = 'l2', 
                                C =i, 
                                random_state=random_state,
                                max_iter=max_iter, solver = solver)
        
        lr.fit(sent_q_train, train['is_duplicate'])
    
        result_lr = lr.predict(sent_q_test)
        
        acc.append(gen_accuracy(result_lr, list(test['is_duplicate'].to_numpy())))
        
    accuracy[solver] = acc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
accuracy

{'liblinear': [63.19455437707773,
  70.73670254867817,
  74.22332594586038,
  75.8657194870983,
  76.81652683235714,
  76.8422510685452],
 'newton-cg': [63.16784074719012,
  70.72384043058413,
  74.2213471584613,
  75.8627513059997,
  76.81751622605667,
  76.84818743074244],
 'lbfgs': [63.166851353490586,
  70.72384043058413,
  74.22530473325946,
  75.85879373120153,
  76.81157986385942,
  76.85412379293969],
 'sag': [63.16784074719012,
  70.72384043058413,
  74.2213471584613,
  75.8627513059997,
  76.81751622605667,
  76.84917682444198],
 'saga': [63.16784074719012,
  70.72384043058413,
  74.2213471584613,
  75.86374069969922,
  76.81553743865759,
  76.84917682444198]}

## SVM

In [7]:
C = [0.001,0.01,0.1,1,10,100]
kernels = ['linear']

In [None]:
#Saga
accuracy = {}
for kernel in kernels:
    acc = []
    for i in C:
        svc = SVC(C = i, kernel = kernel )
        
        svc.fit(sent_q_train, train['is_duplicate'])
        
        result_svm = svc.predict(sent_q_test)
        
        acc.append(gen_accuracy(result_svm, list(test['is_duplicate'].to_numpy())))
        
    accuracy[kernel] = acc

In [6]:
import time
aa = time.time()
svc = SVC(C = .1, kernel = 'linear' )

svc.fit(sent_q_train, train['is_duplicate'])

result_svm = svc.predict(sent_q_test)
bb = time.time()
print(bb-aa)

115.84925270080566


In [None]:
accuracy

### Random forest

In [18]:
n_estimators = [100, 200, 400, 600, 800, 1000]
max_depth = [10, 50, 100, 200]

In [None]:
accuracy = []
for depth in max_depth:
    acc = []
    for i in n_estimators:
        print(i)
        rf = RandomForestClassifier(n_estimators=i, 
                                   bootstrap = True,
                                           max_features = 'sqrt',
                                    max_depth = depth)
        
        rf.fit(sent_q_train, train['is_duplicate'])
        
        result_svm = rf.predict(sent_q_test)
        
        acc.append(gen_accuracy(result_svm, list(test['is_duplicate'].to_numpy())))
        
    accuracy.append(acc)

100
200
400
600
800
1000
100
200
400
600
800
1000
100
200
400
600


In [None]:
accuracy

### XGB classifier

In [None]:
learning_rate = [.0001, .001, .01, .1]
drop_out = ['l2', 'l1-l2', 'l1']

learning_rate    = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, .001, .01, 1 ]
max_depth        = [ 3, 4, 5, 6, 8, 10, 12, 15]
min_child_weight = [ 1, 3, 5, 7 ]
gamma            = [ 0.0, 0.1, 0.2 , 0.3, 0.4 ]
colsample_bytree = [ 0.3, 0.4, 0.5 , 0.7 ]
n_estimators = [100, 200, 300, 400, 500]

In [None]:
accuracy =[]
for i in n_estimators:
    acc = []
    for j in learning_rate: 
        xgb = XGBClassifier(n_estimators=i, 
                               bootstrap = True,
                               max_features = 'sqrt',
                               max_depth = 3,
                               learning_rate = j)

        svc.fit(sent_q_train, train['is_duplicate'])

        result_svm = svc.predict(sent_q_test)

        acc.append(gen_accuracy(result_svm, list(test['is_duplicate'].to_numpy())))
    accuracy.append(acc)

In [None]:
accuracy

### Adaboost

In [None]:
learning_rate = [.0001, .001, .01, .1, 1, 10]
n_estimators = [100, 200, 300, 400]

In [None]:
accuracy = []
for n_est in n_estimators:
    acc = []
    for i in learning_rate:
        ada = AdaBoostClassifier(n_estimators = n_est, learning_rate = i)
        ada.fit(sent_q_train, train['is_duplicate'])
        result_ada = ada.predict(sent_q_test)
        acc.append(gen_accuracy(result_ada, test['is_duplicate'].to_numpy()))
    accuracy.append(acc)

In [None]:
accuracy

### Voting classifier

In [None]:
xgb = XGBClassifier(max_depth=3,learning_rate=0.01,n_estimators=312)                      

ada_boost = AdaBoostClassifier()

random_forest = RandomForestClassifier(n_estimators=100, 
                       bootstrap = True,
                       max_features = 'sqrt')
lr = LogisticRegression(solver = 'saga', penalty = 'elasticnet', l1_ratio=.5, random_state=0)

rf = random_forest = RandomForestClassifier(n_estimators=100, 
                       bootstrap = True,
                       max_features = 'sqrt')

svm= SVC(probability=True)


model = VotingClassifier(estimators=[('logistic', lr), 
                                     ('ada', ada_boost),
                                     ('random_forest', rf),
                                     ('xgb', xgb),
                                     ('svm', svm)], 
               voting='soft', weights=[1,1,3,1,1]).fit(sent_vect_train,train['Quality'])
