In [2]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import collections
from scipy import sparse
import nltk
from collections import defaultdict
import re
from CountVectorizer_BagOfWords import CountVectorizer as cv
from TfIdfVectorizer import TfIdfVectorizer as tf
#from Spelling_Correction_c  import Spelling_Correction_c 
import xgboost as xgb
import pickle

# DATA

We are trying to solve the following problem: given a pair of different questions of Quora, decide if they are asking the same or not. In this notebook, we will discuss the process we have followed to solve the problem, the different models that we have used as well as the mistakes that each model makes.

In [3]:
#read the data
available_data = pd.read_csv("quora_train_data.csv")
available_data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1


In [4]:
#Split data into train and test
train_df, test_df = sklearn.model_selection.train_test_split(available_data, test_size=0.1, random_state=123)

# AUX FUNCTIONS

We will use the following functions for some of the models.The first functions are meant to extract, given a vectorizer, the matrix of features for the classifier. The two last functions are used to identify the errors that a classifier is making.

In [57]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    mylist_aux = []
    
    for i in mylist:
        mylist_aux.append(str(i))
        
    return mylist_aux

from scipy.sparse import hstack

def get_features_from_list(q1,q2,count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    """
    
    q1_mat = count_vectorizer.transform(q1)
    q2_mat = count_vectorizer.transform(q2)
    X_q1q2 = hstack([q1_mat,q2_mat])
            
    return X_q1q2
    

def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    
    #list of questions where each element of the question is of type string
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))    
    
    q1_mat = count_vectorizer.transform(q1_casted)
    q2_mat = count_vectorizer.transform(q2_casted)
    X_q1q2 = hstack([q1_mat,q2_mat])
            
    return X_q1q2

def get_mistakes(clf, X_q1q2, y):
    """
    Returns two lists: one containing the indices of the predictions that are not correct
    and another one containing the predictions
    """
    predictions        = clf.predict(X_q1q2).round(0).astype(int)
    incorrect_preds    = predictions != y
    incorrect_indices, = np.where(incorrect_preds)
    incorrect_indices2 = [x for x in  range(len(incorrect_preds)) if incorrect_preds[x] ==True]
    incorrect_indices3 = np.arange(len(incorrect_preds))[incorrect_preds]        
    
    if np.sum(incorrect_preds)==0:
        print("no mistakes in this df")
    else:
        return incorrect_indices, predictions
    
def print_mistake_k(k, dataset, mistake_indices, predictions):
    """
    Auxiliar function to print the k-th mistake made in the prediction
    """
    print("Original q1: ", train_df.iloc[mistake_indices[k]].question1, " Treated q1: ", dataset[mistake_indices[k]])
    print("Original q2: ", train_df.iloc[mistake_indices[k]].question2, " Treated q2: ", dataset[mistake_indices[k]+train_df.shape[0]])
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])  

# PREPROCESS DATA

A first naive model was proposed in class: pass the text through the vectorizers and use the returned matrix as the matrix of features. We saw that the classifier wrongly classified some questions with spelling mistakes. For example, the classifiera would identify as different questions those who were written like "whats" from those who were written like "what's". 

We thought that this problem may be common with any model that we try to train, so the first thing we propose to do is correcting the spelling mistakes. We propose to remove "'s", change the negatives "'t" for "not" as well as the plurals "'re" for "are", remove symbols and points. Then, we implemented a spell checking function using the edit distance. 

This cell is necessary in order to obtain a list of documents. This is the structure we usually want, at least for the vectorizers.

In [6]:
#Convert all elements of the documents into strings 
q1_train_raw =  cast_list_as_strings(list(train_df["question1"]))
q2_train_raw =  cast_list_as_strings(list(train_df["question2"]))
q1_test_raw  =  cast_list_as_strings(list(test_df["question1"]))
q2_test_raw  =  cast_list_as_strings(list(test_df["question2"]))

all_questions_raw = q1_train_raw + q2_train_raw

This code takes a while to compute, so we write the result in a text file. We DON'T NEED TO RUN THE FOLLOWING CELLS, they are here to illustrate the process that we have done. 

In [7]:
#Load the words of our corpus
nltk.download('words')
words = nltk.corpus.words.words()
words.extend(['online', 'Quora'])

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
#Create the spelling correction object (it will create the BK tree)
spelling_c = Spelling_Correction_c( words, tol = 1)

NameError: name 'Spelling_Correction_c' is not defined

In [9]:
def document_cleaner_spelling(spelling, text):
    clean_doc_pattern = re.compile( r"('\w)|([^a-zA-Z0-9.])") #Find words containing alphanumeric or points
    q = re.sub('\'s', '', text) #Remove 's
    q = re.sub('\'t', ' not', q) #Change 't for not'
    q = re.sub('\'re', ' are', q) #Change 're for are'
    q = re.sub('[?%!@#$\'\""]', '', q)#Remove symbols
    q = re.sub('\.\s', ' ', q)#Remove points with a space afterwards
    clean_q = clean_doc_pattern.sub(" ", q)
    correct_q = spelling_c.correct_text(clean_q)#Clean spelling mistakes
    return correct_q

document_cleaner_spelling(spelling_c, "Why does Sasuke have only two cars in ex-girlfriend The Movie?")

'why do sasuke have only two car in ex girlfriend the movie'

In [None]:
i=0
print(len(q1_train))
q1_train_cleaned = []
for quest in q1_train:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q1_train_cleaned.append(quest_cl)
    i+=1

In [11]:
with open('q1_train_cleaned.txt', 'w') as f:
    for item in q1_train_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q2_train))
q2_train_cleaned = []
for quest in q2_train:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q2_train_cleaned.append(quest_cl)
    i+=1

with open('q2_train_cleaned.txt', 'w') as f:
    for item in q2_train_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q1_test))
q1_test_cleaned = []
for quest in q1_test:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q1_test_cleaned.append(quest_cl)
    i+=1

with open('q1_test_cleaned.txt', 'w') as f:
    for item in q1_test_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q2_test))
q2_test_cleaned = []
for quest in q2_test:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q2_test_cleaned.append(quest_cl)
    i+=1

with open('q2_test_cleaned.txt', 'w') as f:
    for item in q2_test_cleaned:
        f.write("%s\n" % item)

In [47]:
with open('train_labels.txt', 'w') as f:
    for item in train_df['is_duplicate'].values:
        f.write("%s\n" % item)

with open('test_labels.txt', 'w') as f:
    for item in test_df['is_duplicate'].values:
        f.write("%s\n" % item)


## Run from here

We run these cells to obtain the results of the  cleaned text fromthe txt files.

In [9]:
with open('q1_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_train = [x.strip() for x in content] 

In [10]:
with open('q2_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_train = [x.strip() for x in content] 

In [11]:
with open('q1_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_test = [x.strip() for x in content] 

In [12]:
with open('q2_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_test = [x.strip() for x in content] 

In [13]:
all_questions= q1_train + q2_train

In [14]:
with open('train_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
train_labels = [int(x.strip()) for x in content] 

with open('test_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
test_labels = [int(x.strip()) for x in content] 

# FIRST MODEL: NAIVE CLASSIFIER WITH SPELL CHECKING
For the first model, we just wanted to see what difference did the spellchecking do. So, did we improve the results? Did we improve the results as expected? If so, what mistakes is our model doing now?

We will do this checking for bot the CountVectorizer and the TfIdfVectorizer

## First model - with CountVectorizer

In [15]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
nltk.download('wordnet')

#How to call such stemmers and lemmatizer in the CountVectorizer object:
#PorterStemmer(): token_cleaner_func = PorterStemmer().stem
#LancasterStemmer(): token_cleaner_func = LancasterStemmer().stem
#SnowballStemmer(language='english'): token_cleaner_func = SnowballStemmer(language='english').stem
#WordNetLemmatizer(): token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)

In [17]:
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

CountVectorizer(doc_cleaner_pattern="('\\w+)", document_cleaner_func=None,
        dtype=<class 'numpy.float32'>, max_df=0.99, min_df=5,
        min_word_counts=1, ngram_range=(1, 3),
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", 'than', 'then', 'being', 'after', 'i', 'until', 'with', 'between', 'won', 'should', 'what', 'over', 'again', 'for', 'more', 'before', 'having', 'y', "aren't", 'just', 'other', 'wouldn', 'each', 'how', 'does', 'such', '...s', 'who', 'those', 'a', 'as', 'ourselves', "mightn't", 'yourself', 'ain', 'nor', 'needn', 'mightn'},
        token_cleaner_func=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer_func=None)

In [18]:
countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=5,
        ngram_range=(1, 3), preprocessor=None,
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", 'than', 'then', 'being', 'after', 'i', 'until', 'with', 'between', 'won', 'should', 'what', 'over', 'again', 'for', 'more', 'before', 'having', 'y', "aren't", 'just', 'other', 'wouldn', 'each', 'how', 'does', 'such', '...s', 'who', 'those', 'a', 'as', 'ourselves', "mightn't", 'yourself', 'ain', 'nor', 'needn', 'mightn'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

We will the result obtained with our implementation of the CountVectorizer with the result obtained using the sklearn version of the CountVectorizer. One of the objectives of this deliverable was to implement and understend how the vectorizers worked, so we set as an objective to obtain the same as the sklearn vectorizers.

In [19]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)
print("With our CountVectorizer: ", X_tr_q1q2.shape, train_df.shape)

X_tr_q1q2_sk = get_features_from_list(q1_train, q2_train,countvect_sk)
print("With sklearn CountVectorizer: ", X_tr_q1q2_sk.shape, train_df.shape)

X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

X_te_q1q2_sk  = get_features_from_list(q1_test, q2_test, countvect_sk)

With our CountVectorizer:  (291088, 285364) (291088, 6)
With sklearn CountVectorizer:  (291088, 285364) (291088, 6)


Result using our implementation of CountVectorizer

In [20]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_labels)

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8873892706438838
Result on test:  0.7564062563763118


Result using sklearn implementation of CountVectorizer

In [31]:
logistic_sk = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic_sk.fit(X_tr_q1q2_sk, train_labels)

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic_sk.predict(X_tr_q1q2_sk)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic_sk.predict(X_te_q1q2_sk)))

Result on train:  0.8873667596649119
Result on test:  0.7563817405636126


### Qualitative information about the mistakes

We wanted to identify the mistakes that the classifier was doing in this case. We saw that the classifier was making mistages mainly for the following reasons:
- The questions are the same, but the sentences have lots of different words.
- The questions are the same, but one sentence is way larger than the other.
- The questions are asking about the same thing but for different years, hence they must be classified as different.
- One of the questions is a subset of the other. This mistake is the harder to solve because sometimes it is even debatable of the questions should be the same or not.

In [21]:
print("Accuracy on training: ", np.sum(train_labels==logistic.predict(X_tr_q1q2))/len(train_labels))
print("Accuracy on test: ", np.sum(test_labels ==logistic.predict(X_te_q1q2))/len(test_labels))

Accuracy on training:  0.9074094431924367
Accuracy on test:  0.788647044274054


In [40]:
mistake_indices, predictions = get_mistakes(logistic, X_tr_q1q2, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)

Original q1:  Why do men like women's feet?  Treated q1:  why do men like woman foot
Original q2:  Why do men like womens feet?  Treated q2:  why do men like woman foot
true class: 1
prediction: 0


## First model - with TfIdfVectorizer

In [41]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=5,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=False,
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", 'than', 'then', 'being', 'after', 'i', 'until', 'with', 'between', 'won', 'should', 'what', 'over', 'again', 'for', 'more', 'before', 'having', 'y', "aren't", 'just', 'other', 'wouldn', 'each', 'how', 'does', 'such', '...s', 'who', 'those', 'a', 'as', 'ourselves', "mightn't", 'yourself', 'ain', 'nor', 'needn', 'mightn'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=False,
        vocabulary=None)

Again, remember that additionally, we want to compare our result with that given by the implementation of sklearn of the TfIdfVectorizer.

In [42]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_tr_q1q2_sk = get_features_from_list(q1_train, q2_train, tfidf_sk)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)
X_te_q1q2_sk  = get_features_from_list(q1_test, q2_test, tfidf_sk)

print("With our TfIdf Vectorizer:", X_tr_q1q2.shape, train_df.shape)
print("With sklearn TfIdf Vectorizer:", X_tr_q1q2_sk.shape, train_df.shape)

With our TfIdf Vectorizer: (291088, 285364) (291088, 6)
With sklearn TfIdf Vectorizer: (291088, 285364) (291088, 6)


Result using our implementation of TfIdf Vectorizer

In [43]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_labels)

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8303725534934316
Result on test:  0.7532320054109183


Result using sklearn implementation of TfIdf Vectorizer. Note that the result is different because the formula that sklearn uses is different from ours.

In [44]:
logistic_sk = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic_sk.fit(X_tr_q1q2_sk, train_labels)

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic_sk.predict(X_tr_q1q2_sk)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic_sk.predict(X_te_q1q2_sk)))

Result on train:  0.8039907438250422
Result on test:  0.7445016739956617


### Qualitative information about the mistakes

So, in this case, the mistakes are practically the same, hence:
- The questions are the same, but the sentences have lots of different words.
- The questions are the same, but one sentence is way larger than the other.
- The questions are asking about the same thing but for different years, hence they must be classified as different.
- One of the questions is a subset of the other. This mistake is the harder to solve because sometimes it is even debatable of the questions should be the same or not.

In [45]:
print("Accuracy on training: ", np.sum(train_labels ==logistic.predict(X_tr_q1q2))/len(train_labels))
print("Accuracy on test: ", np.sum(test_labels ==logistic.predict(X_te_q1q2))/len(test_labels))

Accuracy on training:  0.8615642005166823
Accuracy on test:  0.7928827603264902


In [46]:
mistake_indices, predictions = get_mistakes(logistic, X_tr_q1q2, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)

Original q1:  Are Persians considered Caucasian?  Treated q1:  be Persian consider Caucasian
Original q2:  Are Persians White?  Treated q2:  be Persian white
true class: 1
prediction: 0


# SECOND MODEL: NAIVE CLASSIFIER WITH EXTRA FEATURES

Given the mistakes encountered in the previous model, we tried to code some extra features to tackle with those problems.

### Code to obtain the extra features.

Here we give a list of extra features that we could add to the feature vector.

1. Lenght of the question

2. Is there a [math] tag? 

3. Is there a number in the question?

4. Is it the same number in both questions? 

5. % of intersection words?


In [47]:
def get_qlength(questions):
    qlen = []
    for quest in questions:
        clean_doc_pattern = re.compile( r"('\w)|([^a-zA-Z0-9.])") #Find words containing alphanumeric or points
        q = re.sub('\'s', '', quest) #Remove 's
        q = re.sub('\'t', ' not', q) #Change 't for not'
        q = re.sub('\'re', ' are', q) #Change 're for are'
        q = re.sub('[?%!@#$\'\""]', '', q)#Remove symbols
        q = re.sub('\.\s', ' ', q)#Remove points with a space afterwards
        clean_q = clean_doc_pattern.sub(" ", q)
        qlen.append(len(re.findall(r"(?u)\b[\w.,]+\b",q)))
        
    return np.array(qlen).reshape(-1,1)

def is_math(questions):
    math=[]
    for quest in questions:
        if '[math]' in quest:
            math.append(1)
        else:
            math.append(0)
    return np.array(math).reshape(-1,1)
    
def is_number(word):
    try :  
        w = float(word) 
        if(np.isnan(w)):
            return 0
        if(np.isinf(w)):
            return 0
        res = 1
    except : 
        res = 0
    return res    

def has_numbers(questions):
    num=np.zeros((len(questions)))
    which_num = np.zeros((len(questions)))
    i=0
    for quest in questions:
        for w in re.findall(r"(?u)\b[\w.,]+\b",quest):
            is_num = is_number(w)
            if is_num==1:
                num[i]=1
                which_num[i]=float(w)
                if(np.isnan(which_num[i])):
                    print(which_num[i])
                    print(float(w))
                break
        i+=1
    return num.reshape(-1,1), which_num.reshape(-1,1)


def is_different_number(which_num1, which_num2):
    dif = which_num1 - which_num2
    dif[dif>0]=1
    return np.array(dif).reshape(-1,1)

In [48]:
def q1_q2_intersect(row, q1, q2, q_dict):
    set1 = set(q_dict[q1[row]])
    set2 = set(q_dict[q2[row]])
    return(len(set1.intersection(set2))/len(set1.union(set2)))


def intersection(q1_train, q2_train, q1_test, q2_test):
    q1 = q1_train + q1_test
    q2 = q2_train + q2_test
    q_dict = defaultdict(set)
    for i in range(len(q1)):
            q_dict[q1[i]].add(q2[i])
            q_dict[q2[i]].add(q1[i])

    intersect_train = []
    intersect_test = []
    for row in range(len(q1_train)):
        intersect_train.append(q1_q2_intersect(row, q1_train, q2_train, q_dict))
    
    for row in range(len(q1_test)):
        intersect_test.append(q1_q2_intersect(row, q1_test, q2_test, q_dict))
    
    return np.array(intersect_train).reshape(-1,1), np.array(intersect_test).reshape(-1,1)

In [49]:
intersect_train, intersect_test = intersection(q1_train, q2_train, q1_test, q2_test)

In [50]:
num1_train, which_num1_train=  has_numbers(q1_train_raw)
num2_train, which_num2_train =  has_numbers(q2_train_raw)
dif_number_train = is_different_number(which_num1_train,which_num2_train)

num1_test, which_num1_test=  has_numbers(q1_test_raw)
num2_test, which_num2_test =  has_numbers(q2_test_raw)
dif_number_test = is_different_number(which_num1_test,which_num2_test)

In [51]:
math1_train = is_math(q1_train_raw)
math2_train = is_math(q2_train_raw)

math1_test = is_math(q1_test_raw)
math2_test = is_math(q2_test_raw)

In [52]:
len1_train = get_qlength(q1_train_raw)
len2_train = get_qlength(q2_train_raw)

len1_test = get_qlength(q1_test_raw)
len2_test = get_qlength(q2_test_raw)

## Second model - with CountVectorizer

In [53]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
nltk.download('wordnet')


#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=5,
        ngram_range=(1, 3), preprocessor=None,
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", 'than', 'then', 'being', 'after', 'i', 'until', 'with', 'between', 'won', 'should', 'what', 'over', 'again', 'for', 'more', 'before', 'having', 'y', "aren't", 'just', 'other', 'wouldn', 'each', 'how', 'does', 'such', '...s', 'who', 'those', 'a', 'as', 'ourselves', "mightn't", 'yourself', 'ain', 'nor', 'needn', 'mightn'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In this case, since we have already check that our CountVectorizer yields the same result as the sklearn one, we will only use ours.

In [54]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

In [55]:
print('initial shape', X_tr_q1q2.shape)

new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))

new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))

print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


We obtain the following result. We see that there is not much of an improvement. We think that this may be due to the imbalance of the values of the different features, i.e., we are not normalizing the values of any of the features. We thought that it would be necessary to change the model, then.

In [58]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_labels)

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8873892706438838
Result on test:  0.7564062563763118


## Second model - with TfIdfVectorizer

In [59]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=5,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=False,
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", 'than', 'then', 'being', 'after', 'i', 'until', 'with', 'between', 'won', 'should', 'what', 'over', 'again', 'for', 'more', 'before', 'having', 'y', "aren't", 'just', 'other', 'wouldn', 'each', 'how', 'does', 'such', '...s', 'who', 'those', 'a', 'as', 'ourselves', "mightn't", 'yourself', 'ain', 'nor', 'needn', 'mightn'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=False,
        vocabulary=None)

Again we will only run the code for our TfIdfVectorizer.

In [60]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)

In [61]:
print('initial shape', X_tr_q1q2.shape)

new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))

new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))

print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


A very similar thing happens with the tfidfVectorizer.

In [62]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_labels)

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8303725534934316
Result on test:  0.7532320054109183


# THIRD MODEL: XGBOOST

Given all the previous results, a thing was clear: we needed to change the classifier. So our take was: combine everything we have done until now (text with the spell checking and the extra features) but with a more sophisticated model. We chose the XGBoost.

## Third model - with CountVectorizer

In [63]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
nltk.download('wordnet')


#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

print('initial shape', X_tr_q1q2.shape)
new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))
new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))
print('final shape', new_X_tr_q1q2.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


initial shape (291088, 285364)
final shape (291088, 285372)


In [None]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)

evallist = [(d_train, 'train'), (d_test, 'test')]

num_iters = 50000

xgb_count_new = xgb.train(params, d_train, num_iters, evallist, early_stopping_rounds=50, verbose_eval=10)


[0]	train-auc:0.82769	test-auc:0.82534
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[10]	train-auc:0.83285	test-auc:0.82989
[20]	train-auc:0.83325	test-auc:0.83053
[30]	train-auc:0.83574	test-auc:0.83320
[40]	train-auc:0.83592	test-auc:0.83329
[50]	train-auc:0.83727	test-auc:0.83427
[60]	train-auc:0.83765	test-auc:0.83447
[70]	train-auc:0.83845	test-auc:0.83486
[80]	train-auc:0.84003	test-auc:0.83675
[90]	train-auc:0.84034	test-auc:0.83716
[100]	train-auc:0.84131	test-auc:0.83839
[110]	train-auc:0.84167	test-auc:0.83879
[120]	train-auc:0.84227	test-auc:0.83951
[130]	train-auc:0.84266	test-auc:0.84011
[140]	train-auc:0.84382	test-auc:0.84161
[150]	train-auc:0.84432	test-auc:0.84220
[160]	train-auc:0.84494	test-auc:0.84270
[170]	train-auc:0.84575	test-auc:0.84347
[180]	train-auc:0.84652	test-auc:0.84402
[190]	train-auc:0.84688	test-auc:0.84453
[200]	train-auc:0.84757	test-auc:0.84494
[210]	tra

[1950]	train-auc:0.87919	test-auc:0.87157
[1960]	train-auc:0.87924	test-auc:0.87160
[1970]	train-auc:0.87930	test-auc:0.87163
[1980]	train-auc:0.87936	test-auc:0.87165
[1990]	train-auc:0.87943	test-auc:0.87169
[2000]	train-auc:0.87949	test-auc:0.87172
[2010]	train-auc:0.87958	test-auc:0.87183
[2020]	train-auc:0.87965	test-auc:0.87184
[2030]	train-auc:0.87972	test-auc:0.87187
[2040]	train-auc:0.87977	test-auc:0.87191
[2050]	train-auc:0.87984	test-auc:0.87196
[2060]	train-auc:0.87991	test-auc:0.87200
[2070]	train-auc:0.87996	test-auc:0.87205
[2080]	train-auc:0.88001	test-auc:0.87208
[2090]	train-auc:0.88007	test-auc:0.87210
[2100]	train-auc:0.88016	test-auc:0.87217
[2110]	train-auc:0.88022	test-auc:0.87221
[2120]	train-auc:0.88030	test-auc:0.87227
[2130]	train-auc:0.88038	test-auc:0.87230
[2140]	train-auc:0.88043	test-auc:0.87232
[2150]	train-auc:0.88049	test-auc:0.87238
[2160]	train-auc:0.88054	test-auc:0.87238
[2170]	train-auc:0.88060	test-auc:0.87242
[2180]	train-auc:0.88070	test-auc:

[3910]	train-auc:0.88883	test-auc:0.87752
[3920]	train-auc:0.88889	test-auc:0.87756
[3930]	train-auc:0.88893	test-auc:0.87758
[3940]	train-auc:0.88896	test-auc:0.87759
[3950]	train-auc:0.88900	test-auc:0.87761
[3960]	train-auc:0.88904	test-auc:0.87764
[3970]	train-auc:0.88908	test-auc:0.87766
[3980]	train-auc:0.88911	test-auc:0.87768
[3990]	train-auc:0.88915	test-auc:0.87770
[4000]	train-auc:0.88917	test-auc:0.87772
[4010]	train-auc:0.88921	test-auc:0.87772
[4020]	train-auc:0.88924	test-auc:0.87774
[4030]	train-auc:0.88928	test-auc:0.87775
[4040]	train-auc:0.88931	test-auc:0.87777
[4050]	train-auc:0.88938	test-auc:0.87784
[4060]	train-auc:0.88942	test-auc:0.87788
[4070]	train-auc:0.88946	test-auc:0.87791
[4080]	train-auc:0.88950	test-auc:0.87793
[4090]	train-auc:0.88953	test-auc:0.87796
[4100]	train-auc:0.88957	test-auc:0.87799
[4110]	train-auc:0.88961	test-auc:0.87800
[4120]	train-auc:0.88964	test-auc:0.87801
[4130]	train-auc:0.88968	test-auc:0.87802
[4140]	train-auc:0.88972	test-auc:

[5870]	train-auc:0.89522	test-auc:0.88101
[5880]	train-auc:0.89525	test-auc:0.88101
[5890]	train-auc:0.89528	test-auc:0.88103
[5900]	train-auc:0.89530	test-auc:0.88104
[5910]	train-auc:0.89533	test-auc:0.88105
[5920]	train-auc:0.89536	test-auc:0.88106
[5930]	train-auc:0.89538	test-auc:0.88107
[5940]	train-auc:0.89540	test-auc:0.88107
[5950]	train-auc:0.89542	test-auc:0.88109
[5960]	train-auc:0.89545	test-auc:0.88110
[5970]	train-auc:0.89548	test-auc:0.88112
[5980]	train-auc:0.89550	test-auc:0.88114
[5990]	train-auc:0.89553	test-auc:0.88115
[6000]	train-auc:0.89557	test-auc:0.88117
[6010]	train-auc:0.89559	test-auc:0.88117
[6020]	train-auc:0.89562	test-auc:0.88119
[6030]	train-auc:0.89564	test-auc:0.88121
[6040]	train-auc:0.89567	test-auc:0.88122
[6050]	train-auc:0.89569	test-auc:0.88122
[6060]	train-auc:0.89572	test-auc:0.88123
[6070]	train-auc:0.89574	test-auc:0.88124
[6080]	train-auc:0.89578	test-auc:0.88128
[6090]	train-auc:0.89580	test-auc:0.88129
[6100]	train-auc:0.89583	test-auc:

[7830]	train-auc:0.90009	test-auc:0.88364
[7840]	train-auc:0.90010	test-auc:0.88365
[7850]	train-auc:0.90014	test-auc:0.88367
[7860]	train-auc:0.90015	test-auc:0.88367
[7870]	train-auc:0.90017	test-auc:0.88369
[7880]	train-auc:0.90019	test-auc:0.88369
[7890]	train-auc:0.90020	test-auc:0.88370
[7900]	train-auc:0.90022	test-auc:0.88370
[7910]	train-auc:0.90024	test-auc:0.88371
[7920]	train-auc:0.90026	test-auc:0.88372
[7930]	train-auc:0.90028	test-auc:0.88374
[7940]	train-auc:0.90030	test-auc:0.88375
[7950]	train-auc:0.90033	test-auc:0.88377
[7960]	train-auc:0.90034	test-auc:0.88377
[7970]	train-auc:0.90037	test-auc:0.88378
[7980]	train-auc:0.90038	test-auc:0.88379
[7990]	train-auc:0.90040	test-auc:0.88379
[8000]	train-auc:0.90043	test-auc:0.88382
[8010]	train-auc:0.90046	test-auc:0.88383
[8020]	train-auc:0.90048	test-auc:0.88383
[8030]	train-auc:0.90050	test-auc:0.88384
[8040]	train-auc:0.90052	test-auc:0.88385
[8050]	train-auc:0.90054	test-auc:0.88388
[8060]	train-auc:0.90057	test-auc:

[9790]	train-auc:0.90404	test-auc:0.88582
[9800]	train-auc:0.90406	test-auc:0.88583
[9810]	train-auc:0.90408	test-auc:0.88584
[9820]	train-auc:0.90410	test-auc:0.88584
[9830]	train-auc:0.90413	test-auc:0.88587
[9840]	train-auc:0.90414	test-auc:0.88587
[9850]	train-auc:0.90416	test-auc:0.88588
[9860]	train-auc:0.90418	test-auc:0.88589
[9870]	train-auc:0.90420	test-auc:0.88589
[9880]	train-auc:0.90421	test-auc:0.88590
[9890]	train-auc:0.90424	test-auc:0.88591
[9900]	train-auc:0.90425	test-auc:0.88591
[9910]	train-auc:0.90427	test-auc:0.88592
[9920]	train-auc:0.90429	test-auc:0.88593
[9930]	train-auc:0.90431	test-auc:0.88594
[9940]	train-auc:0.90433	test-auc:0.88594
[9950]	train-auc:0.90435	test-auc:0.88596
[9960]	train-auc:0.90437	test-auc:0.88597
[9970]	train-auc:0.90439	test-auc:0.88597
[9980]	train-auc:0.90441	test-auc:0.88598
[9990]	train-auc:0.90444	test-auc:0.88600
[10000]	train-auc:0.90445	test-auc:0.88600
[10010]	train-auc:0.90447	test-auc:0.88601
[10020]	train-auc:0.90449	test-a

[11710]	train-auc:0.90734	test-auc:0.88738
[11720]	train-auc:0.90736	test-auc:0.88738
[11730]	train-auc:0.90737	test-auc:0.88738
[11740]	train-auc:0.90738	test-auc:0.88738
[11750]	train-auc:0.90739	test-auc:0.88739
[11760]	train-auc:0.90742	test-auc:0.88739
[11770]	train-auc:0.90743	test-auc:0.88740
[11780]	train-auc:0.90744	test-auc:0.88741
[11790]	train-auc:0.90746	test-auc:0.88743
[11800]	train-auc:0.90747	test-auc:0.88743
[11810]	train-auc:0.90749	test-auc:0.88744
[11820]	train-auc:0.90751	test-auc:0.88744
[11830]	train-auc:0.90752	test-auc:0.88745
[11840]	train-auc:0.90754	test-auc:0.88746
[11850]	train-auc:0.90755	test-auc:0.88747
[11860]	train-auc:0.90757	test-auc:0.88748
[11870]	train-auc:0.90758	test-auc:0.88748
[11880]	train-auc:0.90759	test-auc:0.88748
[11890]	train-auc:0.90760	test-auc:0.88749
[11900]	train-auc:0.90761	test-auc:0.88749
[11910]	train-auc:0.90763	test-auc:0.88750
[11920]	train-auc:0.90765	test-auc:0.88752
[11930]	train-auc:0.90767	test-auc:0.88753
[11940]	tra

In [33]:
import pickle
#store xgboost
# open a file, where you ant to store the data
file = open('xgb_countvectorizer_features', 'wb')
# dump information to that file
pickle.dump(xgb_count_new, file)
# close the file
file.close()



In [75]:
import pickle
#Code to load xgboost

# open a file, where you stored the pickled data
file = open('xgb_countvectorizer_features', 'rb+')
# dump information to that file
xgb_count_new = pickle.load(file)
# close the file
file.close()

XGBoostError: [13:01:26] C:\Jenkins\workspace\xgboost-win64_release_0.90\src\gbm\gbm.cc:20: Unknown gbm type 

In [None]:
d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)

pred_test = xgb_count_new.predict(d_test).round(0).astype(int)
pred_train = xgb_count_new.predict(d_train).round(0).astype(int)

print("Accuracy on training: ", np.sum(train_labels==pred_train)/len(train_labels))
print("Accuracy on test: ", np.sum(test_labels ==pred_test)/len(test_labels))

mistake_indices, predictions = get_mistakes(xgb_count_new, d_train, train_labels)
print_mistake_k(4, mistake_indices, predictions)      


## Third model - with TfIdfVectorizer

In [42]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions)

X_tr_q1q2 = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)

print('initial shape', X_tr_q1q2.shape)
new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))

new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))
print('final shape', new_X_tr_q1q2.shape)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.99, max_features=None,
                min_df=5, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=False,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                use_idf=False, vocabulary=None)

In [41]:
import xgboost as xgb

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)

evallist = [(d_train, 'train'), (d_test, 'test')]

num_iters = 50000

xgb_tfidf = xgb.train(params, d_train, num_iters, evallist, early_stopping_rounds=50, verbose_eval=10)


[0]	train-auc:0.82767	test-auc:0.82525
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[10]	train-auc:0.83268	test-auc:0.82966
[20]	train-auc:0.83330	test-auc:0.83053
[30]	train-auc:0.83577	test-auc:0.83323
[40]	train-auc:0.83597	test-auc:0.83334
[50]	train-auc:0.83647	test-auc:0.83373
[60]	train-auc:0.83780	test-auc:0.83457
[70]	train-auc:0.83905	test-auc:0.83555
[80]	train-auc:0.84052	test-auc:0.83766
[90]	train-auc:0.84091	test-auc:0.83806
[100]	train-auc:0.84164	test-auc:0.83865
[110]	train-auc:0.84204	test-auc:0.83923
[120]	train-auc:0.84267	test-auc:0.84007
[130]	train-auc:0.84326	test-auc:0.84091
[140]	train-auc:0.84421	test-auc:0.84192
[150]	train-auc:0.84487	test-auc:0.84274
[160]	train-auc:0.84544	test-auc:0.84320
[170]	train-auc:0.84620	test-auc:0.84408
[180]	train-auc:0.84685	test-auc:0.84444
[190]	train-auc:0.84742	test-auc:0.84471
[200]	train-auc:0.84806	test-auc:0.84544
[210]	tra

[1950]	train-auc:0.88132	test-auc:0.87026
[1960]	train-auc:0.88140	test-auc:0.87028
[1970]	train-auc:0.88146	test-auc:0.87030
[1980]	train-auc:0.88154	test-auc:0.87035
[1990]	train-auc:0.88163	test-auc:0.87038
[2000]	train-auc:0.88172	test-auc:0.87042
[2010]	train-auc:0.88182	test-auc:0.87048
[2020]	train-auc:0.88190	test-auc:0.87052
[2030]	train-auc:0.88197	test-auc:0.87055
[2040]	train-auc:0.88206	test-auc:0.87059
[2050]	train-auc:0.88213	test-auc:0.87064
[2060]	train-auc:0.88219	test-auc:0.87065
[2070]	train-auc:0.88228	test-auc:0.87070
[2080]	train-auc:0.88240	test-auc:0.87078
[2090]	train-auc:0.88252	test-auc:0.87086
[2100]	train-auc:0.88259	test-auc:0.87089
[2110]	train-auc:0.88267	test-auc:0.87091
[2120]	train-auc:0.88276	test-auc:0.87097
[2130]	train-auc:0.88282	test-auc:0.87098
[2140]	train-auc:0.88289	test-auc:0.87100
[2150]	train-auc:0.88295	test-auc:0.87100
[2160]	train-auc:0.88303	test-auc:0.87105
[2170]	train-auc:0.88312	test-auc:0.87111
[2180]	train-auc:0.88322	test-auc:

[3910]	train-auc:0.89403	test-auc:0.87626
[3920]	train-auc:0.89408	test-auc:0.87627
[3930]	train-auc:0.89412	test-auc:0.87629
[3940]	train-auc:0.89416	test-auc:0.87629
[3950]	train-auc:0.89421	test-auc:0.87629
[3960]	train-auc:0.89424	test-auc:0.87630
[3970]	train-auc:0.89428	test-auc:0.87631
[3980]	train-auc:0.89433	test-auc:0.87633
[3990]	train-auc:0.89439	test-auc:0.87638
[4000]	train-auc:0.89444	test-auc:0.87641
[4010]	train-auc:0.89449	test-auc:0.87643
[4020]	train-auc:0.89453	test-auc:0.87645
[4030]	train-auc:0.89458	test-auc:0.87647
[4040]	train-auc:0.89463	test-auc:0.87647
[4050]	train-auc:0.89467	test-auc:0.87648
[4060]	train-auc:0.89472	test-auc:0.87651
[4070]	train-auc:0.89477	test-auc:0.87654
[4080]	train-auc:0.89482	test-auc:0.87655
[4090]	train-auc:0.89485	test-auc:0.87655
[4100]	train-auc:0.89492	test-auc:0.87658
[4110]	train-auc:0.89496	test-auc:0.87661
[4120]	train-auc:0.89501	test-auc:0.87663
[4130]	train-auc:0.89506	test-auc:0.87666
[4140]	train-auc:0.89512	test-auc:

[5870]	train-auc:0.90208	test-auc:0.87925
[5880]	train-auc:0.90212	test-auc:0.87927
[5890]	train-auc:0.90216	test-auc:0.87928
[5900]	train-auc:0.90220	test-auc:0.87929
[5910]	train-auc:0.90223	test-auc:0.87931
[5920]	train-auc:0.90227	test-auc:0.87933
[5930]	train-auc:0.90230	test-auc:0.87934
[5940]	train-auc:0.90234	test-auc:0.87934
[5950]	train-auc:0.90237	test-auc:0.87935
[5960]	train-auc:0.90241	test-auc:0.87937
[5970]	train-auc:0.90244	test-auc:0.87938
[5980]	train-auc:0.90247	test-auc:0.87939
[5990]	train-auc:0.90250	test-auc:0.87940
[6000]	train-auc:0.90255	test-auc:0.87943
[6010]	train-auc:0.90259	test-auc:0.87944
[6020]	train-auc:0.90262	test-auc:0.87945
[6030]	train-auc:0.90266	test-auc:0.87945
[6040]	train-auc:0.90269	test-auc:0.87948
[6050]	train-auc:0.90272	test-auc:0.87949
[6060]	train-auc:0.90275	test-auc:0.87949
[6070]	train-auc:0.90277	test-auc:0.87949
[6080]	train-auc:0.90280	test-auc:0.87950
[6090]	train-auc:0.90284	test-auc:0.87951
[6100]	train-auc:0.90287	test-auc:

[7830]	train-auc:0.90852	test-auc:0.88146
[7840]	train-auc:0.90855	test-auc:0.88147
[7850]	train-auc:0.90858	test-auc:0.88148
[7860]	train-auc:0.90860	test-auc:0.88149
[7870]	train-auc:0.90862	test-auc:0.88149
[7880]	train-auc:0.90865	test-auc:0.88150
[7890]	train-auc:0.90869	test-auc:0.88152
[7900]	train-auc:0.90872	test-auc:0.88153
[7910]	train-auc:0.90875	test-auc:0.88153
[7920]	train-auc:0.90878	test-auc:0.88156
[7930]	train-auc:0.90881	test-auc:0.88157
[7940]	train-auc:0.90883	test-auc:0.88158
[7950]	train-auc:0.90886	test-auc:0.88158
[7960]	train-auc:0.90889	test-auc:0.88159
[7970]	train-auc:0.90892	test-auc:0.88161
[7980]	train-auc:0.90895	test-auc:0.88163
[7990]	train-auc:0.90898	test-auc:0.88164
[8000]	train-auc:0.90901	test-auc:0.88164
[8010]	train-auc:0.90903	test-auc:0.88165
[8020]	train-auc:0.90906	test-auc:0.88167
[8030]	train-auc:0.90909	test-auc:0.88169
[8040]	train-auc:0.90911	test-auc:0.88170
[8050]	train-auc:0.90914	test-auc:0.88169
[8060]	train-auc:0.90916	test-auc:

In [33]:
import pickle
#store xgboost
# open a file, where you ant to store the data
file = open('xgb_tfidf', 'wb')
# dump information to that file
pickle.dump(xgb_tfidf, file)
# close the file
file.close()



In [45]:
#Code to load xgboost

# open a file, where you stored the pickled data
file = open('xgb_tfidf', 'rb')
# dump information to that file
xgb_tfidf = pickle.load(file)
# close the file
file.close()

In [47]:
d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)

pred_test = xgb_tfidf.predict(d_test).round(0).astype(int)
pred_train = xgb_tfidf.predict(d_train).round(0).astype(int)

print("Accuracy on training: ", np.sum(train_labels==pred_train)/len(train_labels))
print("Accuracy on test: ", np.sum(test_labels ==pred_test)/len(test_labels))

mistake_indices, predictions = get_mistakes(xgb_tfidf, d_train, train_labels)
print_mistake_k(4, mistake_indices, predictions)      


Accuracy on training:  0.8425562029351948
Accuracy on test:  0.832148157308929
Neuroscience: Which regions of the brain are responsible for pain?
Neuroscience: Which regions of the brain are responsible for sadness?
true class: 0
prediction: 1


# FOURTH MODEL: DIFFERENT APPROACH WITH DEEP LEARNING

Our main objective for this deliverable was to work with a more classic approach for natural language processing, mainly to implement and understand the CountVectorizer and TfIdfVectorizer. Additionally, we tried to work on the mistakes and limitations that this approach had, hence having to do a bit of feature engeenireing to tackle those problems.

However, nowadays deep learning is used practically to solve anything, so, how well could it work to solve this problem? In this section we explore a completely different approach using deep learning.

# LAST BUT NOT LEAST: LET'S DO PIPELINES

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

To pass data from CountVectorizer to model automatically, we need to transform the data first. This is just a hacky way to do it. 

Note that the correct way is to modify the CountVectorized that we implemented so that the output of the  transform is already the desired matrix (i.e. doing the hstack inside). We tried to implement the most general CountVectorized that we could, so we have to resort to this way. It is in no way slower in terms of speed. It is just ugly in the sense that is doing something hack-ish. 

In [77]:
class MiddleTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        nexamples, nvars = X.shape
        split = (int)(nexamples/2)
        XX1 = X[:split,:]
        XX2 = X[split:,:]
        XX = sparse.hstack([XX1, XX2], format='csr')
        return XX

In [78]:
model_cv = Pipeline([
    ('countVectorizer', cv(#token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v"),
                     stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3))),
    ('middleTransformer', MiddleTransformer()),
    ('model', sklearn.linear_model.LogisticRegression(solver="liblinear"))])

In [79]:
model_cv.fit(all_questions, train_labels)

Pipeline(memory=None,
     steps=[('countVectorizer', CountVectorizer(doc_cleaner_pattern="('\\w+)", document_cleaner_func=None,
        dtype=<class 'numpy.float32'>, max_df=1.0, min_df=1,
        min_word_counts=1, ngram_range=(1, 3),
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", ...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [80]:
all_questions_test = q1_test+q2_test

In [81]:
sklearn.metrics.roc_auc_score(test_labels, model_cv.predict(all_questions_test))

0.7778368046218165

### Make Pipeline Great Again: GridSearch Hyperparameters

To select samples to cross validate, we need the samples to be in pair with the labels. Right now we dont have that, so we are going to (once again) hack our way through it.

Again note that the correct way to do it is to modify the CountVectorizer so that it automatically does this process inside the fit and transform functions. But following our filosophy of having a general CountVectorizer, this is a good way to solve the problem.

In [44]:
ndocs = (int)(len(all_questions)/2)
all_questions_tuples = [(q1_train[i], q2_train[i]) for i in range(ndocs)]

class CrossValidationTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        XX = [x[0] for x in X]
        for x in X:
            XX.append(x[1])
        return XX

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

mnb = MultinomialNB()

model_cv = Pipeline([
    ('crossValidationTransformer', CrossValidationTransformer()),
    ('countVectorizer', tf(#token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v"),
                     stop_words = set(stopwords.words('english')))),
    ('middleTransformer', MiddleTransformer()),
    ('model', MultinomialNB())
])

params = {'countVectorizer__ngram_range':[(1,3)], 
         'countVectorizer__min_df':[1,5,10,15,],
         'countVectorizer__max_df':[.4,.3],
         }

gs = GridSearchCV(model_cv, params, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2)

In [53]:
results = gs.fit(all_questions_tuples, train_labels)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 12.1min finished


In [56]:
results.best_params_

{'countVectorizer__max_df': 0.4,
 'countVectorizer__min_df': 1,
 'countVectorizer__ngram_range': (1, 3)}

In [51]:
results.best_score_

0.8436795654397367