In [3]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import collections
from scipy import sparse
import nltk
from collections import defaultdict
import re
from CountVectorizer_BagOfWords import CountVectorizer as cv
from TfIdfVectorizer import TfIdfVectorizer as tf
#from Spelling_Correction_c  import Spelling_Correction_c 
import xgboost as xgb
import pickle
import json

In [4]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
nltk.download('wordnet')

#How to call such stemmers and lemmatizer in the CountVectorizer object:
#PorterStemmer(): token_cleaner_func = PorterStemmer().stem
#LancasterStemmer(): token_cleaner_func = LancasterStemmer().stem
#SnowballStemmer(language='english'): token_cleaner_func = SnowballStemmer(language='english').stem
#WordNetLemmatizer(): token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# DATA

We are trying to solve the following problem: given a pair of different questions of Quora, decide if they are asking the same or not. In this notebook, we will discuss the process we have followed to solve the problem, the different models that we have used as well as the mistakes that each model makes.

In [5]:
#read the data
available_data = pd.read_csv("quora_train_data.csv")
test_df = pd.read_csv("quora_test_data.csv")
available_data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1
5,60040,105079,19068,What is the chance that i damaged my brain smo...,Has anyone ever died from smoking marijuana?,0
6,134252,14287,5883,Do Quora users still see questions that are ma...,What happens to questions marked as needing im...,1
7,176265,271238,271239,Which politcal party would you vote for in 201...,How many minimum marks required for getting ad...,0
8,129602,62620,197112,How much does it cost to get a personal traine...,How much does a day pass for LA Fitness cost?,0
9,202449,304753,200833,What is the evolutionary significance of meter...,What is the evolutionary significance of gover...,0


In [6]:
#Split data into train and test
train_df, val_df = sklearn.model_selection.train_test_split(available_data, test_size=0.1, random_state=123)

train_df.to_csv('train_df.csv')
test_df.to_csv('test_df.csv')
val_df.to_csv('val_df.csv')

# AUX FUNCTIONS

We will use the following functions for some of the models.The first functions are meant to extract, given a vectorizer, the matrix of features for the classifier. The two last functions are used to identify the errors that a classifier is making.

In [7]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    mylist_aux = []
    
    for i in mylist:
        mylist_aux.append(str(i))
        
    return mylist_aux

from scipy.sparse import hstack

def get_features_from_list(q1,q2,count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    """
    
    q1_mat = count_vectorizer.transform(q1)
    q2_mat = count_vectorizer.transform(q2)
    X_q1q2 = hstack([q1_mat,q2_mat], format="csr")
            
    return X_q1q2
    

def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    
    #list of questions where each element of the question is of type string
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))    
    
    q1_mat = count_vectorizer.transform(q1_casted)
    q2_mat = count_vectorizer.transform(q2_casted)
    X_q1q2 = hstack([q1_mat,q2_mat], format="csr")
            
    return X_q1q2

def get_mistakes(clf, X_q1q2, y):
    """
    Returns two lists: one containing the indices of the predictions that are not correct
    and another one containing the predictions
    """
    predictions        = clf.predict(X_q1q2).round(0).astype(int)
    incorrect_preds    = predictions != y
    incorrect_indices, = np.where(incorrect_preds)
    incorrect_indices2 = [x for x in  range(len(incorrect_preds)) if incorrect_preds[x] ==True]
    incorrect_indices3 = np.arange(len(incorrect_preds))[incorrect_preds]        
    
    if np.sum(incorrect_preds)==0:
        print("no mistakes in this df")
    else:
        return incorrect_indices, predictions
    
def print_mistake_k(k, dataset, mistake_indices, predictions):
    """
    Auxiliar function to print the k-th mistake made in the prediction
    """
    print("Original q1: ", train_df.iloc[mistake_indices[k]].question1, " Treated q1: ", dataset[mistake_indices[k]])
    print("Original q2: ", train_df.iloc[mistake_indices[k]].question2, " Treated q2: ", dataset[mistake_indices[k]+train_df.shape[0]])
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])  

# PREPROCESS DATA

A first naive model was proposed in class: pass the text through the vectorizers and use the returned matrix as the matrix of features. We saw that the classifier wrongly classified some questions with spelling mistakes. For example, the classifiera would identify as different questions those who were written like "whats" from those who were written like "what's". 

We thought that this problem may be common with any model that we try to train, so the first thing we propose to do is correcting the spelling mistakes. We propose to remove "'s", change the negatives "'t" for "not" as well as the plurals "'re" for "are", remove symbols and points. Then, we implemented a spell checking function using the edit distance. 

This cell is necessary in order to obtain a list of documents. This is the structure we usually want, at least for the vectorizers.

In [8]:
#Convert all elements of the documents into strings 
q1_train_raw =  cast_list_as_strings(list(train_df["question1"]))
q2_train_raw =  cast_list_as_strings(list(train_df["question2"]))
q1_val_raw  =  cast_list_as_strings(list(val_df["question1"]))
q2_val_raw  =  cast_list_as_strings(list(val_df["question2"]))
q1_test_raw  =  cast_list_as_strings(list(test_df["question1"]))
q2_test_raw  =  cast_list_as_strings(list(test_df["question2"]))


all_questions_raw = q1_train_raw + q2_train_raw

This code takes a while to compute, so we write the result in a text file. We DON'T NEED TO RUN THE FOLLOWING CELLS, they are here to illustrate the process that we have done. 

In [9]:
#Load the words of our corpus
nltk.download('words')
words = nltk.corpus.words.words()
words.extend(['online', 'Quora'])

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [7]:
#Create the spelling correction object (it will create the BK tree)
spelling_c = Spelling_Correction_c( words, tol = 1)

In [8]:
def document_cleaner_spelling(spelling, text):
    clean_doc_pattern = re.compile( r"('\w)|([^a-zA-Z0-9.])") #Find words containing alphanumeric or points
    q = re.sub('\'s', '', text) #Remove 's
    q = re.sub('\'t', ' not', q) #Change 't for not'
    q = re.sub('\'re', ' are', q) #Change 're for are'
    q = re.sub('[?%!@#$\'\""]', '', q)#Remove symbols
    q = re.sub('\.\s', ' ', q)#Remove points with a space afterwards
    clean_q = clean_doc_pattern.sub(" ", q)
    correct_q = spelling_c.correct_text(clean_q)#Clean spelling mistakes
    return correct_q

document_cleaner_spelling(spelling_c, "Why does Sasuke have only two cars in ex-girlfriend The Movie?")

'why do sasuke have only two car in ex girlfriend the movie'

In [None]:
i=0
print(len(q1_train))
q1_train_cleaned = []
for quest in q1_train:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q1_train_cleaned.append(quest_cl)
    i+=1

with open('cleaned_data/q1_train_cleaned.txt', 'w') as f:
    for item in q1_train_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q2_train))
q2_train_cleaned = []
for quest in q2_train:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q2_train_cleaned.append(quest_cl)
    i+=1

with open('cleaned_data/q2_train_cleaned.txt', 'w') as f:
    for item in q2_train_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q1_val_raw))
q1_val_cleaned = []
for quest in q1_val_raw:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q1_val_cleaned.append(quest_cl)
    i+=1

with open('cleaned_data/q1_val_cleaned.txt', 'w') as f:
    for item in q1_val_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q2_val_raw))
q2_val_cleaned = []
for quest in q2_val_raw:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q2_val_cleaned.append(quest_cl)
    i+=1

with open('cleaned_data/q2_val_cleaned.txt', 'w') as f:
    for item in q2_val_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q1_test))
q1_test_cleaned = []
for quest in q1_test:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q1_test_cleaned.append(quest_cl)
    i+=1

with open('cleaned_data/q1_test_cleaned.txt', 'w') as f:
    for item in q1_test_cleaned:
        f.write("%s\n" % item)

In [None]:
i=0
print(len(q2_test))
q2_test_cleaned = []
for quest in q2_test:
    #print(i)
    quest_cl = document_cleaner_spelling(spelling_c,quest)
    q2_test_cleaned.append(quest_cl)
    i+=1

with open('cleaned_data/q2_test_cleaned.txt', 'w') as f:
    for item in q2_test_cleaned:
        f.write("%s\n" % item)

In [47]:
with open('cleaned_data/train_labels.txt', 'w') as f:
    for item in train_df['is_duplicate'].values:
        f.write("%s\n" % item)
        
with open('cleaned_data/val_labels.txt', 'w') as f:
    for item in val_df['is_duplicate'].values:
        f.write("%s\n" % item)
        
with open('cleaned_data/test_labels.txt', 'w') as f:
    for item in test_df['is_duplicate'].values:
        f.write("%s\n" % item)


## Run from here

We run these cells to obtain the results of the  cleaned text fromthe txt files.

In [10]:
with open('cleaned_data/q1_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_train = [x.strip() for x in content] 

In [11]:
with open('cleaned_data/q2_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_train = [x.strip() for x in content] 

In [12]:
with open('cleaned_data/q1_val_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_val = [x.strip() for x in content] 

In [13]:
with open('cleaned_data/q2_val_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_val = [x.strip() for x in content] 

In [14]:
with open('cleaned_data/q1_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_test = [x.strip() for x in content] 

In [15]:
with open('cleaned_data/q2_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_test = [x.strip() for x in content] 

In [16]:
all_questions= q1_train + q2_train

In [17]:
with open('cleaned_data/train_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
train_labels = [int(x.strip()) for x in content] 

with open('cleaned_data/val_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
val_labels = [int(x.strip()) for x in content] 

with open('cleaned_data/test_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
test_labels = [int(x.strip()) for x in content] 

# FIRST MODEL: NAIVE CLASSIFIER WITH SPELL CHECKING
For the first model, we just wanted to see what difference did the spellchecking do. So, did we improve the results? Did we improve the results as expected? If so, what mistakes is our model doing now?

We will do this checking for bot the CountVectorizer and the TfIdfVectorizer

## First model - with CountVectorizer

In [18]:
#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)

In [19]:
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

CountVectorizer(doc_cleaner_pattern="('\\w+)", document_cleaner_func=None,
        dtype=<class 'numpy.float32'>, max_df=0.99, min_df=5,
        min_word_counts=1, ngram_range=(1, 3),
        stop_words={'for', 'when', 'm', 'ma', "wasn't", "couldn't", 'yourselves', 'wasn', 'too', 'those', "haven't", 'couldn', "weren't", 'your', 're', 'no', 'is', 'other', 'theirs', 'who', 'very', 'itself', 'while', 'that', 'off', 'll', 'am', 'if', 'until', 'them', 'which', 'during', 'wouldn', "you'll", 'b...', "mustn't", 'didn', 'only', 'their', 'of', 'above', 'out', 'mightn', 'aren', 's', "you're", 'was'},
        token_cleaner_func=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer_func=None)

In [18]:
CountVectorizer.dump("models/CountVectorizer.pkl")

In [20]:
countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=5,
        ngram_range=(1, 3), preprocessor=None,
        stop_words={'for', 'when', 'm', 'ma', "wasn't", "couldn't", 'yourselves', 'wasn', 'too', 'those', "haven't", 'couldn', "weren't", 'your', 're', 'no', 'is', 'other', 'theirs', 'who', 'very', 'itself', 'while', 'that', 'off', 'll', 'am', 'if', 'until', 'them', 'which', 'during', 'wouldn', "you'll", 'b...', "mustn't", 'didn', 'only', 'their', 'of', 'above', 'out', 'mightn', 'aren', 's', "you're", 'was'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

We will the result obtained with our implementation of the CountVectorizer with the result obtained using the sklearn version of the CountVectorizer. One of the objectives of this deliverable was to implement and understend how the vectorizers worked, so we set as an objective to obtain the same as the sklearn vectorizers.

In [21]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)
print("With our CountVectorizer: ", X_tr_q1q2.shape, train_df.shape)

X_tr_q1q2_sk = get_features_from_list(q1_train, q2_train,countvect_sk)
print("With sklearn CountVectorizer: ", X_tr_q1q2_sk.shape, train_df.shape)

X_val_q1q2  = get_features_from_list(q1_val, q2_val, CountVectorizer)

X_val_q1q2_sk  = get_features_from_list(q1_val, q2_val, countvect_sk)

X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

X_te_q1q2_sk  = get_features_from_list(q1_test, q2_test, countvect_sk)

With our CountVectorizer:  (291088, 285364) (291088, 6)
With sklearn CountVectorizer:  (291088, 285364) (291088, 6)


Result using our implementation of CountVectorizer

In [21]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_labels)

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

#test roc auc metrics
print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(X_val_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8873892706438838
Result on validation:  0.7564062563763118
Result on test:  0.7533668105876712


Finally, we store the parameters of the logistic regression in a JSON file, so that we can import them later easily.

In [22]:
def save_logistic(logistic, filename):
    logistic_params = {}
    logistic_params['coef_'] = logistic.coef_.tolist()
    logistic_params['classes_'] = logistic.classes_.tolist()
    logistic_params['intercept_'] = logistic.intercept_.tolist()

    #Dump the parameters to a JSON file
    with open(filename, 'w') as fp:
        json.dump(logistic_params, fp)

In [23]:
save_logistic(logistic, 'models/logistic.json')

Result using sklearn implementation of CountVectorizer

In [24]:
logistic_sk = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic_sk.fit(X_tr_q1q2_sk, train_labels)

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic_sk.predict(X_tr_q1q2_sk)))

#test roc auc metrics
print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic_sk.predict(X_val_q1q2_sk)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic_sk.predict(X_te_q1q2_sk)))

Result on train:  0.8873667596649119
Result on validation:  0.7563817405636126
Result on test:  0.7533304796390545


In [25]:
save_logistic(logistic_sk, 'models/logistic_sk.json')

### Qualitative information about the mistakes

We wanted to identify the mistakes that the classifier was doing in this case. We saw that the classifier was making mistages mainly for the following reasons:
- The questions are the same, but the sentences have lots of different words.
- The questions are the same, but one sentence is way larger than the other.
- The questions are asking about the same thing but for different years, hence they must be classified as different.
- One of the questions is a subset of the other. This mistake is the harder to solve because sometimes it is even debatable of the questions should be the same or not.

In [26]:
print("Accuracy on training: ", np.sum(train_labels==logistic.predict(X_tr_q1q2))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels ==logistic.predict(X_val_q1q2))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==logistic.predict(X_te_q1q2))/len(test_labels))

Accuracy on training:  0.9074094431924367
Accuracy on validation:  0.788647044274054
Accuracy on test:  0.7851665883400529


In [27]:
mistake_indices, predictions = get_mistakes(logistic, X_tr_q1q2, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)

Original q1:  Why do men like women's feet?  Treated q1:  why do men like woman foot
Original q2:  Why do men like womens feet?  Treated q2:  why do men like woman foot
true class: 1
prediction: 0


## First model - with TfIdfVectorizer

In [28]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)
tfidf_vectorizer.dump("models/TfIdfVectorizer.pkl")

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=5,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=False,
        stop_words={'y', 'shan', 'hers', 'by', 'against', 'a', 'here', 'down', 'from', 'now', "wouldn't", 'their', 'off', 'an', 'once', 'ourselves', 'mustn', 'there', 'those', 'after', "isn't", 'does', 'where', "you're", 'did', 'both', 've', "hasn't", "she's", 'we', 'very', 'each', 'of', 'will', 'can', 'him..., 'few', 'my', 'her', 'itself', 'have', 'then', "aren't", 'that', 'on', 'between', 'these', 'doesn'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=False,
        vocabulary=None)

Again, remember that additionally, we want to compare our result with that given by the implementation of sklearn of the TfIdfVectorizer.

In [29]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_tr_q1q2_sk = get_features_from_list(q1_train, q2_train, tfidf_sk)
X_val_q1q2  = get_features_from_list(q1_val, q2_val, tfidf_vectorizer)
X_val_q1q2_sk  = get_features_from_list(q1_val, q2_val, tfidf_sk)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)
X_te_q1q2_sk  = get_features_from_list(q1_test, q2_test, tfidf_sk)

print("With our TfIdf Vectorizer:", X_tr_q1q2.shape, train_df.shape)
print("With sklearn TfIdf Vectorizer:", X_tr_q1q2_sk.shape, train_df.shape)

With our TfIdf Vectorizer: (291088, 285364) (291088, 6)
With sklearn TfIdf Vectorizer: (291088, 285364) (291088, 6)


Result using our implementation of TfIdf Vectorizer

In [31]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_labels)

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(X_val_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8303725534934316
Result on validation:  0.7532320054109183
Result on test:  0.7486764630961492


In [32]:
save_logistic(logistic, 'models/logistic_tfidf.json')

Result using sklearn implementation of TfIdf Vectorizer. Note that the result is different because the formula that sklearn uses is different from ours.

In [33]:
logistic_sk = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic_sk.fit(X_tr_q1q2_sk, train_labels)

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic_sk.predict(X_tr_q1q2_sk)))

print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic_sk.predict(X_val_q1q2_sk)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic_sk.predict(X_te_q1q2_sk)))

Result on train:  0.8039620081117552
Result on validation:  0.7444771581829623
Result on test:  0.7401463445535814


In [34]:
save_logistic(logistic_sk, 'models/logistic_tfidf_sk.json')

### Qualitative information about the mistakes

So, in this case, the mistakes are practically the same, hence:
- The questions are the same, but the sentences have lots of different words.
- The questions are the same, but one sentence is way larger than the other.
- The questions are asking about the same thing but for different years, hence they must be classified as different.
- One of the questions is a subset of the other. This mistake is the harder to solve because sometimes it is even debatable of the questions should be the same or not.

In [35]:
print("Accuracy on training: ", np.sum(train_labels ==logistic.predict(X_tr_q1q2))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels ==logistic.predict(X_val_q1q2))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==logistic.predict(X_te_q1q2))/len(test_labels))

Accuracy on training:  0.8615642005166823
Accuracy on validation:  0.7928827603264902
Accuracy on test:  0.7880729179549334


In [36]:
mistake_indices, predictions = get_mistakes(logistic, X_tr_q1q2, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)

Original q1:  Are Persians considered Caucasian?  Treated q1:  be Persian consider Caucasian
Original q2:  Are Persians White?  Treated q2:  be Persian white
true class: 1
prediction: 0


# SECOND MODEL: NAIVE CLASSIFIER WITH EXTRA FEATURES

Given the mistakes encountered in the previous model, we tried to code some extra features to tackle with those problems.

### Code to obtain the extra features.

Here we give a list of extra features that we could add to the feature vector.

1. Lenght of the question

2. Is there a [math] tag? 

3. Is there a number in the question?

4. Is it the same number in both questions? 

5. % of intersection words?


In [22]:
def get_qlength(questions):
    qlen = []
    for quest in questions:
        clean_doc_pattern = re.compile( r"('\w)|([^a-zA-Z0-9.])") #Find words containing alphanumeric or points
        q = re.sub('\'s', '', quest) #Remove 's
        q = re.sub('\'t', ' not', q) #Change 't for not'
        q = re.sub('\'re', ' are', q) #Change 're for are'
        q = re.sub('[?%!@#$\'\""]', '', q)#Remove symbols
        q = re.sub('\.\s', ' ', q)#Remove points with a space afterwards
        clean_q = clean_doc_pattern.sub(" ", q)
        qlen.append(len(re.findall(r"(?u)\b[\w.,]+\b",q)))
        
    return np.array(qlen).reshape(-1,1)

def is_math(questions):
    math=[]
    for quest in questions:
        if '[math]' in quest:
            math.append(1)
        else:
            math.append(0)
    return np.array(math).reshape(-1,1)
    
def is_number(word):
    try :  
        w = float(word) 
        if(np.isnan(w)):
            return 0
        if(np.isinf(w)):
            return 0
        res = 1
    except : 
        res = 0
    return res    

def has_numbers(questions):
    num=np.zeros((len(questions)))
    which_num = np.zeros((len(questions)))
    i=0
    for quest in questions:
        for w in re.findall(r"(?u)\b[\w.,]+\b",quest):
            is_num = is_number(w)
            if is_num==1:
                num[i]=1
                which_num[i]=float(w)
                if(np.isnan(which_num[i])):
                    print(which_num[i])
                    print(float(w))
                break
        i+=1
    return num.reshape(-1,1), which_num.reshape(-1,1)


def is_different_number(which_num1, which_num2):
    dif = which_num1 - which_num2
    dif[dif>0]=1
    return np.array(dif).reshape(-1,1)

In [23]:
def q1_q2_intersect(row, q1, q2, q_dict):
    set1 = set(q_dict[q1[row]])
    set2 = set(q_dict[q2[row]])
    return(len(set1.intersection(set2))/len(set1.union(set2)))


def intersection(q1_train, q2_train,q1_val,q2_val, q1_test, q2_test):
    q1 = q1_train + q1_val +  q1_test
    q2 = q2_train + q1_val + q2_test
    q_dict = defaultdict(set)
    for i in range(len(q1)):
            q_dict[q1[i]].add(q2[i])
            q_dict[q2[i]].add(q1[i])

    intersect_train = []
    intersect_test = []
    intersect_val = []
    for row in range(len(q1_train)):
        intersect_train.append(q1_q2_intersect(row, q1_train, q2_train, q_dict))
    
    for row in range(len(q1_val)):
        intersect_val.append(q1_q2_intersect(row, q1_val, q2_val, q_dict))
        
    for row in range(len(q1_test)):
        intersect_test.append(q1_q2_intersect(row, q1_test, q2_test, q_dict))
    
    intersect_train = np.array(intersect_train).reshape(-1,1)
    intersect_val = np.array(intersect_val).reshape(-1,1)
    intersect_test = np.array(intersect_test).reshape(-1,1)
    return intersect_train, intersect_val, intersect_test 

In [24]:
intersect_train, intersect_val, intersect_test = intersection(q1_train, q2_train, q1_val, q2_val, q1_test, q2_test)

In [25]:
ndocs, nvars = X_tr_q1q2.shape
nvars = (int)(nvars/2)

In [26]:
def intersection(Xq1,Xq2):
    union = scipy.sum((Xq1!=0)+(Xq2!=0) ,axis=1)
    union[union==0]=1
    intersection = scipy.sum((Xq1!=0).multiply(Xq2!=0), axis=1)
    return intersection/union

In [27]:
intersection_train = intersection(X_tr_q1q2[:,:nvars], X_tr_q1q2[:,nvars:])
intersect_val = intersection(X_val_q1q2[:,:nvars], X_val_q1q2[:,nvars:])
intersect_test = intersection(X_te_q1q2[:,:nvars], X_te_q1q2[:,nvars:])

In [28]:
num1_train, which_num1_train=  has_numbers(q1_train_raw)
num2_train, which_num2_train =  has_numbers(q2_train_raw)
dif_number_train = is_different_number(which_num1_train,which_num2_train)

num1_val, which_num1_val=  has_numbers(q1_val_raw)
num2_val, which_num2_val =  has_numbers(q2_val_raw)
dif_number_val = is_different_number(which_num1_val,which_num2_val)

num1_test, which_num1_test=  has_numbers(q1_test_raw)
num2_test, which_num2_test =  has_numbers(q2_test_raw)
dif_number_test = is_different_number(which_num1_test,which_num2_test)

In [29]:
math1_train = is_math(q1_train_raw)
math2_train = is_math(q2_train_raw)

math1_val = is_math(q1_val_raw)
math2_val = is_math(q2_val_raw)

math1_test = is_math(q1_test_raw)
math2_test = is_math(q2_test_raw)

In [30]:
len1_train = get_qlength(q1_train_raw)
len2_train = get_qlength(q2_train_raw)

len1_val = get_qlength(q1_val_raw)
len2_val = get_qlength(q2_val_raw)

len1_test = get_qlength(q1_test_raw)
len2_test = get_qlength(q2_test_raw)

## Second model - with CountVectorizer

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
nltk.download('wordnet')


#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

In this case, since we have already check that our CountVectorizer yields the same result as the sklearn one, we will only use ours.

In [31]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)
X_val_q1q2  = get_features_from_list(q1_val, q2_val, CountVectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

In [32]:
print('initial shape', X_tr_q1q2.shape)

new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))

new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))

new_X_val_q1q2 = sparse.hstack((X_val_q1q2,intersect_val, num1_val, num2_val,
                               dif_number_val, math1_val,math2_val,len1_val, len2_val))

print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


We obtain the following result. We see that the AUC has dropped a lot. We think that this may be due to the imbalance of the values of the different features, i.e., we are not normalizing the values of any of the features. We thought that it would be necessary to change the model, then.

In [46]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(new_X_tr_q1q2, train_labels)

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(new_X_tr_q1q2)))

#val roc auc metrics
print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(new_X_val_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(new_X_te_q1q2)))

Result on train:  0.4849733415832702
Result on validation:  0.48544005699141674
Result on test:  0.48445389374241093


In [47]:
save_logistic(logistic, 'models/logistic_extra_features.json')

## Second model - with TfIdfVectorizer

In [48]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.99, max_features=None,
                min_df=5, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=False,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                use_idf=False, vocabulary=None)

Again we will only run the code for our TfIdfVectorizer.

In [49]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_val_q1q2  = get_features_from_list(q1_val, q2_val, tfidf_vectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)

In [50]:
print('initial shape', X_tr_q1q2.shape)

new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))
new_X_val_q1q2 = sparse.hstack((X_val_q1q2,intersect_val, num1_val, num2_val,
                               dif_number_val, math1_val,math2_val,len1_val, len2_val))
new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))

print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


A very similar thing happens with the tfidfVectorizer.

In [51]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(new_X_tr_q1q2, train_labels)

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(new_X_tr_q1q2)))

print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(new_X_val_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(new_X_te_q1q2)))

Result on train:  0.4849733415832702
Result on validation:  0.48544005699141674
Result on test:  0.48445389374241093


In [52]:
save_logistic(logistic, 'models/logistic_extra_features_tfidf.json')

# THIRD MODEL: XGBOOST

Given all the previous results, a thing was clear: we needed to change the classifier. So our take was: combine everything we have done until now (text with the spell checking and the extra features) but with a more sophisticated model. We chose the XGBoost.

## Third model - with CountVectorizer

In [87]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
nltk.download('wordnet')


#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 6)
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)
X_val_q1q2 = get_features_from_list(q1_val, q2_val,CountVectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

print('initial shape', X_tr_q1q2.shape)
new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))
new_X_val_q1q2 = sparse.hstack((X_val_q1q2,intersect_val, num1_val, num2_val,
                               dif_number_val, math1_val,math2_val,len1_val, len2_val))
new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))
print('final shape', new_X_tr_q1q2.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ignasi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


initial shape (291088, 227068)
final shape (291088, 227076)


In [None]:
import xgboost as xgb

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)
d_val = xgb.DMatrix(new_X_val_q1q2, label=val_labels)

evallist = [(d_train, 'train'), (d_test, 'test'), (d_val, 'val')]

num_iters = 50000

xgb_count = xgb.train(params, d_train, num_iters, evallist, early_stopping_rounds=50, verbose_eval=10)


Save the model

In [57]:
xgb_count.save_model('models/xgb_count')

In [69]:
d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)
d_val = xgb.DMatrix(new_X_val_q1q2, label=val_labels)

pred_test = xgb_count.predict(d_test)
pred_train = xgb_count.predict(d_train)
pred_val = xgb_count.predict(d_val)

print("Accuracy on training: ", np.sum(train_labels==pred_train.round(0).astype(int))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels==pred_val.round(0).astype(int))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==pred_test.round(0).astype(int))/len(test_labels))

#train roc auc metrics
print("AUC on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = pred_train))

#test roc auc metrics
print("AUC on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = pred_val))

#test roc auc metrics
print("AUC on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = pred_test))

mistake_indices, predictions = get_mistakes(xgb_count, d_train, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)      




Accuracy on training:  0.8415084098279558
Accuracy on validation:  0.8431548355181796
Accuracy on test:  0.8305547997724406
AUC on train:  0.9159446541394345
AUC on validation:  0.8989376768402263
AUC on test:  0.8939774284673486
Original q1:  What are 10 things you would tell your 19 year old self?  Treated q1:  what be 10 thing you would tell your 19 year old self
Original q2:  What are some of the most important things you would tell your 19 year old self?  Treated q2:  what be some of the most important thing you would tell your 19 year old self
true class: 1
prediction: 0


## Third model - with TfIdfVectorizer

In [70]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions)

X_tr_q1q2 = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_val_q1q2 = get_features_from_list(q1_val, q2_val,tfidf_vectorizer)
X_te_q1q2  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)

print('initial shape', X_tr_q1q2.shape)
new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))
new_X_val_q1q2 = sparse.hstack((X_val_q1q2,intersect_val, num1_val, num2_val,
                               dif_number_val, math1_val,math2_val,len1_val, len2_val))
new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))
print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


In [71]:
import xgboost as xgb

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)
d_val = xgb.DMatrix(new_X_val_q1q2, label=val_labels)

evallist = [(d_train, 'train'), (d_test, 'test'), (d_val, 'val')]

num_iters = 50000

xgb_tfidf = xgb.train(params, d_train, num_iters, evallist, early_stopping_rounds=50, verbose_eval=10)


[0]	train-auc:0.82935	test-auc:0.82916	val-auc:0.83357
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 50 rounds.
[10]	train-auc:0.83838	test-auc:0.83868	val-auc:0.84108
[20]	train-auc:0.84049	test-auc:0.84095	val-auc:0.84337
[30]	train-auc:0.84172	test-auc:0.84195	val-auc:0.84498
[40]	train-auc:0.84266	test-auc:0.84295	val-auc:0.84631
[50]	train-auc:0.84302	test-auc:0.84349	val-auc:0.84688
[60]	train-auc:0.84321	test-auc:0.84352	val-auc:0.84705
[70]	train-auc:0.84330	test-auc:0.84355	val-auc:0.84719
[80]	train-auc:0.84556	test-auc:0.84539	val-auc:0.84932
[90]	train-auc:0.84538	test-auc:0.84523	val-auc:0.84918
[100]	train-auc:0.84570	test-auc:0.84549	val-auc:0.84942
[110]	train-auc:0.84674	test-auc:0.84681	val-auc:0.85088
[120]	train-auc:0.84779	test-auc:0.84781	val-auc:0.85189
[130]	train-auc:0.84844	test-auc:0.84837	val-auc:0.85277
[140]	train-auc:0.84923	test-auc:0.84915	val-auc:0.85364
[150]	train-auc:0

[1410]	train-auc:0.87788	test-auc:0.87283	val-auc:0.87758
[1420]	train-auc:0.87795	test-auc:0.87286	val-auc:0.87762
[1430]	train-auc:0.87803	test-auc:0.87291	val-auc:0.87765
[1440]	train-auc:0.87815	test-auc:0.87296	val-auc:0.87772
[1450]	train-auc:0.87825	test-auc:0.87302	val-auc:0.87778
[1460]	train-auc:0.87833	test-auc:0.87306	val-auc:0.87785
[1470]	train-auc:0.87842	test-auc:0.87313	val-auc:0.87789
[1480]	train-auc:0.87848	test-auc:0.87317	val-auc:0.87792
[1490]	train-auc:0.87856	test-auc:0.87320	val-auc:0.87799
[1500]	train-auc:0.87863	test-auc:0.87325	val-auc:0.87804
[1510]	train-auc:0.87875	test-auc:0.87334	val-auc:0.87812
[1520]	train-auc:0.87884	test-auc:0.87340	val-auc:0.87812
[1530]	train-auc:0.87893	test-auc:0.87346	val-auc:0.87820
[1540]	train-auc:0.87901	test-auc:0.87351	val-auc:0.87827
[1550]	train-auc:0.87913	test-auc:0.87360	val-auc:0.87833
[1560]	train-auc:0.87922	test-auc:0.87366	val-auc:0.87839
[1570]	train-auc:0.87930	test-auc:0.87371	val-auc:0.87844
[1580]	train-a

[2830]	train-auc:0.88783	test-auc:0.87850	val-auc:0.88334
[2840]	train-auc:0.88789	test-auc:0.87853	val-auc:0.88338
[2850]	train-auc:0.88795	test-auc:0.87856	val-auc:0.88342
[2860]	train-auc:0.88800	test-auc:0.87858	val-auc:0.88345
[2870]	train-auc:0.88805	test-auc:0.87859	val-auc:0.88347
[2880]	train-auc:0.88810	test-auc:0.87861	val-auc:0.88347
[2890]	train-auc:0.88818	test-auc:0.87866	val-auc:0.88355
[2900]	train-auc:0.88823	test-auc:0.87867	val-auc:0.88355
[2910]	train-auc:0.88831	test-auc:0.87874	val-auc:0.88362
[2920]	train-auc:0.88837	test-auc:0.87876	val-auc:0.88366
[2930]	train-auc:0.88841	test-auc:0.87879	val-auc:0.88367
[2940]	train-auc:0.88847	test-auc:0.87883	val-auc:0.88369
[2950]	train-auc:0.88852	test-auc:0.87885	val-auc:0.88371
[2960]	train-auc:0.88856	test-auc:0.87888	val-auc:0.88373
[2970]	train-auc:0.88861	test-auc:0.87890	val-auc:0.88377
[2980]	train-auc:0.88866	test-auc:0.87892	val-auc:0.88380
[2990]	train-auc:0.88871	test-auc:0.87896	val-auc:0.88383
[3000]	train-a

[4250]	train-auc:0.89469	test-auc:0.88172	val-auc:0.88676
[4260]	train-auc:0.89472	test-auc:0.88173	val-auc:0.88677
[4270]	train-auc:0.89477	test-auc:0.88175	val-auc:0.88678
[4280]	train-auc:0.89481	test-auc:0.88177	val-auc:0.88679
[4290]	train-auc:0.89485	test-auc:0.88179	val-auc:0.88682
[4300]	train-auc:0.89489	test-auc:0.88181	val-auc:0.88683
[4310]	train-auc:0.89492	test-auc:0.88182	val-auc:0.88684
[4320]	train-auc:0.89496	test-auc:0.88182	val-auc:0.88687
[4330]	train-auc:0.89502	test-auc:0.88184	val-auc:0.88691
[4340]	train-auc:0.89506	test-auc:0.88186	val-auc:0.88693
[4350]	train-auc:0.89509	test-auc:0.88187	val-auc:0.88694
[4360]	train-auc:0.89512	test-auc:0.88188	val-auc:0.88694
[4370]	train-auc:0.89516	test-auc:0.88189	val-auc:0.88696
[4380]	train-auc:0.89522	test-auc:0.88192	val-auc:0.88698
[4390]	train-auc:0.89527	test-auc:0.88194	val-auc:0.88702
[4400]	train-auc:0.89530	test-auc:0.88196	val-auc:0.88704
[4410]	train-auc:0.89533	test-auc:0.88197	val-auc:0.88704
[4420]	train-a

[5670]	train-auc:0.89996	test-auc:0.88380	val-auc:0.88892
[5680]	train-auc:0.90000	test-auc:0.88382	val-auc:0.88892
[5690]	train-auc:0.90003	test-auc:0.88383	val-auc:0.88893
[5700]	train-auc:0.90006	test-auc:0.88385	val-auc:0.88895
[5710]	train-auc:0.90009	test-auc:0.88387	val-auc:0.88896
[5720]	train-auc:0.90012	test-auc:0.88387	val-auc:0.88897
[5730]	train-auc:0.90015	test-auc:0.88388	val-auc:0.88897
[5740]	train-auc:0.90018	test-auc:0.88389	val-auc:0.88898
[5750]	train-auc:0.90021	test-auc:0.88391	val-auc:0.88899
[5760]	train-auc:0.90025	test-auc:0.88392	val-auc:0.88899
[5770]	train-auc:0.90028	test-auc:0.88394	val-auc:0.88901
[5780]	train-auc:0.90031	test-auc:0.88394	val-auc:0.88902
[5790]	train-auc:0.90034	test-auc:0.88396	val-auc:0.88903
[5800]	train-auc:0.90037	test-auc:0.88396	val-auc:0.88903
[5810]	train-auc:0.90041	test-auc:0.88396	val-auc:0.88904
[5820]	train-auc:0.90045	test-auc:0.88398	val-auc:0.88907
[5830]	train-auc:0.90049	test-auc:0.88400	val-auc:0.88909
[5840]	train-a

[7090]	train-auc:0.90439	test-auc:0.88534	val-auc:0.89044
[7100]	train-auc:0.90442	test-auc:0.88535	val-auc:0.89045
[7110]	train-auc:0.90445	test-auc:0.88536	val-auc:0.89046
[7120]	train-auc:0.90448	test-auc:0.88537	val-auc:0.89047
[7130]	train-auc:0.90451	test-auc:0.88538	val-auc:0.89048
[7140]	train-auc:0.90455	test-auc:0.88541	val-auc:0.89051
[7150]	train-auc:0.90458	test-auc:0.88542	val-auc:0.89051
[7160]	train-auc:0.90460	test-auc:0.88542	val-auc:0.89051
[7170]	train-auc:0.90463	test-auc:0.88543	val-auc:0.89052
[7180]	train-auc:0.90465	test-auc:0.88544	val-auc:0.89051
[7190]	train-auc:0.90468	test-auc:0.88545	val-auc:0.89051
[7200]	train-auc:0.90470	test-auc:0.88546	val-auc:0.89052
[7210]	train-auc:0.90473	test-auc:0.88546	val-auc:0.89052
[7220]	train-auc:0.90476	test-auc:0.88546	val-auc:0.89053
[7230]	train-auc:0.90479	test-auc:0.88547	val-auc:0.89054
[7240]	train-auc:0.90481	test-auc:0.88548	val-auc:0.89054
[7250]	train-auc:0.90483	test-auc:0.88549	val-auc:0.89055
[7260]	train-a

[8510]	train-auc:0.90815	test-auc:0.88651	val-auc:0.89160
[8520]	train-auc:0.90817	test-auc:0.88652	val-auc:0.89162
[8530]	train-auc:0.90819	test-auc:0.88652	val-auc:0.89162
[8540]	train-auc:0.90822	test-auc:0.88653	val-auc:0.89164
[8550]	train-auc:0.90824	test-auc:0.88654	val-auc:0.89164
[8560]	train-auc:0.90827	test-auc:0.88654	val-auc:0.89165
[8570]	train-auc:0.90829	test-auc:0.88654	val-auc:0.89166
[8580]	train-auc:0.90832	test-auc:0.88655	val-auc:0.89166
[8590]	train-auc:0.90834	test-auc:0.88655	val-auc:0.89167
[8600]	train-auc:0.90837	test-auc:0.88655	val-auc:0.89167
[8610]	train-auc:0.90839	test-auc:0.88656	val-auc:0.89169
[8620]	train-auc:0.90842	test-auc:0.88657	val-auc:0.89171
[8630]	train-auc:0.90845	test-auc:0.88657	val-auc:0.89172
[8640]	train-auc:0.90847	test-auc:0.88658	val-auc:0.89173
[8650]	train-auc:0.90849	test-auc:0.88658	val-auc:0.89174
[8660]	train-auc:0.90852	test-auc:0.88659	val-auc:0.89175
[8670]	train-auc:0.90855	test-auc:0.88660	val-auc:0.89177
[8680]	train-a

[9930]	train-auc:0.91153	test-auc:0.88747	val-auc:0.89269
[9940]	train-auc:0.91155	test-auc:0.88747	val-auc:0.89271
[9950]	train-auc:0.91158	test-auc:0.88748	val-auc:0.89272
[9960]	train-auc:0.91160	test-auc:0.88748	val-auc:0.89272
[9970]	train-auc:0.91162	test-auc:0.88749	val-auc:0.89273
[9980]	train-auc:0.91164	test-auc:0.88750	val-auc:0.89273
[9990]	train-auc:0.91167	test-auc:0.88751	val-auc:0.89274
[10000]	train-auc:0.91169	test-auc:0.88752	val-auc:0.89275
[10010]	train-auc:0.91172	test-auc:0.88753	val-auc:0.89275
[10020]	train-auc:0.91173	test-auc:0.88753	val-auc:0.89276
[10030]	train-auc:0.91175	test-auc:0.88754	val-auc:0.89277
[10040]	train-auc:0.91178	test-auc:0.88754	val-auc:0.89277
[10050]	train-auc:0.91180	test-auc:0.88754	val-auc:0.89277
[10060]	train-auc:0.91182	test-auc:0.88755	val-auc:0.89277
[10070]	train-auc:0.91184	test-auc:0.88755	val-auc:0.89277
[10080]	train-auc:0.91186	test-auc:0.88756	val-auc:0.89277
[10090]	train-auc:0.91189	test-auc:0.88757	val-auc:0.89277
[101

[11320]	train-auc:0.91448	test-auc:0.88827	val-auc:0.89358
[11330]	train-auc:0.91450	test-auc:0.88828	val-auc:0.89359
[11340]	train-auc:0.91452	test-auc:0.88828	val-auc:0.89359
[11350]	train-auc:0.91454	test-auc:0.88828	val-auc:0.89360
[11360]	train-auc:0.91456	test-auc:0.88829	val-auc:0.89360
[11370]	train-auc:0.91458	test-auc:0.88830	val-auc:0.89361
[11380]	train-auc:0.91460	test-auc:0.88830	val-auc:0.89361
[11390]	train-auc:0.91462	test-auc:0.88831	val-auc:0.89362
[11400]	train-auc:0.91465	test-auc:0.88831	val-auc:0.89362
[11410]	train-auc:0.91467	test-auc:0.88831	val-auc:0.89363
[11420]	train-auc:0.91468	test-auc:0.88831	val-auc:0.89364
[11430]	train-auc:0.91470	test-auc:0.88832	val-auc:0.89364
[11440]	train-auc:0.91472	test-auc:0.88833	val-auc:0.89366
[11450]	train-auc:0.91474	test-auc:0.88835	val-auc:0.89366
[11460]	train-auc:0.91477	test-auc:0.88836	val-auc:0.89368
[11470]	train-auc:0.91480	test-auc:0.88838	val-auc:0.89369
[11480]	train-auc:0.91482	test-auc:0.88839	val-auc:0.893

Save the model

In [72]:
xgb_tfidf.save_model('models/xgb_tfidf')

In [74]:
d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)
d_val = xgb.DMatrix(new_X_val_q1q2, label=val_labels)

pred_test = xgb_tfidf.predict(d_test)
pred_val = xgb_tfidf.predict(d_val)
pred_train = xgb_tfidf.predict(d_train)

print("Accuracy on training: ", np.sum(train_labels==pred_train.round(0).astype(int))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels==pred_val.round(0).astype(int))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==pred_test.round(0).astype(int))/len(test_labels))

#train roc auc metrics
print("AUC on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = pred_train))

#test roc auc metrics
print("AUC on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = pred_val))

#test roc auc metrics
print("AUC on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = pred_test))


mistake_indices, predictions = get_mistakes(xgb_tfidf, d_train, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)      


Accuracy on training:  0.8403609904908481
Accuracy on validation:  0.8428765767994064
Accuracy on test:  0.8288975735239555
AUC on train:  0.9168234628206468
AUC on validation:  0.8942311188490448
AUC on test:  0.8889539442559593
Original q1:  What are 10 things you would tell your 19 year old self?  Treated q1:  what be 10 thing you would tell your 19 year old self
Original q2:  What are some of the most important things you would tell your 19 year old self?  Treated q2:  what be some of the most important thing you would tell your 19 year old self
true class: 1
prediction: 0


# FOURTH MODEL: DIFFERENT APPROACH WITH DEEP LEARNING

Our main objective for this deliverable was to work with a more classic approach for natural language processing, mainly to implement and understand the CountVectorizer and TfIdfVectorizer. Additionally, we tried to work on the mistakes and limitations that this approach had, hence having to do a bit of feature engeenireing to tackle those problems.

However, nowadays deep learning is used practically to solve anything, so, how well could it work to solve this problem? In this section we explore a completely different approach using deep learning.

# LAST BUT NOT LEAST: LET'S DO PIPELINES

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
import extra_features
import xgboost as xgb

To pass data from CountVectorizer to model automatically, we need to transform the data first. This is just a hacky way to do it. 

Note that the correct way is to modify the CountVectorized that we implemented so that the output of the  transform is already the desired matrix (i.e. doing the hstack inside). We tried to implement the most general CountVectorized that we could, so we have to resort to this way. It is in no way slower in terms of speed. It is just ugly in the sense that is doing something hack-ish. 

In [34]:
from pipeline_classes import CountVectorizerTransformer, XGBModel

In [35]:
model_cv = Pipeline([
    ('countVectorizer', CountVectorizerTransformer(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)),
    ('model', XGBModel(objective='binary:logistic', eval_metric='auc', eta=0.02, max_depth=4))])

In [36]:
model_cv.fit(all_questions, train_labels)

Pipeline(memory=None,
     steps=[('countVectorizer', CountVectorizerTransformer()), ('model', XGBModel())])

In [37]:
all_questions_test = q1_test+q2_test
all_questions_val = q1_val+q2_val

In [38]:
sklearn.metrics.roc_auc_score(val_labels, model_cv.predict(all_questions_val))

0.832060706396399

In [39]:
sklearn.metrics.roc_auc_score(test_labels, model_cv.predict(all_questions_test))

0.8356496066563877

In [40]:
model_cv.get_params()['countVectorizer'].dump("models/Pipeline_CountVectorizer.pkl")
model_cv.get_params()['model'].dump("models/Pipeline_XGBoost.pkl")

### Make Pipeline Great Again: GridSearch Hyperparameters

To select samples to cross validate, we need the samples to be in pair with the labels. Right now we dont have that, so we are going to (once again) hack our way through it.

Again note that the correct way to do it is to modify the CountVectorizer so that it automatically does this process inside the fit and transform functions. But following our filosophy of having a general CountVectorizer, this is a good way to solve the problem.

In [25]:
ndocs = (int)(len(all_questions)/2)
all_questions_tuples = [(q1_train[i], q2_train[i]) for i in range(ndocs)]

class MiddleTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        X_tr = self.CountVectorizer.transform(X)
        nexamples, nvars = X_tr.shape
        split = (int)(nexamples/2)
        XX1 = X_tr[:split,:]
        XX2 = X_tr[split:,:]
        return sparse.hstack([XX1,XX2], format='csr')

class CrossValidationTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        XX = [x[0] for x in X]
        for x in X:
            XX.append(x[1])
        return XX

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

mnb = MultinomialNB()

model_cv = Pipeline([
    ('crossValidationTransformer', CrossValidationTransformer()),
    ('countVectorizer', tf(#token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v"),
                     stop_words = set(stopwords.words('english')))),
    ('middleTransformer', MiddleTransformer()),
    ('model', MultinomialNB())
])

params = {'countVectorizer__ngram_range':[(1,3)], 
         'countVectorizer__min_df':[1,5,10,15,],
         'countVectorizer__max_df':[.4,.3],
         }

gs = GridSearchCV(model_cv, params, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2)

In [53]:
results = gs.fit(all_questions_tuples, train_labels)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 12.1min finished


In [56]:
results.best_params_

{'countVectorizer__max_df': 0.4,
 'countVectorizer__min_df': 1,
 'countVectorizer__ngram_range': (1, 3)}

In [51]:
results.best_score_

0.8436795654397367