In [1]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import collections
from scipy import sparse
import nltk
from collections import defaultdict
import re
from CountVectorizer_BagOfWords import CountVectorizer as cv
from TfIdfVectorizer import TfIdfVectorizer as tf
from Spelling_Correction_c  import Spelling_Correction_c 
import xgboost as xgb
import pickle
import json
from nltk.corpus import stopwords 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/laia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# DATA

We are trying to solve the following problem: given a pair of different questions of Quora, decide if they are asking the same or not. In this notebook, we will discuss the process we have followed to solve the problem, the different models that we have used as well as the mistakes that each model makes.

In [2]:
#read the data
train_df =pd.read_csv('train_df.csv')
test_df =pd.read_csv('test_df.csv')
val_df = pd.read_csv('val_df.csv')
train_df

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,311380,370141,108248,500659,How do I get home tutors?,How can I trust a home tutor?,0
1,62646,400219,349589,66001,"What is the difference between 'had been', 'ha...","When should I use ""has been"", ""have been"" and ...",0
2,98055,15247,29146,29147,If my ATM card is blocked for online transacti...,My credit card was used for fraud transactions...,0
3,127437,124101,200474,42953,How do I add USB 3.0 port in a laptop without ...,Can I use a USB 3.0 device in a USB 2.0 port?,0
4,111569,1333,2657,2658,What is the best Advantage of using Quora?,What is the benefit to Quora?,1
...,...,...,...,...,...,...,...
291083,192476,217697,306239,324116,Which is the best app for learning Yoga?,What is the best app to learn yoga?,1
291084,17730,81327,49754,89884,What's the main reason behind 500 & 1000 rs no...,What is the reason behind PM Modi's decision t...,1
291085,28030,401928,179454,143531,How can I find out my drivers license number u...,What's the best way to get my driver’s license...,0
291086,277869,231706,341577,341578,How has Bill Gates charity foundation helped t...,How has Bill Gates affected us and the world?,0


# AUX FUNCTIONS

We will use the following functions for some of the models.The first functions are meant to extract, given a vectorizer, the matrix of features for the classifier. The two last functions are used to identify the errors that a classifier is making.

In [3]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    mylist_aux = []
    
    for i in mylist:
        mylist_aux.append(str(i))
        
    return mylist_aux

from scipy.sparse import hstack

def get_features_from_list(q1,q2,count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    """
    
    q1_mat = count_vectorizer.transform(q1)
    q2_mat = count_vectorizer.transform(q2)
    X_q1q2 = hstack([q1_mat,q2_mat], format="csr")
            
    return X_q1q2
    

def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    
    #list of questions where each element of the question is of type string
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))    
    
    q1_mat = count_vectorizer.transform(q1_casted)
    q2_mat = count_vectorizer.transform(q2_casted)
    X_q1q2 = hstack([q1_mat,q2_mat], format="csr")
            
    return X_q1q2

def get_mistakes(clf, X_q1q2, y):
    """
    Returns two lists: one containing the indices of the predictions that are not correct
    and another one containing the predictions
    """
    predictions        = clf.predict(X_q1q2).round(0).astype(int)
    incorrect_preds    = predictions != y
    incorrect_indices, = np.where(incorrect_preds)
    incorrect_indices2 = [x for x in  range(len(incorrect_preds)) if incorrect_preds[x] ==True]
    incorrect_indices3 = np.arange(len(incorrect_preds))[incorrect_preds]        
    
    if np.sum(incorrect_preds)==0:
        print("no mistakes in this df")
    else:
        return incorrect_indices, predictions
    
def print_mistake_k(k, dataset, mistake_indices, predictions):
    """
    Auxiliar function to print the k-th mistake made in the prediction
    """
    print("Original q1: ", train_df.iloc[mistake_indices[k]].question1, " Treated q1: ", dataset[mistake_indices[k]])
    print("Original q2: ", train_df.iloc[mistake_indices[k]].question2, " Treated q2: ", dataset[mistake_indices[k]+train_df.shape[0]])
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])  
    
    
def load_logistic(filename):
    # Load the logistic parameters
    with open(filename, 'rb') as fp:
        logistic_params = json.load(fp)
        
    #Create logistic object
    logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
    logistic.coef_ = np.array(logistic_params['coef_'])
    logistic.classes_ = np.array(logistic_params['classes_'])
    logistic.intercept_ = np.array(logistic_params['intercept_'])
    
    return logistic



# PREPROCESS DATA

A first naive model was proposed in class: pass the text through the vectorizers and use the returned matrix as the matrix of features. We saw that the classifier wrongly classified some questions with spelling mistakes. For example, the classifiera would identify as different questions those who were written like "whats" from those who were written like "what's". 

We thought that this problem may be common with any model that we try to train, so the first thing we propose to do is correcting the spelling mistakes. We propose to remove "'s", change the negatives "'t" for "not" as well as the plurals "'re" for "are", remove symbols and points. Then, we implemented a spell checking function using the edit distance. 

The cells that transform the raw questions into cleaned questions can be found in notebook 1. Since the computations are very long, we load the already cleaned questions in this notebook.

In [4]:
#Convert all elements of the documents into strings 
q1_train_raw =  cast_list_as_strings(list(train_df["question1"]))
q2_train_raw =  cast_list_as_strings(list(train_df["question2"]))
q1_val_raw  =  cast_list_as_strings(list(val_df["question1"]))
q2_val_raw  =  cast_list_as_strings(list(val_df["question2"]))
q1_test_raw  =  cast_list_as_strings(list(test_df["question1"]))
q2_test_raw  =  cast_list_as_strings(list(test_df["question2"]))


all_questions_raw = q1_train_raw + q2_train_raw

In [5]:
with open('cleaned_data/q1_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_train = [x.strip() for x in content] 

In [6]:
with open('cleaned_data/q2_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_train = [x.strip() for x in content] 

In [7]:
with open('cleaned_data/q1_val_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_val = [x.strip() for x in content] 

In [8]:
with open('cleaned_data/q2_val_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_val = [x.strip() for x in content] 

In [9]:
with open('cleaned_data/q1_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_test = [x.strip() for x in content] 

In [10]:
with open('cleaned_data/q2_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_test = [x.strip() for x in content] 

In [11]:
all_questions= q1_train + q2_train

In [12]:
with open('cleaned_data/train_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
train_labels = [int(x.strip()) for x in content] 

with open('cleaned_data/val_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
val_labels = [int(x.strip()) for x in content] 

with open('cleaned_data/test_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
test_labels = [int(x.strip()) for x in content] 

# FIRST MODEL: NAIVE CLASSIFIER WITH SPELL CHECKING
For the first model, we just wanted to see what difference did the spellchecking do. So, did we improve the results? Did we improve the results as expected? If so, what mistakes is our model doing now?

We will do this checking for both the CountVectorizer and the TfIdfVectorizer. The classifier will be a sklearn Logistic Regression. This is a simple classifier that will serve as a benchmark.

## First model - with CountVectorizer

In [13]:
#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3), max_df = 0.99, min_df = 5)

**Replace fit by loading the model here**

In [14]:
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

CountVectorizer(doc_cleaner_pattern="('\\w+)", document_cleaner_func=None,
                dtype=<class 'numpy.float32'>, max_df=0.99, min_df=5,
                min_word_counts=1, ngram_range=(1, 3),
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                token_cleaner_func=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer_func=None)

In [15]:
countvect_sk = sklearn.feature_extraction.text.CountVectorizer(stop_words = set(stopwords.words('english')),ngram_range=(1, 3),
                                                              max_df = 0.99, min_df = 5)
countvect_sk.fit(all_questions)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.99, max_features=None, min_df=5,
                ngram_range=(1, 3), preprocessor=None,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

We will the result obtained with our implementation of the CountVectorizer with the result obtained using the sklearn version of the CountVectorizer. One of the objectives of this deliverable was to implement and understend how the vectorizers worked, so we set as an objective to obtain the same as the sklearn vectorizers.

In [16]:
X_tr_q1q2 = get_features_from_list(q1_train, q2_train,CountVectorizer)

X_tr_q1q2_sk = get_features_from_list(q1_train, q2_train,countvect_sk)

X_val_q1q2  = get_features_from_list(q1_val, q2_val, CountVectorizer)

X_val_q1q2_sk  = get_features_from_list(q1_val, q2_val, countvect_sk)

X_te_q1q2  = get_features_from_list(q1_test, q2_test, CountVectorizer)

X_te_q1q2_sk  = get_features_from_list(q1_test, q2_test, countvect_sk)

Result of the logistic regression using our implementation of CountVectorizer. We load the logistic regression model from a json file.

In [17]:
logistic = load_logistic('models/logistic.json')

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2)))

#test roc auc metrics
print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(X_val_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2)))

Result on train:  0.8873714184551172
Result on validation:  0.756423585069429
Result on test:  0.7533599677258676


Result using sklearn implementation of CountVectorizer

In [18]:
logistic_sk = load_logistic('models/logistic_sk.json')

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic_sk.predict(X_tr_q1q2_sk)))

#test roc auc metrics
print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic_sk.predict(X_val_q1q2_sk)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic_sk.predict(X_te_q1q2_sk)))

Result on train:  0.8874152854760406
Result on validation:  0.7564062563763118
Result on test:  0.7533834828117459


### Qualitative information about the mistakes

We wanted to identify the mistakes that the classifier was doing in this case. We saw that the classifier was making mistages mainly for the following reasons:
- The questions are the same, but the sentences have lots of different words.
- The questions are the same, but one sentence is way larger than the other.
- The questions are asking about the same thing but for different years, hence they must be classified as different.
- One of the questions is a subset of the other. This mistake is the harder to solve because sometimes it is even debatable of the questions should be the same or not.

In [19]:
print("Accuracy on training: ", np.sum(train_labels==logistic.predict(X_tr_q1q2))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels ==logistic.predict(X_val_q1q2))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==logistic.predict(X_te_q1q2))/len(test_labels))

Accuracy on training:  0.907399137030726
Accuracy on validation:  0.788647044274054
Accuracy on test:  0.7851665883400529


In [20]:
mistake_indices, predictions = get_mistakes(logistic, X_tr_q1q2, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)

Original q1:  Why do men like women's feet?  Treated q1:  why do men like woman foot
Original q2:  Why do men like womens feet?  Treated q2:  why do men like woman foot
true class: 1
prediction: 0


## First model - with TfIdfVectorizer

**Replace fit by loading the model**

In [21]:
tfidf_vectorizer = tf(stop_words = set(stopwords.words('english')), ngram_range=(1,3), max_df = 0.4, min_df = 5)
tfidf_vectorizer.fit(all_questions)

tfidf_sk = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=False, smooth_idf=False, sublinear_tf=False,
                                                          stop_words = set(stopwords.words('english')),
                                                          ngram_range=(1,3), max_df = 0.99, min_df = 5)
tfidf_sk.fit(all_questions) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.99, max_features=None,
                min_df=5, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=False,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                use_idf=False, vocabulary=None)

Again, remember that additionally, we want to compare our result with that given by the implementation of sklearn of the TfIdfVectorizer.

In [22]:
X_tr_q1q2_tfidf = get_features_from_list(q1_train, q2_train,tfidf_vectorizer)
X_tr_q1q2_sk_tfidf = get_features_from_list(q1_train, q2_train, tfidf_sk)
X_val_q1q2_tfidf  = get_features_from_list(q1_val, q2_val, tfidf_vectorizer)
X_val_q1q2_sk_tfidf  = get_features_from_list(q1_val, q2_val, tfidf_sk)
X_te_q1q2_tfidf  = get_features_from_list(q1_test, q2_test, tfidf_vectorizer)
X_te_q1q2_sk_tfidf  = get_features_from_list(q1_test, q2_test, tfidf_sk)

print("With our TfIdf Vectorizer:", X_tr_q1q2_tfidf.shape, train_df.shape)
print("With sklearn TfIdf Vectorizer:", X_tr_q1q2_sk_tfidf.shape, train_df.shape)

With our TfIdf Vectorizer: (291088, 285364) (291088, 7)
With sklearn TfIdf Vectorizer: (291088, 285364) (291088, 7)


Result using our implementation of TfIdf Vectorizer

In [23]:
logistic = load_logistic('models/logistic_tfidf.json')

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(X_tr_q1q2_tfidf)))

print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(X_val_q1q2_tfidf)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(X_te_q1q2_tfidf)))

Result on train:  0.8303725534934316
Result on validation:  0.7532320054109183
Result on test:  0.7486764630961492


Result using sklearn implementation of TfIdf Vectorizer. Note that the result is different because the formula that sklearn uses is different from ours.

In [24]:
logisitc_sk = load_logistic('models/logistic_tfidf_sk.json')

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic_sk.predict(X_tr_q1q2_sk_tfidf)))

print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic_sk.predict(X_val_q1q2_sk_tfidf)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic_sk.predict(X_te_q1q2_sk_tfidf)))

Result on train:  0.8598897999557084
Result on validation:  0.7480028045154151
Result on test:  0.7448268895587145


### Qualitative information about the mistakes

So, in this case, the mistakes are practically the same, hence:
- The questions are the same, but the sentences have lots of different words.
- The questions are the same, but one sentence is way larger than the other.
- The questions are asking about the same thing but for different years, hence they must be classified as different.
- One of the questions is a subset of the other. This mistake is the harder to solve because sometimes it is even debatable of the questions should be the same or not.

In [25]:
print("Accuracy on training: ", np.sum(train_labels ==logistic.predict(X_tr_q1q2_tfidf))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels ==logistic.predict(X_val_q1q2_tfidf))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==logistic.predict(X_te_q1q2_tfidf))/len(test_labels))

Accuracy on training:  0.8615642005166823
Accuracy on validation:  0.7928827603264902
Accuracy on test:  0.7880729179549334


In [26]:
mistake_indices, predictions = get_mistakes(logistic, X_tr_q1q2_tfidf, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)

Original q1:  Are Persians considered Caucasian?  Treated q1:  be Persian consider Caucasian
Original q2:  Are Persians White?  Treated q2:  be Persian white
true class: 1
prediction: 0


# SECOND MODEL: NAIVE CLASSIFIER WITH EXTRA FEATURES

Given the mistakes encountered in the previous model, we tried to code some extra features to tackle with those problems.

### Code to obtain the extra features.

Here we give a list of extra features that we could add to the feature vector.

1. Lenght of the question

2. Is there a [math] tag? 

3. Is there a number in the question?

4. Is it the same number in both questions? 

5. % of intersection words?


In [27]:
def get_qlength(questions):
    qlen = []
    for quest in questions:
        clean_doc_pattern = re.compile( r"('\w)|([^a-zA-Z0-9.])") #Find words containing alphanumeric or points
        q = re.sub('\'s', '', quest) #Remove 's
        q = re.sub('\'t', ' not', q) #Change 't for not'
        q = re.sub('\'re', ' are', q) #Change 're for are'
        q = re.sub('[?%!@#$\'\""]', '', q)#Remove symbols
        q = re.sub('\.\s', ' ', q)#Remove points with a space afterwards
        clean_q = clean_doc_pattern.sub(" ", q)
        qlen.append(len(re.findall(r"(?u)\b[\w.,]+\b",q)))
        
    return np.array(qlen).reshape(-1,1)

def is_math(questions):
    math=[]
    for quest in questions:
        if '[math]' in quest:
            math.append(1)
        else:
            math.append(0)
    return np.array(math).reshape(-1,1)
    
def is_number(word):
    try :  
        w = float(word) 
        if(np.isnan(w)):
            return 0
        if(np.isinf(w)):
            return 0
        res = 1
    except : 
        res = 0
    return res    

def has_numbers(questions):
    num=np.zeros((len(questions)))
    which_num = np.zeros((len(questions)))
    i=0
    for quest in questions:
        for w in re.findall(r"(?u)\b[\w.,]+\b",quest):
            is_num = is_number(w)
            if is_num==1:
                num[i]=1
                which_num[i]=float(w)
                if(np.isnan(which_num[i])):
                    print(which_num[i])
                    print(float(w))
                break
        i+=1
    return num.reshape(-1,1), which_num.reshape(-1,1)


def is_different_number(which_num1, which_num2):
    dif = which_num1 - which_num2
    dif[dif>0]=1
    return np.array(dif).reshape(-1,1)

In [28]:
def q1_q2_intersect(row, q1, q2, q_dict):
    set1 = set(q_dict[q1[row]])
    set2 = set(q_dict[q2[row]])
    return(len(set1.intersection(set2))/len(set1.union(set2)))


def intersection(q1_train, q2_train,q1_val,q2_val, q1_test, q2_test):
    q1 = q1_train + q1_val +  q1_test
    q2 = q2_train + q1_val + q2_test
    q_dict = defaultdict(set)
    for i in range(len(q1)):
            q_dict[q1[i]].add(q2[i])
            q_dict[q2[i]].add(q1[i])

    intersect_train = []
    intersect_test = []
    intersect_val = []
    for row in range(len(q1_train)):
        intersect_train.append(q1_q2_intersect(row, q1_train, q2_train, q_dict))
    
    for row in range(len(q1_val)):
        intersect_val.append(q1_q2_intersect(row, q1_val, q2_val, q_dict))
        
    for row in range(len(q1_test)):
        intersect_test.append(q1_q2_intersect(row, q1_test, q2_test, q_dict))
    
    intersect_train = np.array(intersect_train).reshape(-1,1)
    intersect_val = np.array(intersect_val).reshape(-1,1)
    intersect_test = np.array(intersect_test).reshape(-1,1)
    return intersect_train, intersect_val, intersect_test 

In [29]:
intersect_train, intersect_val, intersect_test = intersection(q1_train, q2_train, q1_val, q2_val, q1_test, q2_test)

In [30]:
num1_train, which_num1_train=  has_numbers(q1_train_raw)
num2_train, which_num2_train =  has_numbers(q2_train_raw)
dif_number_train = is_different_number(which_num1_train,which_num2_train)

num1_val, which_num1_val=  has_numbers(q1_val_raw)
num2_val, which_num2_val =  has_numbers(q2_val_raw)
dif_number_val = is_different_number(which_num1_val,which_num2_val)

num1_test, which_num1_test=  has_numbers(q1_test_raw)
num2_test, which_num2_test =  has_numbers(q2_test_raw)
dif_number_test = is_different_number(which_num1_test,which_num2_test)

In [31]:
math1_train = is_math(q1_train_raw)
math2_train = is_math(q2_train_raw)

math1_val = is_math(q1_val_raw)
math2_val = is_math(q2_val_raw)

math1_test = is_math(q1_test_raw)
math2_test = is_math(q2_test_raw)

In [32]:
len1_train = get_qlength(q1_train_raw)
len2_train = get_qlength(q2_train_raw)

len1_val = get_qlength(q1_val_raw)
len2_val = get_qlength(q2_val_raw)

len1_test = get_qlength(q1_test_raw)
len2_test = get_qlength(q2_test_raw)

## Second model - with CountVectorizer

In this case, since we have already check that our CountVectorizer yields the same result as the sklearn one, we will only use ours. We already loaded the count vectorizer, so we use it again.

In [33]:
print('initial shape', X_tr_q1q2.shape)

new_X_tr_q1q2 = sparse.hstack((X_tr_q1q2,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))

new_X_te_q1q2 = sparse.hstack((X_te_q1q2,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))

new_X_val_q1q2 = sparse.hstack((X_val_q1q2,intersect_val, num1_val, num2_val,
                               dif_number_val, math1_val,math2_val,len1_val, len2_val))

print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


We obtain the following result. We see that the AUC has dropped a lot. We think that this may be due to the imbalance of the values of the different features, i.e., we are not normalizing the values of any of the features. We thought that it would be necessary to change the model, then.

In [34]:
logistic = load_logistic('models/logistic_extra_features.json')

#train roc auc metrics
print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(new_X_tr_q1q2)))

#val roc auc metrics
print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(new_X_val_q1q2)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(new_X_te_q1q2)))

Result on train:  0.4849733415832702
Result on validation:  0.48544005699141674
Result on test:  0.48445389374241093


## Second model - with TfIdfVectorizer

We already loaded the TfIdfVectorizer.

Again we will only run the code for our TfIdfVectorizer.

In [35]:
print('initial shape', X_tr_q1q2.shape)

new_X_tr_q1q2_tfidf = sparse.hstack((X_tr_q1q2_tfidf,intersect_train, num1_train, num2_train,
                               dif_number_train,math1_train,math2_train,len1_train, len2_train))
new_X_val_q1q2_tfidf = sparse.hstack((X_val_q1q2_tfidf,intersect_val, num1_val, num2_val,
                               dif_number_val, math1_val,math2_val,len1_val, len2_val))
new_X_te_q1q2_tfidf = sparse.hstack((X_te_q1q2_tfidf,intersect_test, num1_test, num2_test,
                               dif_number_test, math1_test,math2_test,len1_test, len2_test))

print('final shape', new_X_tr_q1q2.shape)

initial shape (291088, 285364)
final shape (291088, 285372)


A very similar thing happens with the tfidfVectorizer.

In [36]:
logistic = load_logistic('models/logistic_extra_features_tfidf.json')

print("Result on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = logistic.predict(new_X_tr_q1q2_tfidf)))

print("Result on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = logistic.predict(new_X_val_q1q2_tfidf)))

#test roc auc metrics
print("Result on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = logistic.predict(new_X_te_q1q2_tfidf)))

Result on train:  0.4849733415832702
Result on validation:  0.48544005699141674
Result on test:  0.48445389374241093


# THIRD MODEL: XGBOOST

Given all the previous results, a thing was clear: we needed to change the classifier. So our take was: combine everything we have done until now (text with the spell checking and the extra features) but with a more sophisticated model. We chose the XGBoost.

## Third model - with CountVectorizer

In [37]:
xgb_count = xgb.Booster()

xgb_count.load_model('models/xgb_count')

In [38]:
d_train = xgb.DMatrix(new_X_tr_q1q2, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2, label=test_labels)
d_val = xgb.DMatrix(new_X_val_q1q2, label=val_labels)

pred_test = xgb_count.predict(d_test)
pred_train = xgb_count.predict(d_train)
pred_val = xgb_count.predict(d_val)

print("Accuracy on training: ", np.sum(train_labels==pred_train.round(0).astype(int))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels==pred_val.round(0).astype(int))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==pred_test.round(0).astype(int))/len(test_labels))

#train roc auc metrics
print("AUC on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = pred_train))

#test roc auc metrics
print("AUC on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = pred_val))

#test roc auc metrics
print("AUC on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = pred_test))

mistake_indices, predictions = get_mistakes(xgb_count, d_train, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)      




Accuracy on training:  0.8415084098279558
Accuracy on validation:  0.8431548355181796
Accuracy on test:  0.8305547997724406
AUC on train:  0.9159446541394345
AUC on validation:  0.8989376768402263
AUC on test:  0.8939774284673486
Original q1:  What are 10 things you would tell your 19 year old self?  Treated q1:  what be 10 thing you would tell your 19 year old self
Original q2:  What are some of the most important things you would tell your 19 year old self?  Treated q2:  what be some of the most important thing you would tell your 19 year old self
true class: 1
prediction: 0


## Third model - with TfIdfVectorizer

In [39]:
xgb_tfidf = xgb.Booster()

xgb_tfidf.load_model('models/xgb_tfidf')

In [40]:
d_train = xgb.DMatrix(new_X_tr_q1q2_tfidf, label=train_labels)
d_test = xgb.DMatrix(new_X_te_q1q2_tfidf, label=test_labels)
d_val = xgb.DMatrix(new_X_val_q1q2_tfidf, label=val_labels)

pred_test = xgb_tfidf.predict(d_test)
pred_val = xgb_tfidf.predict(d_val)
pred_train = xgb_tfidf.predict(d_train)

print("Accuracy on training: ", np.sum(train_labels==pred_train.round(0).astype(int))/len(train_labels))
print("Accuracy on validation: ", np.sum(val_labels==pred_val.round(0).astype(int))/len(val_labels))
print("Accuracy on test: ", np.sum(test_labels ==pred_test.round(0).astype(int))/len(test_labels))

#train roc auc metrics
print("AUC on train: ", sklearn.metrics.roc_auc_score(y_true = train_labels, y_score = pred_train))

#test roc auc metrics
print("AUC on validation: ", sklearn.metrics.roc_auc_score(y_true = val_labels, y_score = pred_val))

#test roc auc metrics
print("AUC on test: ", sklearn.metrics.roc_auc_score(y_true = test_labels, y_score = pred_test))


mistake_indices, predictions = get_mistakes(xgb_tfidf, d_train, train_labels)
print_mistake_k(4, all_questions, mistake_indices, predictions)      


Accuracy on training:  0.8403609904908481
Accuracy on validation:  0.8428765767994064
Accuracy on test:  0.8288975735239555
AUC on train:  0.9168234628206468
AUC on validation:  0.8942311188490448
AUC on test:  0.8889539442559593
Original q1:  What are 10 things you would tell your 19 year old self?  Treated q1:  what be 10 thing you would tell your 19 year old self
Original q2:  What are some of the most important things you would tell your 19 year old self?  Treated q2:  what be some of the most important thing you would tell your 19 year old self
true class: 1
prediction: 0


# FOURTH MODEL: DIFFERENT APPROACH WITH DEEP LEARNING

Our main objective for this deliverable was to work with a more classic approach for natural language processing, mainly to implement and understand the CountVectorizer and TfIdfVectorizer. Additionally, we tried to work on the mistakes and limitations that this approach had, hence having to do a bit of feature engeenireing to tackle those problems.

However, nowadays deep learning is used practically to solve anything, so, how well could it work to solve this problem? In this section we explore a completely different approach using deep learning.

# LAST BUT NOT LEAST: LET'S DO PIPELINES

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

To pass data from CountVectorizer to model automatically, we need to transform the data first. This is just a hacky way to do it. 

Note that the correct way is to modify the CountVectorized that we implemented so that the output of the  transform is already the desired matrix (i.e. doing the hstack inside). We tried to implement the most general CountVectorized that we could, so we have to resort to this way. It is in no way slower in terms of speed. It is just ugly in the sense that is doing something hack-ish. 

In [77]:
class MiddleTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        nexamples, nvars = X.shape
        split = (int)(nexamples/2)
        XX1 = X[:split,:]
        XX2 = X[split:,:]
        XX = sparse.hstack([XX1, XX2], format='csr')
        return XX

In [78]:
model_cv = Pipeline([
    ('countVectorizer', cv(#token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v"),
                     stop_words = set(stopwords.words('english')),
                     ngram_range=(1,3))),
    ('middleTransformer', MiddleTransformer()),
    ('model', sklearn.linear_model.LogisticRegression(solver="liblinear"))])

In [79]:
model_cv.fit(all_questions, train_labels)

Pipeline(memory=None,
     steps=[('countVectorizer', CountVectorizer(doc_cleaner_pattern="('\\w+)", document_cleaner_func=None,
        dtype=<class 'numpy.float32'>, max_df=1.0, min_df=1,
        min_word_counts=1, ngram_range=(1, 3),
        stop_words={'them', 'an', 'herself', 'but', 'd', 'some', 'had', 'yours', "isn't", ...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [80]:
all_questions_test = q1_test+q2_test
all_questions_val = q1_val+q2_val

In [None]:
sklearn.metrics.roc_auc_score(val_labels, model_cv.predict(all_questions_val))

In [81]:
sklearn.metrics.roc_auc_score(test_labels, model_cv.predict(all_questions_test))

0.7778368046218165

### Make Pipeline Great Again: GridSearch Hyperparameters

To select samples to cross validate, we need the samples to be in pair with the labels. Right now we dont have that, so we are going to (once again) hack our way through it.

Again note that the correct way to do it is to modify the CountVectorizer so that it automatically does this process inside the fit and transform functions. But following our filosophy of having a general CountVectorizer, this is a good way to solve the problem.

In [44]:
ndocs = (int)(len(all_questions)/2)
all_questions_tuples = [(q1_train[i], q2_train[i]) for i in range(ndocs)]

class CrossValidationTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    
    def transform(self, X):
        XX = [x[0] for x in X]
        for x in X:
            XX.append(x[1])
        return XX

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

mnb = MultinomialNB()

model_cv = Pipeline([
    ('crossValidationTransformer', CrossValidationTransformer()),
    ('countVectorizer', tf(#token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v"),
                     stop_words = set(stopwords.words('english')))),
    ('middleTransformer', MiddleTransformer()),
    ('model', MultinomialNB())
])

params = {'countVectorizer__ngram_range':[(1,3)], 
         'countVectorizer__min_df':[1,5,10,15,],
         'countVectorizer__max_df':[.4,.3],
         }

gs = GridSearchCV(model_cv, params, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2)

In [53]:
results = gs.fit(all_questions_tuples, train_labels)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 12.1min finished


In [56]:
results.best_params_

{'countVectorizer__max_df': 0.4,
 'countVectorizer__min_df': 1,
 'countVectorizer__ngram_range': (1, 3)}

In [51]:
results.best_score_

0.8436795654397367