In [3]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import collections
from scipy import sparse
import nltk
from collections import defaultdict
import re
import CountVectorizer_BagOfWords as cv

# DATA

In [4]:
#read the data
available_data = pd.read_csv("quora_train_data.csv")
available_data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1
...,...,...,...,...,...,...
323427,192476,292119,292120,Is it okay to use a laptop while it is chargin...,Is it OK to use your phone while charging?,0
323428,17730,33641,33642,How can dogs understand human language?,Can dogs understand the human language?,0
323429,28030,52012,52013,What's your favourite lotion?,What's your favourite skin lotion?,1
323430,277869,397054,120852,How does one become a hedge fund manager?,What should I do to become a hedge fund manager?,1


In [5]:
#Split data into train and test
train_df, test_df = sklearn.model_selection.train_test_split(available_data, test_size=0.1, random_state=123)

# PREPROCESS DATA

# COUNTVECTORIZER

Let us now create the *CountVectorizer* class. It will have the following attributes and methods.

ATTRIBUTES
* **stop_words**: is a list (or set) of stop words. That is, these words will be ignored. By default, it is an empty list.
* **ngram_range**: is the tuple giving the range of n-gram sizes to consider. By default it takes value (1,1).

METHODS
* **document_cleaner**: it defines the function to be used so as to perform the cleaning of the document. By default, such cleaning consists in lower casing the words, removing all characters after an apostrophe and removing all non alphanumeric characters.
* **tokenizer**: defines the function to be used so as to convert the string into a list of tokens. By default, the tokens will be the sets of alphanumeric characters separated by white spaces. Notice that a token may be composed of a single character.
* **token_cleaner**: defines the function to be used so as to perform the cleaning of the tokens (stemming, lemmatizing, doing nothing). By default, it returns the tokens as they are.
* **fit**: it creates the vocabulary using the three above functions. It defines the attributes *self.vocabulary*, *self.n_features* and *self.word_to_ind* of the object.
* **transform**: converts a document into a feature vector using the above methods.
* **fit_transform**: performs the *fit* and *transform* methods in a single call.

In [4]:
class CountVectorizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    
    def __init__(self,
                 min_word_counts=1,
                 doc_cleaner_pattern=r"('\w+)|([^a-zA-Z0-9])", #pattern for cleaning document
                 token_pattern=r"(?u)\b\w+\b", #pattern defining what a token is
                 dtype=np.float32,
                 document_cleaner_func=None,
                 tokenizer_func=None,
                 token_cleaner_func=None,
                 stop_words=[],
                 ngram_range=(1, 1)):
        
        self._retype = type(re.compile('hello, world'))

        self.min_word_counts     = min_word_counts
        self.doc_cleaner_pattern = doc_cleaner_pattern
        self.token_pattern       = token_pattern #definition of what a token is
        self.dtype               = dtype
        
        self.document_cleaner_func      = document_cleaner_func #function to perform the document cleaning
        self.tokenizer_func        = tokenizer_func #function to split the document into tokens
        self.token_cleaner_func = token_cleaner_func #function to perform the cleaning of the tokens
        
        self.vocabulary = set() #set containing all words in our vocabulary
        self.word_to_ind = collections.OrderedDict() #dictionary of the vocabulary (key=word, value=integer)   
        self.stop_words = stop_words #set of stop words
        self.ngram_range = ngram_range
    
    
    def document_cleaner(self, lower=True):        
        
        if self.document_cleaner_func: #inputted one
            return self.document_cleaner_func
        
        else: #default 
            clean_doc_pattern = re.compile(self.doc_cleaner_pattern)
            if lower:
                 return lambda doc: clean_doc_pattern.sub(" ", doc).lower()
            else:
                 return lambda doc: clean_doc_pattern.sub(" ", doc)

    def tokenizer(self):
                
        if self.tokenizer_func: #inputted one
            return self.tokenizer_func
        
        else: #default
            token_pattern_aux = re.compile(self.token_pattern)
            return lambda doc: token_pattern_aux.findall(doc)

    
    def token_cleaner(self):
                
        if self.token_cleaner_func: #inputted one
            return self.token_cleaner_func
        else: #default
            return lambda word: word #identity function
        
    
   
    def fit(self, X):

        assert self.vocabulary == set(), "self.vocabulary is not empty it has {} words".format(len(self.vocabulary))
        assert isinstance(X,list), "X is expected to be a list of documents"
        
           
        word_to_ind = collections.OrderedDict() #vocab dictionary
        doc_cleaner      = self.document_cleaner()
        doc_tokenizer    = self.tokenizer()
        word_transformer = self.token_cleaner()
        
        
        for x in X: #X is the whole set of documents           
            
            #Create the dictionary of the words
            x = doc_cleaner(x) #preprocess the string by cleaning it
            tokens = doc_tokenizer(x) #creates the tokens
            tokens_aux=[]
            for w in tokens:
                tokens_aux.append(word_transformer(w)) #stemming, lemmatizing or nothing
            tokens = tokens_aux
            
            tokens = [tok for tok in tokens if tok not in set(self.stop_words)] #remove stopping words
            
            #ngrams
            for n in np.arange(self.ngram_range[0], self.ngram_range[1]+1): 
                for token in tokens:
                    inx = tokens.index(token)
                    if inx+n < len(tokens):
                        ngram = tokens[inx:inx+n]
                        ngram = ' '.join(ngram)
                    
                        if ngram not in word_to_ind.keys(): #if token is not yet in the vocab dictionary, add it
                            word_to_ind[ngram] = len(word_to_ind)
            

        self.word_to_ind =  word_to_ind     
        self.n_features = len(word_to_ind)        
        self.vocabulary = set(word_to_ind.keys())
        
        return self
    
    
    
    def transform(self, X):
        
        doc_cleaner      = self.document_cleaner()
        doc_tokenizer    = self.tokenizer()
        word_transformer = self.token_cleaner()
        
        data = []
        row = []
        col = []
        
        for m, doc in enumerate(X):            
            doc = doc_cleaner(doc)
            tokens = doc_tokenizer(doc)
            tokens_aux=[]
            for w in tokens:
                tokens_aux.append(word_transformer(w)) #stemming, lemmatizing or nothing
            tokens = tokens_aux
            
            tokens = [tok for tok in tokens if tok not in set(self.stop_words)] #remove stopping words
            
            #ngrams
            for n in np.arange(self.ngram_range[0], self.ngram_range[1]+1): 
                for token in tokens:
                    inx = tokens.index(token)
                    if inx+n < len(tokens):
                        ngram = tokens[inx:inx+n]
                        ngram = ' '.join(ngram)
                    
                        if ngram in self.word_to_ind.keys(): #if the word is not in the vocab, ignore it
                            ngram_index = self.word_to_ind[ngram]
                            row.append(m) #we are dealing with the m-th document
                            col.append(ngram_index)
                            data.append(1)
                
                
        encoded_X = scipy.sparse.csr_matrix((data, (row,col)), shape=(m+1,len(self.word_to_ind)))    
                
        return encoded_X
    
    
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        encoded_X = self.transform(X)
        return encoded_X
    


#### Testing the doc_cleaner_pattern and token_pattern

In [5]:
doc_cleaner_pattern = r"('\w+)|([^a-zA-Z0-9])"

In [6]:
clean_doc_pattern = re.compile(doc_cleaner_pattern)

In [7]:
doc = clean_doc_pattern.sub(" ", "I'll was', born'is Here in 1995 a)?").lower()
doc

'i  was   born  here in 1995 a  '

In [8]:
token_pattern=r"(?u)\b\w+\b"

In [9]:
token_pattern_aux = re.compile(token_pattern)

In [10]:
auxi = token_pattern_aux.findall(doc)
auxi

['i', 'was', 'born', 'here', 'in', '1995', 'a']

# IMPLEMENT COUNTVECTORIZER

In [6]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    mylist_aux = []
    
    for i in mylist:
        mylist_aux.append(str(i))
        
    return mylist_aux


#Convert all elements of the documents into strings 
q1_train =  cast_list_as_strings(list(train_df["question1"]))
q2_train =  cast_list_as_strings(list(train_df["question2"]))
q1_test  =  cast_list_as_strings(list(test_df["question1"]))
q2_test  =  cast_list_as_strings(list(test_df["question2"]))

all_questions = q1_train + q2_train

In [7]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 

#How to call such stemmers and lemmatizer in the CountVectorizer object:
#PorterStemmer(): token_cleaner_func = PorterStemmer().stem
#LancasterStemmer(): token_cleaner_func = LancasterStemmer().stem
#SnowballStemmer(language='english'): token_cleaner_func = SnowballStemmer(language='english').stem
#WordNetLemmatizer(): token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v")



In [8]:
#inicialize the CountVectorizer and define its parameters
CountVectorizer = cv(token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v"),
                                 stop_words = set(stopwords.words('english')),
                                 ngram_range=(1,3))

TypeError: 'module' object is not callable

In [14]:
#fit the CountVectorizer
CountVectorizer.fit(all_questions)

CountVectorizer(doc_cleaner_pattern="('\\w+)|([^a-zA-Z0-9])",
                document_cleaner_func=None, dtype=<class 'numpy.float32'>,
                min_word_counts=1, ngram_range=(1, 3),
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                token_cleaner_func=<function <lambda> at 0x0000029D1D657E18>,
                token_pattern='(?u)\\b\\w+\\b', tokenizer_func=None)

### Train the model

In [10]:
from scipy.sparse import hstack

def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    
    #list of questions where each element of the question is of type string
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))    
    
    q1_mat = count_vectorizer.transform(q1_casted)
    q2_mat = count_vectorizer.transform(q2_casted)
    X_q1q2 = hstack([q1_mat,q2_mat])
            
    return X_q1q2

In [11]:
X_tr_q1q2 = get_features_from_df(train_df,CountVectorizer)
X_tr_q1q2.shape, train_df.shape

((291088, 3524936), (291088, 6))

In [12]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_df["is_duplicate"].values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
#train roc auc metrics
sklearn.metrics.roc_auc_score(y_true = train_df["is_duplicate"].values, y_score = logistic.predict(X_tr_q1q2))

0.9541938511772036

### Test the model

In [14]:
X_te_q1q2  = get_features_from_df(test_df, CountVectorizer)
test_df.shape, X_te_q1q2.shape

((32344, 6), (32344, 3524936))

In [15]:
#test roc auc metrics
sklearn.metrics.roc_auc_score(y_true = test_df["is_duplicate"].values, y_score = logistic.predict(X_te_q1q2))

0.763361391823561