In [None]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import collections
from scipy import sparse
import nltk
from collections import defaultdict
import re
from CountVectorizer_BagOfWords import CountVectorizer as cv
from TfIdfVectorizer import TfIdfVectorizer as tf

# DATA

In [None]:
# Read the Quora training data from the CSV file next to the notebook
available_data = pd.read_csv("quora_train_data.csv")
# Preview only the first rows instead of rendering the whole frame
available_data.head()

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed makes the split repeatable
train_df, test_df = sklearn.model_selection.train_test_split(
    available_data,
    test_size=0.1,
    random_state=123,
)

# PREPROCESS DATA

# AUX FUNCTIONS

These functions are used in order to create the feature matrices to feed the models. They are valid for both vectorizers. 

In [None]:
def cast_list_as_strings(mylist):
    """
    Convert every element of a list to its string representation.

    Parameters
    ----------
    mylist : list
        Elements of any type; each one is passed through ``str``.

    Returns
    -------
    list of str
        A new list with the same length and order as ``mylist``.

    Raises
    ------
    AssertionError
        If ``mylist`` is not a ``list``.
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"

    return [str(element) for element in mylist]

from scipy.sparse import hstack

def get_features_from_df(df, count_vectorizer):
    """
    Build one sparse feature matrix for the question pairs in ``df``.

    The ``question1`` and ``question2`` columns are cast to lists of strings,
    each list is passed through ``count_vectorizer.transform`` and the two
    results are stacked horizontally, so every row holds the features of
    question1 followed by the features of question2.

    Parameters
    ----------
    df : object indexable by column name (a DataFrame in this notebook)
        Must provide the ``question1`` and ``question2`` columns.
    count_vectorizer : object exposing ``transform``
        Vectorizer applied to both question columns (it is fitted
        beforehand in this notebook).

    Returns
    -------
    The result of ``scipy.sparse.hstack`` on the two transformed matrices.
    """
    question_columns = ("question1", "question2")

    # Cast both columns first, then vectorize them, keeping question1 before question2
    casted_questions = [cast_list_as_strings(list(df[name])) for name in question_columns]
    feature_blocks = [count_vectorizer.transform(questions) for questions in casted_questions]

    return hstack(feature_blocks)

The next cell converts each question column into a list of strings, i.e. a list of documents. This is the input structure we usually want, at least for the vectorizers.

In [None]:
# Turn each question column into a list of strings (one document per row)
q1_train, q2_train = (
    cast_list_as_strings(list(train_df[column])) for column in ("question1", "question2")
)
q1_test, q2_test = (
    cast_list_as_strings(list(test_df[column])) for column in ("question1", "question2")
)

# Corpus made of the training questions only (question1 followed by question2)
all_questions = [*q1_train, *q2_train]

# COUNTVECTORIZER

Let us now describe the custom *CountVectorizer* class (imported above from `CountVectorizer_BagOfWords`). It has the following attributes and methods.

ATTRIBUTES
* **stop_words**: is a list (or set) of stop words. That is, these words will be ignored. By default, it is an empty list.
* **ngram_range**: is the tuple giving the range of n-gram sizes to consider. By default it takes value (1,1).

METHODS
* **document_cleaner**: defines the function used to clean the document. By default, the cleaning consists of lower-casing the words, removing all characters after an apostrophe and removing all non-alphanumeric characters.
* **tokenizer**: defines the function used to convert the string into a list of tokens. By default, the tokens are the runs of alphanumeric characters separated by whitespace. Notice that a token may consist of a single character.
* **token_cleaner**: defines the function to be used so as to perform the cleaning of the tokens (stemming, lemmatizing, doing nothing). By default, it returns the tokens as they are.
* **fit**: it creates the vocabulary using the three above functions. It defines the attributes *self.vocabulary*, *self.n_features* and *self.word_to_ind* of the object.
* **transform**: converts a document into a feature vector using the above methods.
* **fit_transform**: performs the *fit* and *transform* methods in a single call.

#### Testing the doc_cleaner_pattern and token_pattern

In [None]:
# Matches an apostrophe followed by word characters, or any single character outside a-z, A-Z, 0-9
doc_cleaner_pattern = r"('\w+)|([^a-zA-Z0-9])"

In [None]:
# Compile the cleaning regex so it can be reused in the cells below
clean_doc_pattern = re.compile(doc_cleaner_pattern)

In [None]:
# Replace every match of the cleaning pattern with a space...
doc = clean_doc_pattern.sub(" ", "I'll was', born'is Here in 1995 a)?")
# ...then lower-case what is left
doc = doc.lower()
doc

In [None]:
# A token is a run of one or more word characters between word boundaries; (?u) is the Unicode flag
token_pattern=r"(?u)\b\w+\b"

In [None]:
# Compile the token regex so it can be applied to the cleaned document
token_pattern_aux = re.compile(token_pattern)

In [None]:
# Extract every token of the cleaned sample document as a list of strings
auxi = token_pattern_aux.findall(doc)
auxi

# IMPLEMENT COUNTVECTORIZER

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords 

#How to call such stemmers and lemmatizer in the CountVectorizer object:
#PorterStemmer(): token_cleaner_func = PorterStemmer().stem
#LancasterStemmer(): token_cleaner_func = LancasterStemmer().stem
#SnowballStemmer(language='english'): token_cleaner_func = SnowballStemmer(language='english').stem
#WordNetLemmatizer(): token_cleaner_func = lambda doc: WordNetLemmatizer().lemmatize(doc,pos="v")



In [None]:
# Initialize the CountVectorizer and define its parameters.
# One WordNetLemmatizer is created up front and reused for every token; the
# previous lambda built a new lemmatizer object on each call.
verb_lemmatizer = WordNetLemmatizer()

# NOTE: the instance is stored under the name `CountVectorizer` because the
# following cells refer to it by that name.
CountVectorizer = cv(
    # lemmatize every token as a verb
    token_cleaner_func=lambda doc: verb_lemmatizer.lemmatize(doc, pos="v"),
    # ignore the English stop words shipped with NLTK
    stop_words=set(stopwords.words('english')),
    # n-gram sizes from 1 to 3
    ngram_range=(1, 3),
)

In [None]:
# Fit the CountVectorizer on the training questions (q1_train + q2_train)
CountVectorizer.fit(all_questions)

### Train the model

In [None]:
# Feature matrix for the training pairs: question1 features next to question2 features
X_tr_q1q2 = get_features_from_df(df=train_df, count_vectorizer=CountVectorizer)
# Show both shapes side by side to compare the number of rows
(X_tr_q1q2.shape, train_df.shape)

In [None]:
# Logistic regression on the stacked count features; the target is the is_duplicate column
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(
    X=X_tr_q1q2,
    y=train_df["is_duplicate"].values,
)

In [None]:
# Train ROC AUC.
# ROC AUC ranks the examples by a continuous score, so it is computed from the
# predicted probability of the positive class rather than from the hard labels
# returned by predict, which reduce the curve to a single threshold.
# Column 1 of predict_proba belongs to logistic.classes_[1]
# (assumes is_duplicate is coded 0/1, so that this is the duplicate class).
sklearn.metrics.roc_auc_score(
    y_true=train_df["is_duplicate"].values,
    y_score=logistic.predict_proba(X_tr_q1q2)[:, 1],
)

### Test the model

In [None]:
# Feature matrix for the held-out pairs, built with the already fitted CountVectorizer
X_te_q1q2 = get_features_from_df(df=test_df, count_vectorizer=CountVectorizer)
# Show both shapes side by side to compare the number of rows
(test_df.shape, X_te_q1q2.shape)

In [None]:
# Test ROC AUC.
# ROC AUC ranks the examples by a continuous score, so it is computed from the
# predicted probability of the positive class rather than from the hard labels
# returned by predict, which reduce the curve to a single threshold.
# Column 1 of predict_proba belongs to logistic.classes_[1]
# (assumes is_duplicate is coded 0/1, so that this is the duplicate class).
sklearn.metrics.roc_auc_score(
    y_true=test_df["is_duplicate"].values,
    y_score=logistic.predict_proba(X_te_q1q2)[:, 1],
)

# IMPLEMENT TFIDF VECTORIZER

In [None]:
# Build the TfIdfVectorizer with its default arguments and fit it on the training questions
tfidf_vectorizer = tf()
tfidf_vectorizer.fit(all_questions)

In [None]:
# Training features from the TF-IDF vectorizer.
# NOTE: this rebinds X_tr_q1q2, replacing the count-based matrix built earlier.
X_tr_q1q2 = get_features_from_df(df=train_df, count_vectorizer=tfidf_vectorizer)
# Show both shapes side by side to compare the number of rows
(X_tr_q1q2.shape, train_df.shape)

In [None]:
# Logistic regression on the TF-IDF features (rebinds `logistic`)
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, train_df["is_duplicate"].values)

# Train ROC AUC.
# ROC AUC ranks the examples by a continuous score, so it is computed from the
# predicted probability of the positive class rather than from the hard labels
# returned by predict, which reduce the curve to a single threshold.
# Column 1 of predict_proba belongs to logistic.classes_[1]
# (assumes is_duplicate is coded 0/1, so that this is the duplicate class).
sklearn.metrics.roc_auc_score(
    y_true=train_df["is_duplicate"].values,
    y_score=logistic.predict_proba(X_tr_q1q2)[:, 1],
)

In [None]:
# Feature matrix for the held-out pairs, built with the fitted TF-IDF vectorizer
X_te_q1q2  = get_features_from_df(test_df, tfidf_vectorizer)
# NOTE: this tuple is not the last expression of the cell, so with the default
# notebook settings it is evaluated but not displayed.
test_df.shape, X_te_q1q2.shape

# Test ROC AUC.
# ROC AUC ranks the examples by a continuous score, so it is computed from the
# predicted probability of the positive class rather than from the hard labels
# returned by predict, which reduce the curve to a single threshold.
# Column 1 of predict_proba belongs to logistic.classes_[1]
# (assumes is_duplicate is coded 0/1, so that this is the duplicate class).
sklearn.metrics.roc_auc_score(
    y_true=test_df["is_duplicate"].values,
    y_score=logistic.predict_proba(X_te_q1q2)[:, 1],
)