In [None]:
import os
import sys
import time
import re
import pickle
import logging
import string
import warnings
import math

import pandas as pd
import numpy as np
import pylab
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC

import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec

from bs4 import BeautifulSoup
from sklearn.metrics import log_loss

## Functions 

In [None]:
import scipy.spatial.distance

from fuzzywuzzy import fuzz

def make_basic_features(data):
    """
    Add simple length and overlap features for each question pair.
    Values are passed through str() first, so missing entries are measured
    by their string representation.
    :param data: data frame with question1 and question2 columns.
    :return: The same data frame with eight new columns (len_q1, len_q2,
             diff_len, len_char_q1, len_char_q2, len_word_q1, len_word_q2,
             common_words)
    """
    def _n_chars(text):
        return len(str(text))

    def _n_distinct_chars(text):
        # Spaces are dropped before counting the distinct characters
        return len(set(str(text).replace(' ', '')))

    def _n_words(text):
        return len(str(text).split())

    def _n_shared_words(row):
        # Case-insensitive overlap of the whitespace-separated words
        first = set(str(row['question1']).lower().split())
        second = set(str(row['question2']).lower().split())
        return len(first & second)

    q1 = data.question1
    q2 = data.question2

    data["len_q1"] = q1.apply(_n_chars)
    data["len_q2"] = q2.apply(_n_chars)
    data["diff_len"] = data.len_q1 - data.len_q2
    data["len_char_q1"] = q1.apply(_n_distinct_chars)
    data["len_char_q2"] = q2.apply(_n_distinct_chars)
    data["len_word_q1"] = q1.apply(_n_words)
    data["len_word_q2"] = q2.apply(_n_words)
    data['common_words'] = data.apply(_n_shared_words, axis=1)
    return data

def make_fuzz_features(data):
    """
    Add one column per fuzzywuzzy scorer, each comparing question1 with question2.
    Both questions are passed through str() before scoring.
    :param data: data frame with question1 and question2 columns.
    :return: The same data frame with seven new fuzz_* columns
    """
    # (column name, scorer) pairs, listed in the order the columns are appended.
    # NOTE(review): the fourth column name contains a space ("token _set");
    # it is kept as-is because renaming it would change the output schema.
    scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token _set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in scorers:
        data[column] = data.apply(
            lambda row, score=scorer: score(str(row['question1']), str(row['question2'])),
            axis=1,
        )
    return data

def add_dotprod_eucli(df1, df2):
    """
    Receives two 2-D arrays of the same shape, averages them element-wise and
    appends two new columns with the dot product and the Euclidean distance
    of each pair of rows.
    :param df1: Numpy array of shape (rows, columns)
    :param df2: Numpy array of shape (rows, columns)
    :return: New numpy array of shape (rows, columns + 2); the second to last
             column is the row-wise dot product and the last column is the
             row-wise Euclidean distance
    """
    df1 = np.asarray(df1)
    df2 = np.asarray(df2)

    # Element-wise mean of the two inputs (not their sum)
    df3 = np.add(df1, df2) / 2

    # Row-wise reductions done in one vectorised call each instead of a
    # Python loop over every row
    dotprod = np.einsum('ij,ij->i', df1, df2)
    euclid = np.linalg.norm(df1 - df2, axis=1)

    return np.column_stack((df3, dotprod, euclid))

def makeFeatureVec(words, model,index2word_set, num_features):
    """
    Average the vectors of every known word in a phrase.
    :param words: String. Phrase whose words are separated by single spaces.
    :param model: Mapping-like object; model[word] gives the vector of a word.
    :param index2word_set: Set with the words that can be looked up in model.
    :param num_features: Length of the word vectors.
    :return: The mean vector. When no word is known the sum is divided by
             zero, so the result holds NaN values
    """
    # Running sum of the vectors found so far
    total = np.zeros((num_features,), dtype="float32")
    matched = 0.
    for token in words.split(" "):
        if token not in index2word_set:
            continue
        matched += 1.
        total = np.add(total, model[token])
    # Turn the sum into a mean
    return np.divide(total, matched)

In [None]:
# Editing questions with NLTK package

def sub_na(data):
    """
    Replaces missing values in question1 and question2 with the placeholder "Z".
    The columns are overwritten on the data frame that was passed in.
    :param data: data frame with question1 and question2 columns.
    :return: The same data frame without missing questions
    """
    # The previous version compared the selection with "Z" (==) instead of
    # assigning it, so no value was ever replaced.
    for column in ("question1", "question2"):
        data[column] = data[column].fillna("Z")

    return data
    
def remove_stopwords(phrase,list_stopwords):
    """
    Drops from a phrase every word that appears in a list of stopwords.
    Words are split on single spaces and compared exactly (case-sensitive).
    :param phrase: String. A phrase.
    :param list_stopwords: List. The stopwords to drop
    :return: String. The remaining words joined by single spaces
    """
    kept = [token for token in phrase.split(" ") if token not in list_stopwords]
    return ' '.join(kept)
    
def remove_punctuation(phrase):
    """
    Strips every character of string.punctuation from a phrase.
    :param phrase: String. A phrase. A float NaN is accepted as a missing value.
    :return: The phrase without punctuation, or an empty string for NaN
    """
    # A missing question arrives as a float NaN rather than a string
    if type(phrase) is float and math.isnan(phrase):
        return ("")

    # Translation table whose only job is to delete the punctuation characters
    return phrase.translate(str.maketrans('', '', string.punctuation))

def lemm_wordnet(phrase):
    """
    Lemmatizes every whitespace-separated word of a phrase with WordNetLemmatizer.
    :param phrase: String. A phrase. A float is treated as a missing value.
    :return: String. The lemmas joined by single spaces, or an empty string
             when the input is a float
    """
    lemmatizer = WordNetLemmatizer()

    # Missing values (NA) come in as floats, which cannot be split
    if type(phrase) is float:
        return ""

    return ' '.join(lemmatizer.lemmatize(token) for token in phrase.split())
    
def remove_duplicate(phrase):
    """
    Keeps only the first occurrence of every word in a phrase.
    :param phrase: String. A phrase. A float is treated as a missing value.
    :return: String. The unique words in their original order, joined by
             single spaces; an empty string when the input is a float
    """
    if type(phrase) is float:
        return ''

    # dict keys keep insertion order, so the first occurrence of a word wins
    return ' '.join(dict.fromkeys(phrase.split()))
    
    
def all_lower_case(phrase):
    """
    Lower-cases a phrase.
    :param phrase: String. A phrase. Floats (missing values) are accepted.
    :return: The phrase in lower case; a float is returned unchanged
    """
    return phrase if type(phrase) is float else phrase.lower()
    
def stem_snowball(phrase):
    """
    Stems every word of a phrase with the English Snowball stemmer.
    Words are split on single spaces; no stopword filtering is done here.
    :param phrase: String. A phrase.
    :return: String. The stemmed words joined by single spaces
    """
    snowball = SnowballStemmer("english")

    return ' '.join(snowball.stem(token) for token in phrase.split(" "))

# Quick manual check of stem_snowball on a sample question. The value is not
# displayed because this is not the last expression of the cell.
stem_snowball("What is the step by step guide to invest in share market in india?")

#This function will return a Bag of words of our two questions using TF method
def vectorizer_tf(data, features = 5000):
    """
    Receives the data frame. Fits a CountVectorizer (unigrams and bigrams) on all
    the text of question1 and question2 and vectorizes both columns with it.
    :param data: data frame with question1 and question2 columns.
    :param features: maximum number of features kept by the vectorizer.
    :return: The array built by add_dotprod_eucli from the two dense count
             matrices (one row per question pair)
    """
    vectorizer_count = CountVectorizer(ngram_range=(1, 2), max_features = features)

    # Series.append was removed in pandas 2.0; pd.concat stacks both columns
    # so that the vocabulary is learnt from every question.
    merge = pd.concat([data.question1, data.question2])

    vectorizer_count.fit(merge)

    question1 = vectorizer_count.transform(data.question1).toarray()
    question2 = vectorizer_count.transform(data.question2).toarray()

    return add_dotprod_eucli(question1,question2)

def vectorizer_tf_batch(data, batch ,features = 5000):
    """
    Receives the data frame. Fits a CountVectorizer (unigrams and bigrams) on all
    the text of question1 and question2, then builds the features batch by batch
    so that only one batch of dense vectors is handed to add_dotprod_eucli at a time.
    :param data: data frame with question1 and question2 columns.
    :param batch: number of batches the rows are split into if you are having
                  problems with memory. Values of 1 or less are processed as a
                  single batch.
    :param features: maximum number of features kept by the vectorizer.
    :return: An array with one row per question pair. The last two columns (dot
             product and euclidean distance) are divided by their maximum over
             all rows, unless that maximum is 0
    """
    n_rows = data.shape[0]
    # Always work with at least one batch instead of printing a hint and returning None
    n_batches = max(int(batch), 1)

    vectorizer_count = CountVectorizer(ngram_range=(1, 2), max_features = features)

    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
    merge = pd.concat([data.question1, data.question2])

    vectorizer_count.fit(merge)

    # Both results stay sparse; only one batch at a time is made dense below
    question1 = vectorizer_count.transform(data.question1)
    question2 = vectorizer_count.transform(data.question2)

    del merge, vectorizer_count

    # n_batches + 1 boundaries give exactly n_batches consecutive slices
    # (the previous version built one boundary too few and ran batch - 1 slices)
    bounds = [round(i * n_rows / n_batches) for i in range(n_batches)] + [n_rows]

    # Collect the batches and stack them once instead of growing an array
    # (seeded with a placeholder row) inside the loop
    chunks = []
    for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):

        print(i+1, 'Batch')

        question1_aux = question1[start:stop].toarray()
        question2_aux = question2[start:stop].toarray()

        chunks.append(add_dotprod_eucli(question1_aux, question2_aux))

    question = np.vstack(chunks)

    #Normalize two last colunms (dotprod and euclidean)
    for column in (-1, -2):
        peak = question[:, column].max()
        # A zero maximum would turn the whole column into NaN/inf
        if peak != 0:
            question[:, column] = question[:, column] / peak

    return question

#This function will return a Bag of words of our two questions using TF-idf method

def vectorizer_tf_idf(data, features = 5000):
    """
    Receives the data frame. Fits a TfidfVectorizer (unigrams and bigrams,
    sublinear tf) on all the text of question1 and question2 and vectorizes
    both columns with it.
    :param data: data frame with question1 and question2 columns.
    :param features: maximum number of features kept by the vectorizer.
    :return: The array built by add_dotprod_eucli from the two dense tf-idf
             matrices (one row per question pair)
    """
    vectorizer_tf_idf = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)

    # Series.append was removed in pandas 2.0; pd.concat stacks both columns
    # so that the vocabulary and idf weights are learnt from every question.
    merge = pd.concat([data.question1, data.question2])

    vectorizer_tf_idf.fit(merge)

    question1 = vectorizer_tf_idf.transform(data.question1).toarray()
    question2 = vectorizer_tf_idf.transform(data.question2).toarray()

    return add_dotprod_eucli(question1, question2)

def vectorizer_tf_idf_batch(data, batch ,features = 5000):
    """
    Receives the data frame. Fits a TfidfVectorizer (unigrams and bigrams,
    sublinear tf) on all the text of question1 and question2, then builds the
    features batch by batch so that only one batch of dense vectors is handed
    to add_dotprod_eucli at a time.
    :param data: data frame with question1 and question2 columns.
    :param batch: number of batches the rows are split into if you are having
                  problems with memory. Values of 1 or less are processed as a
                  single batch.
    :param features: maximum number of features kept by the vectorizer.
    :return: An array with one row per question pair. The last two columns (dot
             product and euclidean distance) are divided by their maximum over
             all rows, unless that maximum is 0
    """
    n_rows = data.shape[0]
    # Always work with at least one batch instead of printing a hint and returning None
    n_batches = max(int(batch), 1)

    vectorizer_tf_idf = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)

    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
    merge = pd.concat([data.question1, data.question2])

    vectorizer_tf_idf.fit(merge)

    # Both results stay sparse; only one batch at a time is made dense below
    question1 = vectorizer_tf_idf.transform(data.question1)
    question2 = vectorizer_tf_idf.transform(data.question2)

    del merge, vectorizer_tf_idf

    # n_batches + 1 boundaries give exactly n_batches consecutive slices
    # (the previous version built one boundary too few and ran batch - 1 slices)
    bounds = [round(i * n_rows / n_batches) for i in range(n_batches)] + [n_rows]

    # Collect the batches and stack them once instead of growing an array
    # (seeded with a placeholder row) inside the loop
    chunks = []
    for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):

        print(i+1, 'Batch')

        question1_aux = question1[start:stop].toarray()
        question2_aux = question2[start:stop].toarray()

        chunks.append(add_dotprod_eucli(question1_aux, question2_aux))

    question = np.vstack(chunks)

    #Normalize two last colunms (dotprod and euclidean)
    for column in (-1, -2):
        peak = question[:, column].max()
        # A zero maximum would turn the whole column into NaN/inf
        if peak != 0:
            question[:, column] = question[:, column] / peak

    return question

In [None]:
#cleaning tool is used so you can easily choose which functions you want to use to clean the text
def cleaning_tool_test(data, sub_na_test = True, lower_case = True, rm_duplicate = False, stopwords = False,
                  punctuation = True, lemm = False, stem = False, list_of_stopwords = None):
    """
    Function to process all data by calling the cleaning functions defined above, according to what was chosen.
    The question1 and question2 columns are overwritten on the data frame that was passed in.
    :param data: data frame with question1 and question2 columns.
    :param sub_na_test: If True runs sub_na before and after the other steps
    :param lower_case: If True transform for lower case
    :param rm_duplicate: If True remove all duplicate words in questions
    :param stopwords: If True removes stopwords (needs list_of_stopwords)
    :param punctuation: If True removes punctuation
    :param lemm: If True returns the phrase lemmatized
    :param stem: If True returns the phrase stemmed
    :param list_of_stopwords: List of stopwords to be used
    :return: The data frame with question1 and question2 processed according to parameters
    :raises TypeError: if stopwords is True and list_of_stopwords is None
    """
    # Fail before touching the data instead of half-way through the pipeline
    if stopwords and list_of_stopwords is None:
        raise TypeError("list_of_stopwords must be given when stopwords is True")

    if sub_na_test:
        data = sub_na(data)

    # (flag, transform) pairs in the order they are applied. The lemm and stem
    # steps are driven by their flags; the previous version compared the
    # functions lemm_wordnet / stem_snowball with True, so they never ran.
    steps = (
        (rm_duplicate, remove_duplicate),
        (lower_case, all_lower_case),
        (stopwords, lambda x: remove_stopwords(x, list_of_stopwords)),
        (punctuation, remove_punctuation),
        (lemm, lemm_wordnet),
        (stem, stem_snowball),
    )

    for enabled, transform in steps:
        if enabled:
            for column in ("question1", "question2"):
                data[column] = data[column].apply(transform)

    #We used it two times if some function create a new NA.
    if sub_na_test:
        data = sub_na(data)

    return data

## Reading and Cleaning Dataset

In [None]:
# Load test.csv from the Kaggle data folder (path is relative to the notebook's working directory).
quora_test = pd.read_csv('../Dados/Kaggle/test.csv')

In [None]:
# Keep only the first 5000 rows.
# NOTE(review): presumably a subsample for quick runs — remove this cell to score the whole file.
quora_test = quora_test[0:5000]

In [None]:
# Clean both question columns, asking for duplicate-word removal and lemmatization
# on top of the defaults (NA substitution, lower case, punctuation removal).
quora_test = cleaning_tool_test(quora_test, lemm = True, rm_duplicate = True)

In [None]:
# Second cleaning pass with the default steps only. The target used to be
# written as "quora_train test", which is a syntax error; quora_test is the
# frame every later cell reads.
quora_test = cleaning_tool_test(quora_test, lemm = False, rm_duplicate = False, punctuation = True)

In [None]:
# Append the 8 basic and 7 fuzzy-matching feature columns to the frame.
quora_test = make_basic_features(quora_test)
quora_test = make_fuzz_features(quora_test)
print(quora_test.shape)

# Divide each of the last 15 columns (the features added above) by its own maximum.
# NOTE(review): diff_len can be negative and a column maximum can be 0 or
# negative, in which case this does not give a 0-1 range — confirm it matches
# what was done for training.
for i in range(1,16):
    quora_test.iloc[:,-i] = quora_test.iloc[:,-i]/max(quora_test.iloc[:,-i])

# NOTE(review): -15:-1 keeps 14 columns and leaves out the last one
# (fuzz_token_sort_ratio); -15: would keep all 15 — confirm against the number
# of features the model was trained with.
fuzzy_features = quora_test.iloc[:,-15:-1]

## Getting Model name

In [None]:
# List the entries of the Kaggle data folder; the bare name on the last line displays the list.
mylist = os.listdir("../Dados/Kaggle")
mylist

In [None]:
# Pick the model file by its position in the directory listing.
# NOTE(review): os.listdir returns entries in arbitrary order, so index -4 can
# point at a different file on another machine or after files are added —
# consider naming the file explicitly.
model_name = mylist[-4]

## Function to run the predictions in batches

In [None]:
def making_sub_test(data, model_name, batch, method, features):
    """
    Loads a pickled model and predicts, batch by batch, the probability that
    each question pair of the data frame is a duplicate.
    :param data: data frame with question1 and question2 columns.
    :param model_name: file name of the pickled model inside ../Dados/Kaggle/.
    :param batch: number of batches the rows are split into. Values of 1 or
                  less are processed as a single batch.
    :param method: "vectorizer_tf", "vectorizer_tf_idf" or "word2vec".
    :param features: maximum number of features for the vectorizers, or the
                     length of the word vectors for "word2vec".
    :return: data frame with the columns test_id (0 .. rows - 1) and
             is_duplicate (second column of model.predict_proba)
    :raises ValueError: if method is not one of the three supported names
    """
    if method not in ("vectorizer_tf", "vectorizer_tf_idf", "word2vec"):
        # The previous version fell through and returned None here
        raise ValueError("Unknown method: " + repr(method))

    #Getting the model
    # NOTE(review): pickle.load can run arbitrary code, only load files you trust
    path = "../Dados/Kaggle/"+model_name
    with open(path,'rb') as fileObject:
        model = pickle.load(fileObject)

    n_rows = data.shape[0]
    n_batches = max(int(batch), 1)

    # n_batches + 1 boundaries give exactly n_batches consecutive slices
    # (the previous version built one boundary too few and ran batch - 1 slices)
    bounds = [round(i * n_rows / n_batches) for i in range(n_batches)] + [n_rows]

    if method == "word2vec":

        model_wikimedia = Word2Vec.load("../Dados/Word2vec/model_wikimedia_w2v")

        # NOTE(review): wv.index2word and model[word] (used by makeFeatureVec)
        # look like the gensim 3.x API - confirm the installed gensim version
        index2word_set = set(model_wikimedia.wv.index2word)

        question1 = []
        question2 = []

        for index in range(0, len(data)):
            question1.append(makeFeatureVec(data["question1"].iloc[index],model_wikimedia,index2word_set,features))
            question2.append(makeFeatureVec(data["question2"].iloc[index],model_wikimedia,index2word_set,features))
            if (index % 50000 == 0):
                print (index)

        # fuzzy_features is a notebook-level variable.
        # NOTE(review): assumed to hold one row per row of data, in the same order
        extra_features = np.asarray(fuzzy_features)

    else:

        if method == "vectorizer_tf":
            vectorizer = CountVectorizer(ngram_range=(1, 2), max_features = features)
        else:
            vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)

        # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
        merge = pd.concat([data.question1, data.question2])

        vectorizer.fit(merge)

        # Both results stay sparse; only one batch at a time is made dense below
        question1 = vectorizer.transform(data.question1)
        question2 = vectorizer.transform(data.question2)

        del merge, vectorizer

        extra_features = None

    is_duplicate = []

    for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):

        # More batches than rows leaves some slices empty; there is nothing to predict for them
        if start == stop:
            continue

        print(i+1, 'Batch')

        if method == "word2vec":
            question1_aux = np.array(question1[start:stop])
            question2_aux = np.array(question2[start:stop])

            #NAs to 0 (questions without any known word average to NaN)
            question1_aux[np.isnan(question1_aux)] = 0
            question2_aux[np.isnan(question2_aux)] = 0
        else:
            question1_aux = question1[start:stop].toarray()
            question2_aux = question2[start:stop].toarray()

        question = add_dotprod_eucli(question1_aux, question2_aux)

        #Normalize two last colunms (dotprod and euclidean)
        # We need to adjust this part: the maximum is taken per batch, so the
        # scaling depends on how the rows are split
        for column in (-1, -2):
            peak = question[:, column].max()
            # A zero maximum would turn the whole column into NaN/inf
            if peak != 0:
                question[:, column] = question[:, column] / peak

        if extra_features is not None:
            # Only the rows of this batch; stacking the full table fails
            # whenever the batch is smaller than the data
            question = np.column_stack((question, extra_features[start:stop]))

        is_duplicate.extend(row[1] for row in model.predict_proba(question))
        del question

    return pd.DataFrame({'test_id' : range(0, len(is_duplicate)),
                        'is_duplicate': is_duplicate})
    

In [None]:
# Predict with the count-vectorizer features (up to 5000 features), asking for 120 batches,
# and preview the first rows of the result.
submission = making_sub_test(quora_test, model_name, 120, 'vectorizer_tf', 5000)
submission.head()

In [None]:
# Name the column order explicitly. Rotating the existing order depended on
# how pandas orders the columns of a DataFrame built from a dict, and moved
# the columns again every time the cell was re-run.
cols = ['test_id', 'is_duplicate']
submission = submission[cols]
submission.shape

In [None]:
# Multiply the predicted values by 5.
# NOTE(review): re-running this cell multiplies again, and nothing caps the
# result at 1 — confirm this rescaling is intended.
submission.is_duplicate = submission.is_duplicate * 5

In [None]:
# Multiply the predicted values by 0.3.
# NOTE(review): after the x5 cell above the net factor is 1.5, so values can
# exceed 1 — confirm this rescaling is intended.
submission.is_duplicate = submission.is_duplicate * 0.3

In [None]:
# Preview the first rows after the rescaling.
submission.head()

In [None]:
# Write the submission file without the index column.
submission.to_csv('../Dados/Kaggle/submission.csv', index = False)