# Imports

In [48]:
import os
import sys
import time
import re
import pickle
import logging
import string
import warnings
import math

import pandas as pd
import numpy as np
import pylab
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC

import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec

from bs4 import BeautifulSoup
from sklearn.metrics import log_loss

# Get Data

In [8]:
#nltk.download("stopwords")
quora_train = pd.read_csv("/../Dados/Kaggle/train.csv")

In [None]:
#print (type(quora_train))
#print(quora_train.head())

# Functions to process data

In [49]:
import scipy.spatial.distance

from fuzzywuzzy import fuzz

def make_basic_features(data):
    data["len_q1"] = data.question1.apply(lambda x: len(str(x)))
    data["len_q2"] = data.question2.apply(lambda x: len(str(x)))
    data["diff_len"] = data.len_q1 - data.len_q2
    data["len_char_q1"] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
    data["len_char_q2"] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
    data["len_word_q1"] = data.question1.apply(lambda x: len(str(x).split()))
    data["len_word_q2"] = data.question2.apply(lambda x: len(str(x).split()))
    data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    return data

def make_fuzz_features(data):
    data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_partial_token _set_ratio'] = data.apply(lambda x : fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz. partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data ['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    return data

def add_dotprod_eucli(df1, df2):
    """
    Receives two numpy arrays sums them then adds two new colunms 
    with dotprod and euclidean distance of each row
    :param df1: Numpy array
    :param df2: Numpy array
    :return: New numpy array
    """
    df3 = np.add(df1,df2)/2
    dotprod = []
    euclid = []
    for index in range(0,len(df1)):
        dotprod.append(np.dot(df1[index],df2[index]))
        euclid.append(scipy.spatial.distance.euclidean(df1[index],df2[index]))
    dotprod = np.transpose(dotprod)
    euclid = np.transpose(euclid)
    df_final = np.column_stack((df3,dotprod,euclid))
    return (df_final)





In [50]:
# Editing questions with NLTK package

def remove_stopwords(phrase,list_stopwords):
    """
    Receives a phrase and removes all stopwords from a list
    :param phrase: String. A phrase.
    :param list_stopwords: List. A list of stopwords
    :return: The same phrase without stopwords
    """
    final_phrase = []
    words = phrase.split(" ")
    for word in words:
        if word not in list_stopwords:
            final_phrase.append((word))
    
    final_phrase = ' '.join(final_phrase)
    
    return final_phrase
    
def remove_punctuation(phrase):
    """
    Receives a phrase and removes all punctuation from it
    :param phrase: String. A phrase.
    :return: The same phrase without punctuation
    """
    #Check if NA
    if type(phrase) is float:
        if math.isnan(phrase):
            return ("")
    
    translator = str.maketrans('', '', string.punctuation)
    phrase = phrase.translate(translator) #removing punctuation
        
    return phrase

def lemm_wordnet(phrase):
    """
    Receives a phrase and removes lemmatizes it
    :param phrase: String. A phrase.
    :return: The same phrase in lemmas
    """
    lemm = WordNetLemmatizer()
    
    #NA is a float type, so this if is to avoid conflict
    if type(phrase) is not float:
        phrase = [lemm.lemmatize(i) for i in phrase.split()]
        phrase = ' '.join(phrase)
    else:
        return ""
    return phrase
    
def remove_duplicate(phrase):
    """
    Receives a phrase and removes all duplicate words
    :param phrase: String. A phrase.
    :return: The same phrase with just unique words
    """
    aux_phrase = []
        
    if type(phrase) is not float:
        
        for i in phrase.split():
            
            if i not in aux_phrase:
                aux_phrase.append(i)
    
    phrase = ' '.join(aux_phrase)
    
    return phrase
    
    
def all_lower_case(phrase):    
    """
    Receives a phrase and makes it lower case
    :param phrase: String. A phrase.
    :return: The same phrase in lower case
    """
    if type(phrase) is not float:
            phrase = phrase.lower()
    return phrase
    
def stem_snowball(phrase):
    """
    Receives a phrase and returns the same phrase stemmed, lowercase phrase without stopwords
    :param phrase: String. A phrase.
    :return: String. Stemmed, lowercase phrase without stopwords
    """
    stemmer = SnowballStemmer("english")
    
    #Stem words according to stemmer
    final_phrase = []
    words = phrase.split(" ")
    for word in words:
        final_phrase.append((stemmer.stem(word)))
    
    final_phrase = ' '.join(final_phrase)
    
    return final_phrase

stem_snowball("What is the step by step guide to invest in share market in india?")

#This function will return a Bag of words of our two questions using TF method
def vectorizer_tf(data, features = 5000):
    """
    Receives the data frame. Merges all words in question1 and question2 and vectorizes with tf algorithm.
    :param phrase: data frame.
    :param features: number of features for the vectorizes.
    :return: An array with #number of features
    """
    vectorizer_count = CountVectorizer(ngram_range=(1, 2), max_features = features)
    
    merge = data.question1.append([data.question2])
    
    vector_fitt = vectorizer_count.fit(merge)
    
    question1 = vector_fitt.transform(data.question1)
    question2 = vector_fitt.transform(data.question2)
    
    question1 = question1.toarray()
    question2 = question2.toarray()
    
    return add_dotprod_eucli(question1,question2)

def vectorizer_tf_batch(data, batch ,features = 5000):
    """
    Receives the data frame. Merges all words in question1 and question2 and vectorizes with tf algorithm.
    :param phrase: data frame.
    :param features: number of features for the vectorizes.
    :param batch: number of batchs if you are having problem with memory
    :return: An array with #number of features
    """
    if batch <= 1:
        return print("Use vectorizer_tf instead, this function only work for batch equal or more than two")
    
    vectorizer_count = CountVectorizer(ngram_range=(1, 2), max_features = features)
    
    merge = data.question1.append([data.question2])
    
    vector_fitt = vectorizer_count.fit(merge)
    
    question1 = vector_fitt.transform(data.question1)
    question2 = vector_fitt.transform(data.question2)
    
    del vector_fitt, merge, vectorizer_count
    
    aux_batch_mean = data.shape[0]/batch
    
    aux_batch_number = []
    
    for i in range(0, batch - 1):
        aux_batch_number.append(round(i*aux_batch_mean))
    
    aux_batch_number.append(data.shape[0])
    
    question = question1[0].toarray()
    
    question = np.append(question,[0,0])
    
    for i in range(0, batch - 1):    
        
        print(i+1, 'Batch')
        
        question1_aux = question1[aux_batch_number[i]:aux_batch_number[i+1]].toarray()
        question2_aux = question2[aux_batch_number[i]:aux_batch_number[i+1]].toarray()
    
        question = np.vstack((question, add_dotprod_eucli(question1_aux, question2_aux)))
        
    question = question[1:]
    
    #Normalize two last colunms (dotprod and euclidean)
    question[:,-1] = question[:,-1]/max(question[:,-1])
    question[:,-2] = question[:,-2]/max(question[:,-2])
        
    return question

#This function will return a Bag of words of our two questions using TF-idf method

def vectorizer_tf_idf(data, features = 5000):
    """
    Receives the data frame. Merges all words in question1 and question2 and vectorizes with tf-idf algorithm.
    :param data: data frame.
    :param features: number of features for the vectorizes.
    :return: An array with #number of features
    """
    vectorizer_tf_idf = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)
    
    merge = data.question1.append([data.question2])
    
    vector_tf_idf_fitt = vectorizer_tf_idf.fit(merge)
    
    question1 = vector_tf_idf_fitt.transform(data.question1)
    question2 = vector_tf_idf_fitt.transform(data.question2)
        
    question1 = question1.toarray()
    question2 = question2.toarray()
    
    return add_dotprod_eucli(question1, question2)

def vectorizer_tf_idf_batch(data, batch ,features = 5000):
    """
    Receives the data frame. Merges all words in question1 and question2 and vectorizes with tf-idf algorithm.
    :param data: data frame.
    :param features: number of features for the vectorizes.
    :param batch: number of batchs if you are having problem with memory
    :return: An array with #number of features
    """
    if batch <= 1:
        return print("Use vectorizer_tf_idf instead, this function only work for batch equal or more than two")
    
    vectorizer_tf_idf = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)
    
    merge = data.question1.append([data.question2])
    
    vector_tf_idf_fitt = vectorizer_tf_idf.fit(merge)
    
    question1 = vector_tf_idf_fitt.transform(data.question1)
    question2 = vector_tf_idf_fitt.transform(data.question2)
    
    del vector_tf_idf_fitt, merge, vectorizer_tf_idf
    
    aux_batch_mean = data.shape[0]/batch
    
    aux_batch_number = []
    
    for i in range(0, batch - 1):
        aux_batch_number.append(round(i*aux_batch_mean))
    
    aux_batch_number.append(data.shape[0])
    
    question = question1[0].toarray()
    
    question = np.append(question,[0,0])
    
    for i in range(0, batch - 1):    
        
        print(i+1, 'Batch')
        
        question1_aux = question1[aux_batch_number[i]:aux_batch_number[i+1]].toarray()
        question2_aux = question2[aux_batch_number[i]:aux_batch_number[i+1]].toarray()
    
        question = np.vstack((question, add_dotprod_eucli(question1_aux, question2_aux)))
        
    question = question[1:]
    
    #Normalize two last colunms (dotprod and euclidean)
    question[:,-1] = question[:,-1]/max(question[:,-1])
    question[:,-2] = question[:,-2]/max(question[:,-2])
        
    return question

In [51]:
#cleaning tool is used so you can easily choose which functions you want to use to clean te text
def cleaning_tool(data, drop_na = True, lower_case = True, rm_duplicate = False, stopwords = False, 
                  punctuation = True, lemm = False, stem = False, list_of_stopwords = None):
    """
    Function to process all data using calling functions from above, according to what was chosen.
    :param data: data frame.
    :param drop_na: If True drop all lines of data frame with NA
    :param lower_case: If True transform for lower case
    :param rm_duplicate: If True remove all duplicate words in questions
    :param stopwords: If True removes stopwords
    :param punctuation: If True removes punctuation
    :param lemm: If True returns the phrase lemmatized
    :param stem: If True returns the phrase stemmed
    :param list_of_stopwords: List of stopwords to be used
    :return: Question1 and Question2 processed according to parameters
    """
    if drop_na == True:
        data = data.dropna(0)
    
    if rm_duplicate == True:
        data["question1"] = data["question1"].apply(lambda x: remove_duplicate(x))
        data["question2"] = data["question2"].apply(lambda x: remove_duplicate(x))
    
    if lower_case == True:
        data["question1"] = data["question1"].apply(lambda x: all_lower_case(x))
        data["question2"] = data["question2"].apply(lambda x: all_lower_case(x))
    
    if stopwords == True:
        data["question1"] = data["question1"].apply(lambda x: remove_stopwords(x, list_of_stopwords))
        data["question2"] = data["question2"].apply(lambda x: remove_stopwords(x, list_of_stopwords))
       
    if punctuation == True:
        data["question1"] = data["question1"].apply(lambda x: remove_punctuation(x))
        data["question2"] = data["question2"].apply(lambda x: remove_punctuation(x))
        
    if lemm_wordnet == True:
        data["question1"] = data["question1"].apply(lambda x: lemm_wordnet(x))
        data["question2"] = data["question2"].apply(lambda x: lemm_wordnet(x))
        
    if stem_snowball == True:
        data["question1"] = data["question1"].apply(lambda x: stem_snowball(x))
        data["question2"] = data["question2"].apply(lambda x: stem_snowball(x))
    
    #We used it two times if some function create a new NA.
    if drop_na == True:
        data = data.dropna(0)    
        
    return data

# Cleaning Quora Train

In [None]:
quora_train = cleaning_tool(quora_train, lemm = True, rm_duplicate = True)

# Bag of Words

In [None]:
#Just for test
quora_train = quora_train.head(15000)

In [None]:
quora_train_tf_idf = vectorizer_tf_idf_batch(quora_train, 25, 5000)

In [None]:
quora_train_tf = vectorizer_tf_batch(quora_train, 25, 5000)

# Store edited databases w/ Pickle TF

In [None]:
quora_train_tf_part1 = quora_train_tf[0:200000]
quora_train_tf_part2 = quora_train_tf[200000:]

del quora_train_tf

fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_part1",'wb') 
pickle.dump(quora_train_tf_part1,fileObject, protocol = 4)  
fileObject.close()

del quora_train_tf_part1

fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_part2",'wb') 
pickle.dump(quora_train_tf_part2,fileObject, protocol = 4)  
fileObject.close()

del quora_train_tf_part2

# Store edited databases w/ Pickle TF IDF

In [None]:
quora_train_tf_idf_part1 = quora_train_tf_idf[0:200000]
quora_train_tf_idf_part2 = quora_train_tf_idf[200000:]

del quora_train_tf_idf

fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_idf_part1",'wb') 
pickle.dump(quora_train_tf_idf_part1,fileObject, protocol = 4)  
fileObject.close()

del quora_train_tf_idf_part1

fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_idf_part2",'wb') 
pickle.dump(quora_train_tf_idf_part2,fileObject, protocol = 4)  
fileObject.close()

del quora_train_tf_idf_part2

# Loading Stored databases w/ Pickle TF

In [None]:
fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_part1",'rb')  
quora_train_tf_part1 = pickle.load(fileObject)

fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_part2",'rb')  
quora_train_tf_part2 = pickle.load(fileObject)

quora_train_tf = np.vstack((quora_train_tf_part1, quora_train_tf_part2))

del quora_train_tf_part1, quora_train_tf_part2

In [None]:
quora_train = pd.read_csv("/../Dados/Kaggle/train.csv")
quora_train = cleaning_tool(quora_train, lemm = True, rm_duplicate = True)

# Loading Stored databases w/ Pickle TF IDF

In [None]:
fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_idf_part1",'rb')  
quora_train_tf_idf_part1 = pickle.load(fileObject)

fileObject = open("../Dados/Kaggle/quora_train_Lemm_Ponc_NA_lower_tf_idf_part2",'rb')  
quora_train_tf_idf_part2 = pickle.load(fileObject)

quora_train_tf_idf = np.vstack((quora_train_tf_idf_part1, quora_train_tf_idf_part2))

del quora_train_tf_idf_part1, quora_train_tf_idf_part2

In [None]:
quora_train_tf_idf.shape

In [None]:
quora_train = pd.read_csv("../Dados/Kaggle/train.csv")
quora_train = cleaning_tool(quora_train, lemm = True, rm_duplicate = True)

# Split data into training/testing

In [None]:
# for testing purposes
#quora_train_tf = quora_train_tf[:,:-1]
#quora_train_tf = quora_train_tf[:,:-1]
#quora_train_tf_idf = quora_train_tf_idf[:,:-1]
#quora_train_tf_idf = quora_train_tf_idf[:,:-1]

In [None]:
'''
The function train_test_split transform your databse in 4 parts, 
the first one is the new "train" database without the independent variable, 
the second one is the new test database without the independent variable,
the third one is just the independent variable from the first part and
the fourht one is just the independent variable from the second part.
'''

quora_train_features_tf, quora_test_features_tf, quora_train_y_tf, quora_test_y_tf = model_selection.train_test_split(
    quora_train_tf, quora_train['is_duplicate'], test_size = 0.3, random_state = 0)

In [None]:
quora_train_features_tf_idf, quora_test_features_tf_idf, quora_train_y_tf_idf, quora_test_y_tf_idf = model_selection.train_test_split(
    quora_train_tf_idf, quora_train['is_duplicate'], test_size = 0.3, random_state = 0)

In [None]:
del quora_train

# Learning Models

## Random Forest - TF

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

randomforest_tf = RandomForestClassifier(n_estimators=300, max_features='auto', bootstrap=False, 
                               oob_score=False, n_jobs=-1, random_state=0).fit(quora_train_features_tf, quora_train_y_tf)

fileObject = open("../Dados/Kaggle/randomforest_tf",'wb') 
pickle.dump(randomforest_tf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/randomforest_tf",'rb')  
randomforest_tf = pickle.load(fileObject)

randomforest_tf_score = randomforest_tf.score(quora_test_features_tf, quora_test_y_tf)
print(randomforest_tf_score)

predict_tf = randomforest_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf,predict_tf))

## Random Forest TF - IDF

In [None]:
from sklearn.ensemble import RandomForestClassifier

randomforest_tf_idf = RandomForestClassifier(n_estimators=300, max_features='auto', 
                                             bootstrap=False, oob_score=False, 
                                             n_jobs=-1, 
                                             random_state=0).fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

fileObject = open("../Dados/Kaggle/randomforest_tf_idf",'wb') 
pickle.dump(randomforest_tf_idf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/randomforest_tf_idf",'rb')  
randomforest_tf_idf = pickle.load(fileObject)

randomforest_tf_score_idf = randomforest_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(randomforest_tf_score)

predict_tf_idf = randomforest_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Logistic Regression TF

In [None]:
from sklearn.linear_model import LogisticRegression as LR

LR_tf = LR(penalty='l2',
               dual=False,
               tol=0.0001,
               C=1.0,
               fit_intercept=True,
               intercept_scaling=1,
               class_weight=None,
               random_state=0,
               solver='liblinear',
               max_iter=100,
               multi_class='ovr',
               verbose=0).fit(quora_train_features_tf, quora_train_y_tf)

fileObject = open("../Dados/Kaggle/LR_tf",'wb') 
pickle.dump(LR_tf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/LR_tf",'rb')  
LR_tf = pickle.load(fileObject)

eval_LR_tf_tts = LR_tf.score(quora_test_features_tf, quora_test_y_tf)
print(eval_LR_tf_tts)

predict_tf = LR_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf,predict_tf))

## Logistic Regression TF - IDF

In [None]:
from sklearn.linear_model import LogisticRegression as LR

LR_tf_idf = LR(penalty='l2',
                  dual=False,
                  tol=0.0001,
                  C=1.0,
                  fit_intercept=True,
                  intercept_scaling=1,
                  class_weight=None,
                  random_state=0,
                  solver='liblinear',
                  max_iter=100,
                  multi_class='ovr',
                  verbose=0).fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

fileObject = open("../Dados/Kaggle/LR_tf_idf",'wb') 
pickle.dump(LR_tf_idf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/LR_tf_idf",'rb')  
LR_tf_idf = pickle.load(fileObject)

eval_LR_tf_idf_tts = LR_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(eval_LR_tf_idf_tts)

predict_tf_idf = LR_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Gradient Boost TF

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

GBC_tf = GradientBoostingClassifier(loss='deviance',
                                        learning_rate=0.1,
                                        n_estimators=300,
                                        subsample=1.0,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_depth=3,
                                        init=None,
                                        random_state=0,
                                        max_features=None,
                                        verbose=0,
                                        max_leaf_nodes=None,
                                        warm_start=False,
                                        presort='auto').fit(quora_train_features_tf, quora_train_y_tf)

fileObject = open("../Dados/Kaggle/GBC_tf",'wb') 
pickle.dump(GBC_tf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/GBC_tf",'rb')  
GBC_tf = pickle.load(fileObject)

eval_GBC_tf_tts = GBC_tf.score(quora_test_features_tf, quora_test_y_tf)
print(eval_GBC_tf_tts)

predict_tf = GBC_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf,predict_tf))

## Gradient Boost TF - IDF

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

GBC_tf_idf = GradientBoostingClassifier(loss='deviance',
                                           learning_rate=0.1,
                                           n_estimators=100,
                                           subsample=1.0,
                                           min_samples_split=2,
                                           min_samples_leaf=1,
                                           min_weight_fraction_leaf=0.0,
                                           max_depth=3,
                                           init=None,
                                           random_state=0,
                                           max_features=None,
                                           verbose=0,
                                           max_leaf_nodes=None,
                                           warm_start=False,
                                           presort='auto').fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

fileObject = open("../Dados/Kaggle/GBC_tf_idf",'wb') 
pickle.dump(GBC_tf_idf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/GBC_tf_idf",'rb')  
GBC_tf_idf = pickle.load(fileObject)

eval_GBC_tf_idf_tts = GBC_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(eval_GBC_tf_idf_tts)

predict_tf_idf = GBC_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Voting TF

In [None]:
from sklearn.ensemble import VotingClassifier

vot_tf = VotingClassifier(estimators=[('rf', randomforest_tf),
                                          ('lr', LR_tf),
                                          ('gbc', GBC_tf)], voting='soft').fit(quora_train_features_tf,
                                                                                   quora_train_y_tf)

fileObject = open("../Dados/Kaggle/vot_tf",'wb') 
pickle.dump(vot_tf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/vot_tf",'rb')  
vot_tf = pickle.load(fileObject)

eval_vot_tf_tts = vot_tf.score(quora_test_features_tf, quora_test_y_tf)
print(eval_vot_tf_tts)

predict_tf = vot_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf, predict_tf))

## Voting TF - IDF

In [None]:
from sklearn.ensemble import VotingClassifier

vot_tf_idf = VotingClassifier(estimators=[('rf', randomforest_tf_idf),
                                             ('lr', LR_tf_idf),
                                             ('gbc', GBC_tf_idf)], voting='soft').fit(quora_train_features_tf_idf, 
                                                                                         quora_train_y_tf_idf)

fileObject = open("../Dados/Kaggle/vot_tf_idf",'wb') 
pickle.dump(vot_tf_idf,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/vot_tf_idf",'rb')  
vot_tf_idf = pickle.load(fileObject)

eval_vot_tf_idf_tts = vot_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(eval_vot_tf_idf_tts)

predict_tf_idf = vot_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Keras - Tensorflow

In [None]:
import tensorflow as tf
sess = tf.Session()
from keras import backend as K
K.set_session(sess)
from keras.layers import Dense
from keras.objectives import categorical_crossentropy

# Word2Vec

In [30]:
model_wikimedia = Word2Vec.load("/../Dados/Word2vec/model_wikimedia_w2v")

In [52]:
num_features = 400    # Word vector dimensionality

def makeFeatureVec(words, model,index2word_set, num_features):
    # Function to average all of the word vectors in a given paragraph
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    # Index2word is a list that contains the names of the words in the model's vocabulary. 
    #Convert it to a set, for speed
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words.split(" "):
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec



#makeFeatureVec("How can Internet speed be increased by hacking",model_googlenews,num_features)

In [53]:
quora_train = pd.read_csv("/../Dados/Kaggle/train.csv")

#quora_train = quora_train.head(500)
quora_train = cleaning_tool(quora_train, lemm = False, rm_duplicate = False, punctuation = True)

quora_train = make_basic_features(quora_train)
quora_train = make_fuzz_features(quora_train)
print(quora_train.shape)

for i in range(1,16):
    quora_train.iloc[:,-i] = quora_train.iloc[:,-i]/max(quora_train.iloc[:,-i])

fuzzy_features = quora_train.iloc[:,-15:-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(404288, 21)


In [54]:
import time

index2word_set = set(model_wikimedia.wv.index2word)

t0 = time.time()
question1 = []
question2 = []
for index in range(0,len(quora_train)):
    question1.append(makeFeatureVec(quora_train["question1"].iloc[index],model_wikimedia,index2word_set,num_features))
    question2.append(makeFeatureVec(quora_train["question2"].iloc[index],model_wikimedia,index2word_set,num_features))
    if (index % 50000 == 0):
        print (index)
        print(time.time()-t0)
question1 = np.array(question1)
question2 = np.array(question2)

#NAs to 0
where_are_NaNs_1 = np.isnan(question1)
where_are_NaNs_2 = np.isnan(question2)
question1[where_are_NaNs_1] = 0
question2[where_are_NaNs_2] = 0

features_wikimedia = (add_dotprod_eucli(question1,question2))

#Normalize two last colunms (dotprod and euclidean)
features_wikimedia[:,-1] = features_wikimedia[:,-1]/max(features_wikimedia[:,-1])
features_wikimedia[:,-2] = features_wikimedia[:,-2]/max(features_wikimedia[:,-2])

features_wikimedia = np.column_stack((features_wikimedia,fuzzy_features))

0
0.0012972354888916016




50000
14.090894222259521
100000
28.294624090194702
150000
42.56831169128418
200000
56.57384395599365
250000
70.69206666946411
300000
84.6734230518341
350000
98.82018685340881
400000
113.15909790992737


# Store Pickle

In [55]:
quora_train_wikimedia_part1 = features_wikimedia[0:200000]
quora_train_wikimedia_part2 = features_wikimedia[200000:]

del features_wikimedia

fileObject = open("../Dados/Kaggle/wikimedia_Ponc_NA_lower_wikimedia_part1",'wb') 
pickle.dump(quora_train_wikimedia_part1,fileObject, protocol = 4)  
fileObject.close()

del quora_train_wikimedia_part1

fileObject = open("../Dados/Kaggle/wikimedia_Ponc_NA_lower_wikimedia_part2",'wb') 
pickle.dump(quora_train_wikimedia_part2,fileObject, protocol = 4)  
fileObject.close()

del quora_train_wikimedia_part2

# Load Pickle

In [56]:
fileObject = open("../Dados/Kaggle/wikimedia_Ponc_NA_lower_wikimedia_part1",'rb')  
quora_train_wikimedia_part1 = pickle.load(fileObject)

fileObject = open("../Dados/Kaggle/wikimedia_Ponc_NA_lower_wikimedia_part2",'rb')  
quora_train_wikimedia_part2 = pickle.load(fileObject)

features_wikimedia = np.vstack((quora_train_wikimedia_part1, quora_train_wikimedia_part2))

del quora_train_wikimedia_part1, quora_train_wikimedia_part2

# Cross Validation

In [57]:
print(features_wikimedia.shape)

quora_train_features_wikimedia, quora_test_features_wikimedia, quora_train_y_wikimedia, quora_test_y_wikimedia = model_selection.train_test_split(features_wikimedia, quora_train['is_duplicate'], 
                                                       test_size = 0.3, random_state = 0)

print(quora_train_features_wikimedia.shape)
print(quora_test_features_wikimedia.shape)
print(quora_train_y_wikimedia.shape)
print(quora_test_y_wikimedia.shape)

(404288, 416)
(283001, 416)
(121287, 416)
(283001,)
(121287,)


## Linear Model W2V

In [58]:
from sklearn.linear_model import LogisticRegression as LR

LR_wikimedia = LR(penalty='l2',
               dual=False,
               tol=0.0001,
               C=1.0,
               fit_intercept=True,
               intercept_scaling=1,
               class_weight=None,
               random_state=0,
               solver='liblinear',
               max_iter=100,
               multi_class='ovr',
               verbose=0).fit(quora_train_features_wikimedia, quora_train_y_wikimedia)

fileObject = open("../Dados/Kaggle/LR_wikimedia",'wb') 
pickle.dump(LR_wikimedia,fileObject, protocol = 4)  
fileObject.close()

In [59]:
fileObject = open("../Dados/Kaggle/LR_wikimedia",'rb')  
LR_wikimedia = pickle.load(fileObject)

eval_LR_wikimedia_tts = LR_wikimedia.score(quora_test_features_wikimedia, quora_test_y_wikimedia)
print(eval_LR_wikimedia_tts)

predict_wikimedia = LR_wikimedia.predict_proba(quora_test_features_wikimedia)
print(log_loss(quora_test_y_wikimedia,predict_wikimedia))

0.740714173819
0.492142999097


## Random Forest Wikimedia

In [60]:
from sklearn.ensemble import RandomForestClassifier

randomforest_wikimedia = RandomForestClassifier(n_estimators=300, max_features='auto', bootstrap=False, 
                               oob_score=False, n_jobs=-1, random_state=0).fit(quora_train_features_wikimedia, 
                                                                               quora_train_y_wikimedia)

fileObject = open("../Dados/Kaggle/randomforest_wikimedia",'wb') 
pickle.dump(randomforest_wikimedia,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/randomforest_wikimedia",'rb')  
randomforest_wikimedia = pickle.load(fileObject)

randomforest_wikimedia_score = randomforest_wikimedia.score(quora_test_features_wikimedia, quora_test_y_wikimedia)
print(randomforest_wikimedia_score)

predict_wikimedia = randomforest_wikimedia.predict_proba(quora_test_features_wikimedia)

print(log_loss(quora_test_y_wikimedia,predict_wikimedia))

0.818043153842
0.408686326446


## Gradient Boosting Wikimedia

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

GBC_wikimedia = GradientBoostingClassifier(loss='deviance',
                                        learning_rate=0.1,
                                        n_estimators=300,
                                        subsample=1.0,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_depth=3,
                                        init=None,
                                        random_state=0,
                                        max_features=None,
                                        verbose=0,
                                        max_leaf_nodes=None,
                                        warm_start=False,
                                        presort='auto').fit(quora_train_features_wikimedia, quora_train_y_wikimedia)

fileObject = open("../Dados/Kaggle/GBC_wikimedia",'wb') 
pickle.dump(GBC_wikimedia,fileObject, protocol = 4)  
fileObject.close()

In [45]:
fileObject = open("../Dados/Kaggle/GBC_wikimedia",'rb')  
GBC_wikimedia = pickle.load(fileObject)

eval_GBC_wikimedia = GBC_wikimedia.score(quora_test_features_wikimedia, quora_test_y_wikimedia)
print(eval_GBC_wikimedia)

predict_wikimedia = GBC_wikimedia.predict_proba(quora_test_features_wikimedia)
print(log_loss(quora_test_y_wikimedia,predict_wikimedia))

0.68
0.861538711172


## Voting Wikimedia

In [None]:
from sklearn.ensemble import VotingClassifier
vot_wikimedia = VotingClassifier(estimators=[('rf', randomforest_wikimedia),
                                             ('lr', LR_wikimedia)],voting='soft').fit(quora_train_features_wikimedia,quora_train_y_wikimedia)
                                             #('gbc', GBC_wikimedia)], 
                        

fileObject = open("../Dados/Kaggle/vot_wikimedia",'wb') 
pickle.dump(vot_wikimedia,fileObject, protocol = 4)  
fileObject.close()

In [None]:
fileObject = open("../Dados/Kaggle/vot_wikimedia",'rb')  
vot_wikimedia = pickle.load(fileObject)

eval_vot_wikimedia_tts = vot_wikimedia.score(quora_test_features_wikimedia, quora_test_y_wikimedia)
print(eval_vot_wikimedia_tts)

predict_wikimedia = vot_wikimedia.predict_proba(quora_test_features_wikimedia)
print(log_loss(quora_test_y_wikimedia, predict_wikimedia))