# Imports

In [None]:
import os
import sys
import time
import re
import pickle
import logging
import string
import warnings
import math

import pandas as pd
import numpy as np
import pylab
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC

import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec

from bs4 import BeautifulSoup
from sklearn.metrics import log_loss

# Get Data

In [None]:
#nltk.download("stopwords")
# Load the training set into a DataFrame; the path is relative to the working directory.
quora_train = pd.read_csv("data/train.csv")

In [None]:
# Quick sanity check: show the container type and the first rows of the data.
print (type(quora_train))
print(quora_train.head())

# Functions to process data

In [None]:
# Editing questions with NLTK package

def remove_stopwords(phrase,list_stopwords):
    """
    Receives a phrase and removes every word that appears in a list of stopwords
    :param phrase: String. A phrase. A missing value (float NaN) is accepted too.
    :param list_stopwords: List. A list of stopwords
    :return: The same phrase without stopwords; "" when the phrase is a float (NA)
    """
    # NA is a float type: return "" like the other cleaning functions do,
    # instead of failing on phrase.split
    if isinstance(phrase, float):
        return ""

    # Split on a single space (not on arbitrary whitespace) so the spacing
    # between the surviving words is rebuilt unchanged by the join below.
    kept_words = [word for word in phrase.split(" ") if word not in list_stopwords]

    return ' '.join(kept_words)
    
def remove_punctuation(phrase):
    """
    Strips every punctuation character from a phrase
    :param phrase: String. A phrase (a float NaN stands for a missing phrase).
    :return: The phrase without punctuation, or "" when the phrase is NaN
    """
    # A missing value comes in as a float NaN and maps to an empty string
    is_missing = type(phrase) is float and math.isnan(phrase)
    if is_missing:
        return ""

    # Translation table that deletes each character of string.punctuation
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    return phrase.translate(punctuation_table)

def lemm_wordnet(phrase):
    """
    Lemmatizes every word of a phrase with the WordNet lemmatizer
    :param phrase: String. A phrase (a float stands for a missing phrase).
    :return: The phrase with each word replaced by its lemma, or "" for a float
    """
    lemmatizer = WordNetLemmatizer()

    # Missing values (NA) are floats and cannot be split: answer "" for them
    if type(phrase) is float:
        return ""

    lemmas = map(lemmatizer.lemmatize, phrase.split())
    return ' '.join(lemmas)
    
def remove_duplicate(phrase):
    """
    Receives a phrase and removes all duplicate words, keeping the first occurrence of each
    :param phrase: String. A phrase. A missing value (float NaN) is accepted too.
    :return: The same phrase with just unique words, in their original order and
             separated by single spaces; "" when the phrase is a float (NA)
    """
    # NA is a float type and has no words to keep
    if isinstance(phrase, float):
        return ""

    # dict.fromkeys keeps the first occurrence of every key in insertion order,
    # so the words are de-duplicated in one pass instead of scanning a list per word.
    return ' '.join(dict.fromkeys(phrase.split()))
    
    
def all_lower_case(phrase):
    """
    Converts a phrase to lower case
    :param phrase: String. A phrase (floats, i.e. missing values, are accepted).
    :return: The phrase in lower case; a float is handed back untouched
    """
    # NA shows up as a float, which has no lower(): pass it through unchanged
    if type(phrase) is float:
        return phrase
    return phrase.lower()
    
def stem_snowball(phrase):
    """
    Receives a phrase and returns it with every word stemmed by the English Snowball stemmer
    :param phrase: String. A phrase. A missing value (float NaN) is accepted too.
    :return: String. The stemmed phrase. Stopwords are NOT removed here (use
             remove_stopwords for that); "" when the phrase is a float (NA)
    """
    # NA is a float type: return "" like the other cleaning functions do,
    # instead of failing on phrase.split
    if isinstance(phrase, float):
        return ""

    stemmer = SnowballStemmer("english")

    # Stem word by word; splitting on a single space lets the join rebuild
    # the original spacing
    return ' '.join(stemmer.stem(word) for word in phrase.split(" "))

# Quick manual check of the stemmer on a sample question
stem_snowball("What is the step by step guide to invest in share market in india?")

#This function will return a Bag of words of our two questions using TF method
def vectorizer_tf(data, features = 5000):
    """
    Receives the data frame. Fits one vocabulary on question1 and question2 together and
    vectorizes both questions with the tf algorithm (term counts of unigrams and bigrams).
    :param data: data frame with the text columns "question1" and "question2".
    :param features: maximum number of features (vocabulary size) for the vectorizer.
    :return: A dense array with one row per line of data and at most #features columns,
             holding the element-wise mean of the two questions' count vectors
    """
    vectorizer_count = CountVectorizer(ngram_range=(1, 2), max_features = features)

    # Stack both question columns so they share a single vocabulary.
    # pd.concat is used because Series.append is no longer available in pandas 2.x.
    merge = pd.concat([data.question1, data.question2])

    vector_fitt = vectorizer_count.fit(merge)

    question1 = vector_fitt.transform(data.question1).toarray()
    question2 = vector_fitt.transform(data.question2).toarray()

    # One combined vector per question pair
    return (question1 + question2)/2


#This function will return a Bag of words of our two questions using TF-idf method

def vectorizer_tf_idf(data, features = 5000):
    """
    Receives the data frame. Fits one vocabulary on question1 and question2 together and
    vectorizes both questions with the tf-idf algorithm (unigrams and bigrams, sublinear tf).
    :param data: data frame with the text columns "question1" and "question2".
    :param features: maximum number of features (vocabulary size) for the vectorizer.
    :return: A dense array with one row per line of data and at most #features columns,
             holding the element-wise mean of the two questions' tf-idf vectors
    """
    # Named differently from the function so the local does not shadow it
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)

    # Stack both question columns so they share a single vocabulary and idf weights.
    # pd.concat is used because Series.append is no longer available in pandas 2.x.
    merge = pd.concat([data.question1, data.question2])

    vector_tf_idf_fitt = tfidf_vectorizer.fit(merge)

    question1 = vector_tf_idf_fitt.transform(data.question1).toarray()
    question2 = vector_tf_idf_fitt.transform(data.question2).toarray()

    # One combined vector per question pair
    return (question1 + question2)/2

In [None]:
#cleaning tool is used so you can easily choose which functions you want to use to clean the text
def cleaning_tool(data, drop_na = True, lower_case = True, rm_duplicate = False, stopwords = False, 
                  punctuation = False, lemm = False, stem = False, list_of_stopwords = None):
    """
    Function to process all data by calling the functions from above, according to what was chosen.
    The selected steps run in this order: duplicate removal, lower case, stopwords,
    punctuation, lemmatization, stemming. The data frame passed in is not modified.
    :param data: data frame with the text columns "question1" and "question2".
    :param drop_na: If True drop all lines of data frame with NA
    :param lower_case: If True transform for lower case
    :param rm_duplicate: If True remove all duplicate words in questions
    :param stopwords: If True removes stopwords
    :param punctuation: If True removes punctuation
    :param lemm: If True returns the phrase lemmatized
    :param stem: If True returns the phrase stemmed
    :param list_of_stopwords: List of stopwords to be used (needed when stopwords is True)
    :return: A new data frame with question1 and question2 processed according to parameters
    """
    if drop_na:
        # axis is passed by keyword: pandas 2.x no longer accepts it positionally
        data = data.dropna(axis=0)

    # Work on a private copy so the column assignments below never write into
    # the caller's frame (this matters most when drop_na is False).
    data = data.copy()

    if rm_duplicate:
        _apply_to_questions(data, remove_duplicate)

    if lower_case:
        _apply_to_questions(data, all_lower_case)

    if stopwords:
        _apply_to_questions(data, lambda x: remove_stopwords(x, list_of_stopwords))

    if punctuation:
        _apply_to_questions(data, remove_punctuation)

    # These two branches must test the lemm/stem flags; comparing the functions
    # lemm_wordnet/stem_snowball themselves to True is always False.
    if lemm:
        _apply_to_questions(data, lemm_wordnet)

    if stem:
        _apply_to_questions(data, stem_snowball)

    #We used it two times if some function create a new NA.
    if drop_na:
        data = data.dropna(axis=0)

    return data


def _apply_to_questions(data, func):
    """Apply func to every value of the "question1" and "question2" columns of data, in place."""
    for column in ("question1", "question2"):
        data[column] = data[column].apply(func)

# Cleaning Quora Train

In [None]:
# Clean both question columns, requesting lemmatization and duplicate-word removal
# (NA dropping and lower-casing are enabled by cleaning_tool's defaults).
quora_train = cleaning_tool(quora_train, lemm = True, rm_duplicate = True)

# Bag of Words

In [None]:
# Keep only the first 5000 rows (presumably to keep the dense bag-of-words
# matrices built below small -- confirm before scaling up).
quora_train = quora_train.head(5000)

In [None]:
#https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    
# Term-count (tf) features: one row per question pair, at most 5000 columns.
quora_train_tf = vectorizer_tf(quora_train, features = 5000)

In [None]:
# tf-idf features: one row per question pair, at most 5000 columns.
quora_train_tf_idf = vectorizer_tf_idf(quora_train, features = 5000)

# Store edited databases w/ Pickle

In [None]:
#fileObject = open("Edited_Base_stem_stopwords",'wb') 
#pickle.dump(quora_train,fileObject)  
#fileObject.close()

In [None]:
#fileObject = open("data/quora_train_lemm_NA_lowercase",'rb')  
#quora_train = pickle.load(fileObject)

# Split data into training/testing

In [None]:
# train_test_split divides the data set into 4 parts:
#   1. the new "train" features, without the dependent (target) variable,
#   2. the new "test" features, without the dependent (target) variable,
#   3. the dependent variable ("is_duplicate") for the rows of the first part,
#   4. the dependent variable ("is_duplicate") for the rows of the second part.
# test_size = 0.3 holds out 30% of the rows for testing and random_state = 0
# makes the shuffle reproducible.

quora_train_features_tf, quora_test_features_tf, quora_train_y_tf, quora_test_y_tf = model_selection.train_test_split(
    quora_train_tf, quora_train['is_duplicate'], test_size = 0.3, random_state = 0)

In [None]:
# Split the tf-idf features against the "is_duplicate" target, using the same
# test_size and random_state as the tf split above.
quora_train_features_tf_idf, quora_test_features_tf_idf, quora_train_y_tf_idf, quora_test_y_tf_idf = model_selection.train_test_split(
    quora_train_tf_idf, quora_train['is_duplicate'], test_size = 0.3, random_state = 0)

# Exploratory Data Analysis

In [None]:
def calculate_common_percentage(df):
    """
    Receives the data frame and adds the columns "num_words_common", "num_words_total" and "common_percentage"
    :param df: Data frame with the string columns "question1_edited" and "question2_edited".
               It is modified in place.
    :return: The same data frame with the added columns:
             "num_words_total"   - number of words in question1_edited,
             "num_words_common"  - how many of those words are also a word of question2_edited,
             "common_percentage" - num_words_common / num_words_total
    """
    num_words_common = []
    num_words_total = []

    # Walk the two columns in lockstep (by position), so the function also works
    # when the index is not 0..n-1, e.g. after rows were dropped.
    for question1, question2 in zip(df["question1_edited"], df["question2_edited"]):
        words_question1 = question1.split(" ")
        # Compare whole words: testing `word in question2` on the raw string would
        # also count substrings (e.g. "a" inside "banana").
        words_question2 = set(question2.split(" "))

        num_words_common.append(sum(word in words_question2 for word in words_question1))
        num_words_total.append(len(words_question1))

    # .values assigns by position instead of aligning on the index
    df["num_words_common"] = pd.Series(num_words_common, dtype="int64").values
    df["num_words_total"] = pd.Series(num_words_total, dtype="int64").values
    # num_words_total is never 0: splitting even an empty string yields one item
    df["common_percentage"] = df["num_words_common"]/df["num_words_total"]

    return (df)

# Learning Models

## Random Forest

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

# Random forest on the tf features.
# max_features='sqrt' is what 'auto' stood for in classifiers; the 'auto'
# spelling is rejected by recent scikit-learn releases.
randomforest_tf = RandomForestClassifier(n_estimators=300, max_features='sqrt', bootstrap=False, 
                               oob_score=False, n_jobs=-1, random_state=0).fit(quora_train_features_tf, quora_train_y_tf)

# Mean accuracy on the held-out rows
randomforest_tf_score = randomforest_tf.score(quora_test_features_tf, quora_test_y_tf)
print(randomforest_tf_score)

# Log-loss is computed on the predicted class probabilities
predict_tf = randomforest_tf.predict_proba(quora_test_features_tf)

print(log_loss(quora_test_y_tf,predict_tf))

In [None]:
# Random forest on the tf-idf features.
# max_features='sqrt' is what 'auto' stood for in classifiers; the 'auto'
# spelling is rejected by recent scikit-learn releases.
randomforest_tf_idf = RandomForestClassifier(n_estimators=300, max_features='sqrt', 
                                             bootstrap=False, oob_score=False, 
                                             n_jobs=-1, 
                                             random_state=0).fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

# Report the tf-idf model's own accuracy (not the tf model's score)
randomforest_tf_score_idf = randomforest_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(randomforest_tf_score_idf)

predict_tf_idf = randomforest_tf_idf.predict_proba(quora_test_features_tf_idf)

# Log-loss of the tf-idf predictions against the tf-idf test labels
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression as LR

# L2-regularised logistic regression on the tf features.
# multi_class is left at its default: the explicit 'ovr' value is deprecated in
# recent scikit-learn releases and makes no difference for a two-class target
# (assumes is_duplicate is binary -- confirm).
clf_LR_tf = LR(penalty='l2',
               dual=False,
               tol=0.0001,
               C=1.0,
               fit_intercept=True,
               intercept_scaling=1,
               class_weight=None,
               random_state=0,
               solver='liblinear',
               max_iter=100,
               verbose=0).fit(quora_train_features_tf, quora_train_y_tf)

# Mean accuracy on the held-out rows, then log-loss on the predicted probabilities
eval_LR_tf_tts = clf_LR_tf.score(quora_test_features_tf, quora_test_y_tf)
print(eval_LR_tf_tts)
predict_tf = clf_LR_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf,predict_tf))

In [None]:
# L2-regularised logistic regression on the tf-idf features.
# multi_class is left at its default: the explicit 'ovr' value is deprecated in
# recent scikit-learn releases and makes no difference for a two-class target
# (assumes is_duplicate is binary -- confirm).
clf_LR_tf_idf = LR(penalty='l2',
                  dual=False,
                  tol=0.0001,
                  C=1.0,
                  fit_intercept=True,
                  intercept_scaling=1,
                  class_weight=None,
                  random_state=0,
                  solver='liblinear',
                  max_iter=100,
                  verbose=0).fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

# Mean accuracy on the held-out rows, then log-loss on the predicted probabilities
eval_LR_tf_idf_tts = clf_LR_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(eval_LR_tf_idf_tts)
predict_tf_idf = clf_LR_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on the tf features.
# loss and presort are left at their defaults: the 'deviance' spelling of the
# default loss and the presort argument are not accepted by recent
# scikit-learn releases, and both original values were the defaults anyway.
clf_GBC_tf = GradientBoostingClassifier(learning_rate=0.1,
                                        n_estimators=300,
                                        subsample=1.0,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_depth=3,
                                        init=None,
                                        random_state=0,
                                        max_features=None,
                                        verbose=0,
                                        max_leaf_nodes=None,
                                        warm_start=False).fit(quora_train_features_tf, quora_train_y_tf)

# Mean accuracy on the held-out rows, then log-loss on the predicted probabilities
eval_GBC_tf_tts = clf_GBC_tf.score(quora_test_features_tf, quora_test_y_tf)
print(eval_GBC_tf_tts)
predict_tf = clf_GBC_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf,predict_tf))

In [None]:
# Gradient boosting on the tf-idf features.
# loss and presort are left at their defaults: the 'deviance' spelling of the
# default loss and the presort argument are not accepted by recent
# scikit-learn releases, and both original values were the defaults anyway.
# NOTE(review): n_estimators is 100 here but 300 for the tf model -- confirm
# whether the difference is intended.
clf_GBC_tf_idf = GradientBoostingClassifier(learning_rate=0.1,
                                           n_estimators=100,
                                           subsample=1.0,
                                           min_samples_split=2,
                                           min_samples_leaf=1,
                                           min_weight_fraction_leaf=0.0,
                                           max_depth=3,
                                           init=None,
                                           random_state=0,
                                           max_features=None,
                                           verbose=0,
                                           max_leaf_nodes=None,
                                           warm_start=False).fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

# Mean accuracy on the held-out rows, then log-loss on the predicted probabilities
eval_GBC_tf_idf_tts = clf_GBC_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(eval_GBC_tf_idf_tts)
predict_tf_idf = clf_GBC_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf,predict_tf_idf))

## Voting

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the three tf classifiers built above
clf_vot_tf = VotingClassifier(
    estimators=[
        ('rf', randomforest_tf),
        ('lr', clf_LR_tf),
        ('gbc', clf_GBC_tf),
    ],
    voting='soft',
)
clf_vot_tf.fit(quora_train_features_tf, quora_train_y_tf)

# Accuracy on the held-out rows
eval_vot_tf_tts = clf_vot_tf.score(quora_test_features_tf, quora_test_y_tf)
print(eval_vot_tf_tts)

# Log-loss on the predicted class probabilities
predict_tf = clf_vot_tf.predict_proba(quora_test_features_tf)
print(log_loss(quora_test_y_tf, predict_tf))

In [None]:
# Soft-voting ensemble of the three tf-idf classifiers built above
clf_vot_tf_idf = VotingClassifier(
    estimators=[
        ('rf', randomforest_tf_idf),
        ('lr', clf_LR_tf_idf),
        ('gbc', clf_GBC_tf_idf),
    ],
    voting='soft',
)
clf_vot_tf_idf.fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

# Accuracy on the held-out rows
eval_vot_tf_idf_tts = clf_vot_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
print(eval_vot_tf_idf_tts)

# Log-loss on the predicted class probabilities
predict_tf_idf = clf_vot_tf_idf.predict_proba(quora_test_features_tf_idf)
print(log_loss(quora_test_y_tf_idf, predict_tf_idf))

## Keras

In [None]:
# https://www.kaggle.com/cstahl12/titanic/titanic-with-keras

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD, RMSprop
from keras.utils.np_utils import to_categorical
from keras.utils import np_utils

print('Keras using {} backend'.format(keras.backend.backend()))  #https://keras.io/backend/

# Batches of 10% of the training rows. batch_size has to be a plain int
# (np.round returns a float), and at least 1.
batch_size = max(1, int(round(len(quora_train_features_tf) * 0.10)))
num_classes = 2
epochs = 50

quora_train_features_tf_reshape = quora_train_features_tf.astype('float32')
quora_test_features_tf_reshape = quora_test_features_tf.astype('float32')

print(quora_train_features_tf_reshape.shape[0], 'train samples')
print(quora_test_features_tf_reshape.shape[0], 'test samples')

# One-hot labels: only needed for the categorical_crossentropy / units=2 variant
quora_train_y_tf_cat = np_utils.to_categorical(quora_train_y_tf, num_classes)
quora_test_y_tf_cat = np_utils.to_categorical(quora_test_y_tf, num_classes)

# The network below ends in a single sigmoid unit trained with
# binary_crossentropy, so its targets are the plain 1-D labels, not the
# two-column one-hot matrices (assumes is_duplicate holds 0/1 -- confirm).
quora_train_y_tf_bin = np.asarray(quora_train_y_tf, dtype='float32')
quora_test_y_tf_bin = np.asarray(quora_test_y_tf, dtype='float32')

model = Sequential()
model.add(Dense(input_dim=quora_train_features_tf_reshape.shape[1], activation='relu', units=200))
model.add(Dropout(0.2))
model.add(Dense(input_dim=200, activation='relu', units=200))
model.add(Dropout(0.2))
model.add(Dense(input_dim=200, activation='relu', units=24)) #activation='softmax'
model.add(Dropout(0.2))
model.add(Dense(input_dim=24, kernel_initializer='uniform', activation='sigmoid', units=1)) #units=2 for cat.cr.ent
model.summary()


model.compile(loss='binary_crossentropy',
              #loss='categorical_crossentropy',
              #loss='mean_squared_error',
              optimizer=RMSprop(), 
              #optimizer=SGD(lr=0.001),
              metrics=['accuracy'])

history = model.fit(quora_train_features_tf_reshape, quora_train_y_tf_bin,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(quora_test_features_tf_reshape, quora_test_y_tf_bin))

score = model.evaluate(quora_test_features_tf_reshape, quora_test_y_tf_bin, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


# Log-loss needs probabilities, not hard class labels: take the sigmoid output
# (one probability of the positive class per row) and flatten it to 1-D.
predict_tf = model.predict(quora_test_features_tf_reshape).ravel().astype('float64')
print(log_loss(quora_test_y_tf, predict_tf))