# Imports

In [None]:
import os
import sys
import time
import re
import pickle
import logging
import string
import warnings
import math

import pandas as pd
import numpy as np
import pylab
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score as AUC

import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import Word2Vec

from bs4 import BeautifulSoup
from sklearn.metrics import log_loss

from sklearn.ensemble import RandomForestClassifier

# Get Data

In [None]:
# Fetch the NLTK corpora this notebook relies on: the stopword lists and the
# WordNet data that WordNetLemmatizer (used by lemm_wordnet) looks words up in.
nltk.download("stopwords")
nltk.download("wordnet")

# Load the training data; later cells read its question1, question2 and
# is_duplicate columns.
quora_train = pd.read_csv("data/train.csv")

In [None]:
# Quick sanity check of the loaded data: print the object's type and its first rows.
print (type(quora_train))
print(quora_train.head())

# Functions to process data

In [None]:
# Editing questions with NLTK package

#stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(phrase,list_stopwords):
    """
    Drop every word of a phrase that appears in a stopword collection.

    The phrase is split on single spaces and the surviving words are joined
    back with single spaces. Matching is exact and case-sensitive.

    :param phrase: String. The phrase to filter. A missing value (float NaN)
        is treated as an empty phrase.
    :param list_stopwords: Collection of words to remove.
    :return: String. The phrase without the stopwords ("" for a missing value).
    """
    # Missing questions arrive as float NaN; return "" like remove_punctuation
    # and lemm_wordnet do instead of failing on phrase.split().
    if isinstance(phrase, float) and math.isnan(phrase):
        return ""

    kept_words = [word for word in phrase.split(" ") if word not in list_stopwords]

    return ' '.join(kept_words)
    
def remove_punctuation(phrase):
    """
    Strip every character listed in string.punctuation from a phrase.

    :param phrase: String, or float NaN for a missing value.
    :return: String. The phrase without punctuation; "" when phrase is NaN.
    """
    # A missing value shows up as float NaN: there is no text to clean
    is_missing = type(phrase) is float and math.isnan(phrase)
    if is_missing:
        return ""

    # Translation table that maps each punctuation character to "delete"
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

    return phrase.translate(drop_punctuation)

def lemm_wordnet(phrase):
    """
    Lemmatize each whitespace-separated word of a phrase with NLTK's
    WordNetLemmatizer and join the results with single spaces.

    :param phrase: String, or a float (a missing value reaches this function as a float NaN).
    :return: String. The lemmatized phrase; "" for any float input.
    """
    lemmatizer = WordNetLemmatizer()

    # Floats (missing values) carry no text to lemmatize
    if type(phrase) is float:
        return ""

    return ' '.join(lemmatizer.lemmatize(token) for token in phrase.split())
    
def all_lower_case(phrase):
    """
    Lower-case a phrase, passing float inputs through untouched.

    :param phrase: String, or a float (e.g. the NaN that stands for a missing value).
    :return: The lower-cased string, or the float exactly as received.
    """
    # Floats carry no text, so they are handed back as they came in
    if type(phrase) is float:
        return phrase
    return phrase.lower()
    
def stem_snowball(phrase):
    """
    Receives a phrase and returns the same phrase stemmed, lowercase and without stopwords.

    Steps, in order: punctuation removal, lower-casing, English stopword
    removal, Snowball (English) stemming of every remaining word.

    :param phrase: String. A phrase. A float NaN is turned into an empty
        phrase by remove_punctuation.
    :return: String. Stemmed, lowercase phrase without stopwords
    """
    stemmer = SnowballStemmer("english")

    phrase = remove_punctuation(phrase)
    phrase = phrase.lower() #removing word case

    # The module-level name `stopwords` is the NLTK corpus reader right after
    # the imports and only becomes a plain word list once a later cell rebinds
    # it. Accept both so the function works whichever cells have been run.
    if isinstance(stopwords, (list, tuple, set, frozenset)):
        english_stopwords = stopwords
    else:
        english_stopwords = stopwords.words('english')

    phrase = remove_stopwords(phrase, english_stopwords)

    #Stem words according to stemmer
    return ' '.join(stemmer.stem(word) for word in phrase.split(" "))

# Try stem_snowball on a sample question; the returned string is not stored.
stem_snowball("What is the step by step guide to invest in share market in india?")

#This function will return a Bag of words of our two questions using TF method
def vectorizer_tf(data, features = 5000):
    """
    Build bag-of-words count features for both question columns.

    One CountVectorizer (unigrams and bigrams, vocabulary capped at `features`
    terms) is fitted on question1 and question2 together, so both columns
    share a single vocabulary. Each column is then transformed on its own and
    the two dense matrices are placed side by side.

    :param data: Data frame with the text columns "question1" and "question2".
        Missing values are treated as empty strings.
    :param features: Int. Maximum vocabulary size (max_features of the vectorizer).
    :return: Dense numpy array with one row per row of `data`: the question1
        counts first, followed by the question2 counts.
    """
    vectorizer_count = CountVectorizer(ngram_range=(1, 2), max_features = features)

    # The vectorizer rejects NaN documents, so map missing questions to "".
    question1_text = data.question1.fillna("")
    question2_text = data.question2.fillna("")

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    merge = pd.concat([question1_text, question2_text])

    vector_fitt = vectorizer_count.fit(merge)

    question1 = vector_fitt.transform(question1_text).toarray()
    question2 = vector_fitt.transform(question2_text).toarray()

    return np.column_stack((question1,question2))

#This function will return a Bag of words of our two questions using TF-idf method

def vectorizer_tf_idf(data, features = 5000):
    """
    Build TF-IDF features for both question columns.

    One TfidfVectorizer (unigrams and bigrams, vocabulary capped at `features`
    terms, sublinear term-frequency scaling) is fitted on question1 and
    question2 together, so both columns share a single vocabulary and the same
    idf weights. Each column is then transformed on its own and the two dense
    matrices are placed side by side.

    :param data: Data frame with the text columns "question1" and "question2".
        Missing values are treated as empty strings.
    :param features: Int. Maximum vocabulary size (max_features of the vectorizer).
    :return: Dense numpy array with one row per row of `data`: the question1
        weights first, followed by the question2 weights.
    """
    # Named differently from the function so the local does not shadow it.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = features, sublinear_tf=True)

    # The vectorizer rejects NaN documents, so map missing questions to "".
    question1_text = data.question1.fillna("")
    question2_text = data.question2.fillna("")

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    merge = pd.concat([question1_text, question2_text])

    vector_tf_idf_fitt = tfidf_vectorizer.fit(merge)

    question1 = vector_tf_idf_fitt.transform(question1_text).toarray()
    question2 = vector_tf_idf_fitt.transform(question2_text).toarray()

    return np.column_stack((question1,question2))

In [None]:
#cleaning tool is used so you can easily choose which functions you want to use to clean the text
def cleaning_tool(data, lower_case = True, stopwords = False, punctuation = False, lemm = False, 
                  stem = False, list_of_stopwords = None):
    """
    Apply the selected cleaning steps to the "question1" and "question2" columns.

    The steps run in a fixed order: lower-casing, stopword removal,
    punctuation removal, lemmatization, stemming. The two columns are
    overwritten on `data` itself, which is also returned.

    :param data: Data frame with the columns "question1" and "question2".
    :param lower_case: Bool. Lower-case the questions (all_lower_case).
    :param stopwords: Bool. Remove the words of list_of_stopwords (remove_stopwords).
    :param punctuation: Bool. Remove punctuation (remove_punctuation).
    :param lemm: Bool. Lemmatize the questions (lemm_wordnet).
    :param stem: Bool. Stem the questions (stem_snowball).
    :param list_of_stopwords: Collection of stopwords; required when stopwords is True.
    :return: The same data frame, with the cleaned question columns.
    :raises ValueError: If stopwords is True but list_of_stopwords is None.
    """
    # `and` / `is None` on purpose: in `stopwords == True & list_of_stopwords == None`
    # the `&` binds tighter than `==`, so `True & list_of_stopwords` was evaluated
    # and raised a TypeError on every call.
    if stopwords and list_of_stopwords is None:
        raise ValueError("You need to set stopwords in list_of_stopwords")

    def apply_to_questions(clean):
        # Run one cleaning function over both question columns
        for column in ("question1", "question2"):
            data[column] = data[column].apply(clean)

    if lower_case:
        apply_to_questions(all_lower_case)

    if stopwords:
        apply_to_questions(lambda x: remove_stopwords(x, list_of_stopwords))

    if punctuation:
        apply_to_questions(remove_punctuation)

    # Test the flags, not the helper functions: `lemm_wordnet == True` and
    # `stem_snowball == True` are always False, so these steps never ran.
    if lemm:
        apply_to_questions(lemm_wordnet)

    if stem:
        apply_to_questions(stem_snowball)

    return data

# Cleaning Quora Train

In [None]:
# English stopword list from NLTK.
# NOTE(review): this rebinds the module-level name `stopwords`, which the import
# cell bound to the nltk.corpus.stopwords reader; stem_snowball reads this global.
stopwords = nltk.corpus.stopwords.words('english')

# Clean the question columns, requesting lemmatization on top of the default lower-casing.
quora_train = cleaning_tool(quora_train, lemm = True)

# Bag of Words

In [None]:
#https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    
# Count-based bag-of-words matrix for both question columns (vocabulary capped at 5000 terms).
quora_train_tf = vectorizer_tf(quora_train, features = 5000)

In [None]:
# TF-IDF matrix for both question columns (vocabulary capped at 5000 terms).
quora_train_tf_idf = vectorizer_tf_idf(quora_train, features = 5000)

# Store edited databases w/ Pickle

In [None]:
# Persist the data frame to disk; the with-block closes the file even if pickling fails.
with open("Edited_Base_stem_stopwords",'wb') as fileObject:
    pickle.dump(quora_train,fileObject)

In [None]:
# Reload the pickled data frame; the with-block makes sure the file gets closed.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook wrote itself.
with open("Edited_Base_stem_stopwords",'rb') as fileObject:
    quora_train = pickle.load(fileObject)

# Split data into training/testing

In [None]:
'''
The function train_test_split splits your database into 4 parts:
the first one is the new "train" database without the dependent (target) variable,
the second one is the new test database without the dependent variable,
the third one is just the dependent variable for the rows of the first part and
the fourth one is just the dependent variable for the rows of the second part.
'''

# Hold out 30% of the rows of the count features (and the matching is_duplicate
# labels) for testing; random_state is fixed so the split is reproducible.
quora_train_features_tf, quora_test_features_tf, quora_train_y_tf, quora_test_y_tf = model_selection.train_test_split(
    quora_train_tf, quora_train['is_duplicate'], test_size = 0.3, random_state = 0)

In [None]:
# Same 70/30 split for the TF-IDF features, with the same test_size and random_state
# as the count-feature split.
quora_train_features_tf_idf, quora_test_features_tf_idf, quora_train_y_tf_idf, quora_test_y_tf_idf = model_selection.train_test_split(
    quora_train_tf_idf, quora_train['is_duplicate'], test_size = 0.3, random_state = 0)

# Exploratory Data Analysis

In [None]:
def calculate_common_percentage(df, question1_column="question1_edited",
                                question2_column="question2_edited"):
    """
    Adds the columns "num_words_common", "num_words_total" and "common_percentage" to a data frame.

    For every row, "num_words_total" is the number of whitespace-separated
    words in the first question, "num_words_common" is how many of those words
    also occur as whole words in the second question, and "common_percentage"
    is their ratio (NaN when the first question has no words).

    :param df: Data frame holding the two question columns. It is modified in place.
    :param question1_column: String. Name of the column with the first question.
    :param question2_column: String. Name of the column with the second question.
    :return: The same data frame with the three columns added.
    """

    def split_words(text):
        # Missing questions (NaN) are not strings; count them as having no words
        return text.split() if isinstance(text, str) else []

    num_words_common = []
    num_words_total = []

    # zip walks the rows by position, so the frame's index labels do not matter
    for question1, question2 in zip(df[question1_column], df[question2_column]):
        words1 = split_words(question1)
        # Whole-word lookup: a substring test against the raw question2 text
        # would count "a" as present in "cat"
        words2 = set(split_words(question2))
        num_words_common.append(sum(word in words2 for word in words1))
        num_words_total.append(len(words1))

    df["num_words_common"] = np.asarray(num_words_common, dtype=np.int64)
    df["num_words_total"] = np.asarray(num_words_total, dtype=np.int64)
    # A first question without words gives 0/0, which pandas reports as NaN
    df["common_percentage"] = df["num_words_common"]/df["num_words_total"]

    return (df)



In [None]:
quora_train = calculate_common_percentage(quora_train)
#"num_words_common", "num_words_total" and "common_percentage"

# Draw on an explicitly created axes: DataFrame.boxplot with `by` opens a figure
# of its own when no axes is passed, which left the figure from plt.figure() empty.
fig, axes = plt.subplots()
quora_train.boxplot(column='common_percentage', by='is_duplicate', ax=axes)
# Clear the automatic "Boxplot grouped by ..." super-title that pandas adds
fig.suptitle('')

axes.set_ylim([-0.2,1.1])
axes.set_title("Common word percentage and is_duplicate")

plt.show()

# Learning Models

#### Random Forest

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# Random forest on the count features. max_features='sqrt' is what 'auto' meant for
# classifiers; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
randomforest_tf = RandomForestClassifier(
    n_estimators=300, max_features='sqrt', bootstrap=False,
    oob_score=False, n_jobs=-1, random_state=0,
).fit(quora_train_features_tf, quora_train_y_tf)

In [None]:
# Score of the count-feature forest on the held-out rows (for a classifier, score() is mean accuracy).
randomforest_tf_score = randomforest_tf.score(quora_test_features_tf, quora_test_y_tf)
print(randomforest_tf_score)

In [None]:
# Class probabilities of the count-feature forest for the held-out rows.
predict_tf = randomforest_tf.predict_proba(quora_test_features_tf)

In [None]:
# Log loss of the count-feature forest's predicted probabilities against the held-out labels.
log_loss(quora_test_y_tf,predict_tf)

In [None]:
# Random forest on the TF-IDF features. max_features='sqrt' is what 'auto' meant for
# classifiers; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
randomforest_tf_idf = RandomForestClassifier(
    n_estimators=300, max_features='sqrt', bootstrap=False,
    oob_score=False, n_jobs=-1, random_state=0,
).fit(quora_train_features_tf_idf, quora_train_y_tf_idf)

In [None]:
# Score of the TF-IDF forest on the held-out rows (for a classifier, score() is mean accuracy).
randomforest_tf_score_idf = randomforest_tf_idf.score(quora_test_features_tf_idf, quora_test_y_tf_idf)
# Print the TF-IDF score itself, not the count-feature score computed earlier.
print(randomforest_tf_score_idf)

In [None]:
# Class probabilities of the TF-IDF forest for the held-out rows.
predict_tf_idf = randomforest_tf_idf.predict_proba(quora_test_features_tf_idf)

In [None]:
# Log loss of the TF-IDF forest: use its own labels and predicted probabilities
# rather than the count-feature model's.
log_loss(quora_test_y_tf_idf, predict_tf_idf)