In [1]:
import numpy as np
import pandas as pd
import re
import random
import seaborn as sns
import pickle
from matplotlib import pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier


import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk import ngrams, bigrams




### Read in Data

In [8]:
def read_data(data_dir='Data'):
    """Load the labelled training frame, its label table and the test frame.

    Parameters
    ----------
    data_dir : str, default 'Data'
        Directory holding ``train_data.csv``, ``train_labels.csv`` and
        ``test_data.csv``. The default reproduces the original hard-coded paths.

    Returns
    -------
    list
        ``[train_labels, data_train, test_data]`` where ``data_train`` is
        ``train_data.csv`` with its own ``is_duplicate`` column replaced by the
        one from ``train_labels.csv`` (joined on ``id``).
    """
    def _csv(file_name):
        # Forward slash keeps the default paths identical to the original literals.
        return pd.read_csv('{}/{}'.format(data_dir, file_name))

    # Read in training data; its own label column is discarded in favour of the
    # separately shipped label file.
    data_train = _csv('train_data.csv').drop('is_duplicate', axis=1)
    train_labels = _csv('train_labels.csv')
    data_train = pd.merge(data_train, train_labels, on='id')
    test_data = _csv('test_data.csv')

    return [train_labels, data_train, test_data]

# read_data() returns [train_labels, data_train, test_data]; the merged training
# frame is bound to `train_df` and the test CSV to `validation_data` from here on.
train_labels, train_df, validation_data = read_data()

### Example Question Pair

In [9]:
# Show one raw question pair from the training frame.
# `train_df` is the frame returned by read_data(); the previously used
# `train_data` is only created later by the train/validation split, so it is
# undefined at this point on a fresh kernel.
q1_example = train_df['question1'].iloc[80]
q2_example = train_df['question2'].iloc[80]
print(q1_example)
print(q2_example)

Why did harry become a horcrux?
What is a Horcrux?


### Regex function 

In [10]:
# Ordered (pattern, replacement) rules; later rules rely on the output of
# earlier ones, so the order must not be changed.
_CLEAN_TEXT_RULES = (
    # NOTE: inside this class "+-=" is a character RANGE ('+' .. '='), so the
    # characters , - . / 0-9 : ; < = all survive this first pass.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # Was r"\0s": in Python's re syntax "\0" is the NUL character, so the rule
    # could never fire on real text. The replacement "0" shows the intent was
    # to turn e.g. "90s" into "90".
    (r"0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalize a question string with a fixed sequence of regex substitutions.

    The rules expand common English contractions, pad or drop punctuation,
    rewrite a few abbreviations and finally collapse runs of whitespace.
    The contraction patterns are lower-case, so callers are expected to pass
    lower-cased text (``cleanup`` does).

    Parameters
    ----------
    text : str
        Text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text

### Data Preprocessing

In [17]:
# Lazily built NLTK helpers shared by every cleanup() call.
_CLEANUP_RESOURCES = {}


def _cleanup_resource(name, factory):
    """Build the named helper on first use and reuse it afterwards."""
    if name not in _CLEANUP_RESOURCES:
        _CLEANUP_RESOURCES[name] = factory()
    return _CLEANUP_RESOURCES[name]


def cleanup(question, stop_remove = True, stemming=True):
    """Lower-case, regex-clean, optionally stop-word-filter and stem a question.

    Parameters
    ----------
    question : str
        Raw question text.
    stop_remove : bool, default True
        Drop English stop words (NLTK list) after cleaning.
    stemming : bool, default True
        Stem every remaining token with the Lancaster stemmer.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Lower-case first: the contraction rules in clean_text are lower-case.
    tokens = clean_text(question.lower()).split()

    # The stop-word set and the stemmer are created once and reused; the
    # original rebuilt both for every single question, which dominates the
    # run time when this is applied to whole DataFrame columns.
    if stop_remove:
        stop_words = _cleanup_resource('stop_words', lambda: set(stopwords.words('english')))
        tokens = [token for token in tokens if token not in stop_words]

    if stemming:
        stemmer = _cleanup_resource('stemmer', LancasterStemmer)
        tokens = [stemmer.stem(token) for token in tokens]

    return " ".join(tokens)

# Add the cleaned question columns to both frames (train first, then
# validation; question1 before question2). str() turns missing questions into
# the literal text 'nan' before cleaning.
for frame in (train_df, validation_data):
    for raw_col, clean_col in (('question1', 'q1_clean'), ('question2', 'q2_clean')):
        frame[clean_col] = frame[raw_col].apply(lambda question: cleanup(str(question)))

### Clean Text DF to CSV

In [18]:
# NOTE(review): this cell blocks on input(), so the notebook cannot be run
# unattended with "Run All".
print ('Do you want to write cleaned DF to CSV? ')
answer = input('y/n ').strip().lower()  # tolerate 'Y' / stray whitespace
print ("\n")

if answer == 'y':
    # index=False: otherwise the row index is written as an unnamed first
    # column and comes back as a spurious 'Unnamed: 0' column on reload.
    train_df.to_csv('Data/cleaned_train.csv', index=False)
    validation_data.to_csv('Data/cleaned_validation.csv', index=False)

elif answer == 'n':
    print ('Do you want to load the cleaned DF from CSV then?')
    answer2 = input('y/n ').strip().lower()

    if answer2 == 'y':
        train_df = pd.read_csv('Data/cleaned_train.csv')
        validation_data = pd.read_csv('Data/cleaned_validation.csv')
        # A cleaned question that became the empty string is read back as NaN;
        # restore '' so the reloaded frames match the freshly cleaned ones.
        for cleaned_frame in (train_df, validation_data):
            for clean_col in ('q1_clean', 'q2_clean'):
                cleaned_frame[clean_col] = cleaned_frame[clean_col].fillna('')

Do you want to write cleaned DF to CSV? 
y/n n


Do you want to load the cleaned DF from CSV then?
y/n n


### Obtain Weights for TF-IDF

In [19]:
from collections import Counter

# Corpus for the TF-IDF-style weights: every training question, lower-cased,
# all of question1 followed by all of question2. str() maps missing values to
# the text 'nan'.
q1_corpus = train_df['question1'].map(lambda question: str(question).lower()).tolist()
q2_corpus = train_df['question2'].map(lambda question: str(question).lower()).tolist()
corpus = q1_corpus + q2_corpus

def get_weight(count, eps=10000, min_count=2):
    """Return the smoothed inverse-frequency weight of a word.

    Parameters
    ----------
    count : number
        How often the word occurs in the corpus.
    eps : number, default 10000
        Smoothing term added to ``count`` in the denominator.
    min_count : number, default 2
        Words seen fewer times than this get weight 0.

    Returns
    -------
    number
        ``0`` when ``count < min_count``, otherwise ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)

# Count every whitespace-separated token of the (already lower-cased) corpus
# and turn the counts into weights.
# The original cell also assigned `eps = 5000` here, but that value was never
# passed to get_weight(), so all weights have always used get_weight's default
# eps=10000. The dead assignment is removed so the cell no longer suggests a
# smoothing value that is not in effect; pass eps explicitly below to change it.
words = (" ".join(corpus)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

### Get Shared TF-IDF Feature

In [20]:
stops = set(stopwords.words("english"))

def tfidf_word_match_share(row):
    """Weighted share of non-stop words that two questions have in common.

    Each distinct lower-cased, whitespace-separated non-stop word of
    ``row['question1']`` and ``row['question2']`` is looked up in the global
    ``weights`` table (unknown words weigh 0). The result is the summed weight
    of the shared words (counted once per question) divided by the summed
    weight of all words of both questions.

    Parameters
    ----------
    row : mapping
        Must provide ``'question1'`` and ``'question2'``.

    Returns
    -------
    number
        The ratio, or ``0`` when either question has no non-stop word or when
        no word carries any weight.
    """
    # dict.fromkeys keeps first-seen order while de-duplicating the words.
    q1words = dict.fromkeys(
        word for word in str(row['question1']).lower().split() if word not in stops)
    q2words = dict.fromkeys(
        word for word in str(row['question2']).lower().split() if word not in stops)
    if len(q1words) == 0 or len(q2words) == 0:
        return 0

    shared_weights = ([weights.get(w, 0) for w in q1words if w in q2words]
                      + [weights.get(w, 0) for w in q2words if w in q1words])
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    total = np.sum(total_weights)
    if total == 0:
        # Every word is rare/unknown (weight 0): the original divided 0 by 0
        # here, producing NaN and a NumPy RuntimeWarning.
        return 0
    return np.sum(shared_weights) / total

# Apply tf-idf function to train and validation sets.
# raw=True has been dropped: with it pandas hands each row to the function as a
# plain NumPy ndarray, which cannot be indexed with row['question1'].
tdif_train_data = train_df.apply(tfidf_word_match_share, axis=1)
tdif_validation_data = validation_data.apply(tfidf_word_match_share, axis=1)
train_df['tf-idf'] = tdif_train_data
validation_data['tf-idf'] = tdif_validation_data



### Adding Other Features

In [21]:
def char_count_difference(df):
    """Absolute difference between ``char_counts_q1`` and ``char_counts_q2``."""
    signed_difference = df['char_counts_q1'] - df['char_counts_q2']
    return abs(signed_difference)

def word_count_difference(df):
    """Absolute difference between ``word_counts_q1`` and ``word_counts_q2``."""
    signed_difference = df['word_counts_q1'] - df['word_counts_q2']
    return abs(signed_difference)

def first_word_same(df):
    """Flag rows whose two questions start with the same word (case-sensitive).

    Questions are converted with ``str()`` and split on whitespace. A question
    without any token (empty or whitespace-only) never matches; the original
    raised ``IndexError`` on such a row.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df.index``.
    """
    q1_tokens = df['question1'].apply(lambda x: str(x).split())
    q2_tokens = df['question2'].apply(lambda x: str(x).split())
    is_same = [bool(q1) and bool(q2) and q1[0] == q2[0]
               for q1, q2 in zip(q1_tokens, q2_tokens)]
    return pd.Series(is_same, index=df.index, dtype=bool)

def last_word_same(df):
    """Flag rows whose two questions end with the same word (case-sensitive).

    Questions are converted with ``str()`` and split on whitespace. A question
    without any token (empty or whitespace-only) never matches; the original
    raised ``IndexError`` on such a row.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df.index``.
    """
    q1_tokens = df['question1'].apply(lambda x: str(x).split())
    q2_tokens = df['question2'].apply(lambda x: str(x).split())
    is_same = [bool(q1) and bool(q2) and q1[-1] == q2[-1]
               for q1, q2 in zip(q1_tokens, q2_tokens)]
    return pd.Series(is_same, index=df.index, dtype=bool)

def first_last_same(df):
    """Flag rows whose questions share BOTH their first and their last word.

    Questions are converted with ``str()`` and split on whitespace; the
    comparison is case-sensitive. A question without any token (empty or
    whitespace-only) never matches; the original raised ``IndexError`` on such
    a row.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df.index``.
    """
    q1_tokens = df['question1'].apply(lambda x: str(x).split())
    q2_tokens = df['question2'].apply(lambda x: str(x).split())
    both_same = [bool(q1) and bool(q2) and q1[0] == q2[0] and q1[-1] == q2[-1]
                 for q1, q2 in zip(q1_tokens, q2_tokens)]
    return pd.Series(both_same, index=df.index, dtype=bool)
    
def intersection(df):
    """Count the distinct words shared by ``q1_clean`` and ``q2_clean`` per row.

    Returns a plain list with one integer per row of ``df``.
    """
    shared_counts = []
    for q1, q2 in zip(df['q1_clean'], df['q2_clean']):
        common_words = set(str(q1).split()) & set(str(q2).split())
        shared_counts.append(len(common_words))
    return shared_counts

def intersection_raw(df):
    """Count the distinct words shared by the raw ``question1``/``question2``.

    Words are whitespace-separated and compared case-sensitively. Returns a
    plain list with one integer per row of ``df``.
    """
    token_sets_q1 = (set(str(question).split()) for question in df['question1'])
    token_sets_q2 = (set(str(question).split()) for question in df['question2'])
    return [len(q1 & q2) for q1, q2 in zip(token_sets_q1, token_sets_q2)]

def intersection_q1(df):
    """Shared-word count divided by the number of words in ``q1_clean``.

    The numerator counts distinct shared words between ``q1_clean`` and
    ``q2_clean``; the denominator counts all tokens of ``q1_clean``
    (duplicates included). Returns a float Series, one value per row.
    """
    tokens_q1 = df['q1_clean'].apply(lambda x: str(x).split())
    tokens_q2 = df['q2_clean'].apply(lambda x: str(x).split())
    shared_counts = [len(set(q1) & set(q2)) for q1, q2 in zip(tokens_q1, tokens_q2)]
    q1_lengths = tokens_q1.apply(len)
    return shared_counts / q1_lengths.astype(float)

def intersection_q2(df):
    """Shared-word count divided by the number of words in ``q2_clean``.

    The numerator counts distinct shared words between ``q1_clean`` and
    ``q2_clean``; the denominator counts all tokens of ``q2_clean``
    (duplicates included). Returns a float Series, one value per row.
    """
    tokens_q1 = df['q1_clean'].apply(lambda x: str(x).split())
    tokens_q2 = df['q2_clean'].apply(lambda x: str(x).split())
    shared_counts = [len(set(q1) & set(q2)) for q1, q2 in zip(tokens_q1, tokens_q2)]
    q2_lengths = tokens_q2.apply(len)
    return shared_counts / q2_lengths.astype(float)

def intersection_mean(df):
    """Shared-word count relative to the mean length of the two raw questions.

    For every row this is ``2 * shared / (len(q1) + len(q2))``, where
    ``shared`` is the number of distinct whitespace-separated words common to
    ``question1`` and ``question2`` and the lengths count all tokens.
    Returns a plain list with one float per row.
    """
    tokens_q1 = df['question1'].apply(lambda x: str(x).split())
    tokens_q2 = df['question2'].apply(lambda x: str(x).split())
    # Combined token count per row, as a NumPy array so each element is read
    # once instead of through a per-row .iloc lookup.
    combined_lengths = (tokens_q1.apply(len) + tokens_q2.apply(len)).values
    shared_counts = (len(set(q1) & set(q2)) for q1, q2 in zip(tokens_q1, tokens_q2))
    return [float(shared * 2) / total for shared, total in zip(shared_counts, combined_lengths)]

def same_sentence(df):
    """Flag rows whose cleaned questions consist of the same token sequence.

    ``q1_clean`` and ``q2_clean`` are split on whitespace, so differences in
    spacing are ignored. Returns a boolean Series, one value per row.
    """
    def tokenize(sentence):
        return str(sentence).split()

    tokens_q1 = df['q1_clean'].map(tokenize)
    tokens_q2 = df['q2_clean'].map(tokenize)
    return tokens_q1 == tokens_q2

def same_sentence_raw(df):
    """Flag rows whose raw questions consist of the same token sequence.

    ``question1`` and ``question2`` are split on whitespace and compared
    case-sensitively, so only differences in spacing are ignored.
    Returns a boolean Series, one value per row.
    """
    def tokenize(sentence):
        return str(sentence).split()

    tokens_q1 = df['question1'].map(tokenize)
    tokens_q2 = df['question2'].map(tokenize)
    return tokens_q1 == tokens_q2

def full_stop_count(df):
    """Flag rows where both ``question1`` and ``question2`` contain a '.'.

    Despite the name this returns booleans, not counts: a plain list with one
    bool per row.
    """
    q1_has_stop = ['.' in str(question) for question in df['question1']]
    q2_has_stop = ['.' in str(question) for question in df['question2']]
    return [in_q1 and in_q2 for in_q1, in_q2 in zip(q1_has_stop, q2_has_stop)]

def full_stop_q1(question):
    """Return how many '.' characters ``str(question)`` contains."""
    return str(question).count('.')

def full_stop_q2(question):
    """Return how many '.' characters ``str(question)`` contains.

    Same computation as ``full_stop_q1``; kept as a separate name for the
    question-2 column.
    """
    return str(question).count('.')

def start_capital_letter(df):
    """Flag rows where both questions start with an upper-case character.

    ``question1`` and ``question2`` are converted with ``str()``. The first
    character is taken with a slice, so an empty string counts as "not
    capitalized"; the original indexed ``[0]`` and raised ``IndexError`` there.

    Returns a plain list with one bool per row.
    """
    q1_caps = [str(question)[:1].isupper() for question in df['question1']]
    q2_caps = [str(question)[:1].isupper() for question in df['question2']]
    return [q1_cap and q2_cap for q1_cap, q2_cap in zip(q1_caps, q2_caps)]

def start_cap_q1(df):
    """Return, per row, whether ``question1`` starts with an upper-case character.

    Values are converted with ``str()``. The first character is taken with a
    slice, so an empty string yields False instead of the ``IndexError`` the
    original ``[0]`` lookup raised. Returns a plain list of bools.
    """
    return [str(question)[:1].isupper() for question in df['question1']]

def start_cap_q2(df):
    """Return, per row, whether ``question2`` starts with an upper-case character.

    Values are converted with ``str()``. The first character is taken with a
    slice, so an empty string yields False instead of the ``IndexError`` the
    original ``[0]`` lookup raised. Returns a plain list of bools.
    """
    return [str(question)[:1].isupper() for question in df['question2']]

def get_word_freq(question, target_word):
    """Count how often ``target_word`` occurs as a whole token in a question.

    The question is converted with ``str()``, lower-cased and split on
    whitespace; ``target_word`` is compared as given, so it should be
    lower-case. Used for words like 'what', 'how', etcetera.
    """
    tokens = str(question).lower().split()
    return tokens.count(target_word)

def get_n2_grams(questions):
    """Count the distinct word bigrams shared by the two questions of a row.

    ``questions`` must provide ``'question1'`` and ``'question2'``; both are
    converted with ``str()``, lower-cased and split on whitespace.
    """
    tokens_q1 = str(questions['question1']).lower().split()
    tokens_q2 = str(questions['question2']).lower().split()
    shared_bigrams = set(bigrams(tokens_q1)) & set(bigrams(tokens_q2))
    return len(shared_bigrams)

def get_n3_grams(questions):
    """Count the distinct word trigrams shared by the two questions of a row.

    ``questions`` must provide ``'question1'`` and ``'question2'``; both are
    converted with ``str()``, lower-cased and split on whitespace.
    """
    tokens_q1 = str(questions['question1']).lower().split()
    tokens_q2 = str(questions['question2']).lower().split()
    shared_trigrams = set(ngrams(tokens_q1, 3)) & set(ngrams(tokens_q2, 3))
    return len(shared_trigrams)


### Create Features for Training/Validation

In [23]:
def create_features(df):
    """Add the hand-crafted question-pair feature columns to ``df``.

    ``df`` must already contain ``question1``, ``question2``, ``q1_clean`` and
    ``q2_clean``. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Question-pair frame to extend.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the feature columns added.
    """
    # Duplicated q's? (True for every repeat of a question text after its first occurrence)
    df['duplicated_q1'] = df['question1'].duplicated()
    df['duplicated_q2'] = df['question2'].duplicated()
    df['both_duplicated'] = df['duplicated_q1'].astype(bool) & df['duplicated_q2'].astype(bool)

    # Character counts (spaces excluded) and word counts
    df['char_counts_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
    df['char_counts_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
    df['word_counts_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
    df['word_counts_q2'] = df['question2'].apply(lambda x: len(str(x).split()))

    # Get number of how's, why's, where's etcetera: per word a count for each
    # question plus a flag telling whether BOTH questions contain it.
    # The original 'both_what' compared q1_what with itself, so it was really
    # "question1 contains 'what'"; the loop applies the q1/q2 test uniformly.
    for word in ('what', 'how', 'where', 'why', 'when', 'who', 'which', 'i'):
        q1_col, q2_col = 'q1_' + word, 'q2_' + word
        df[q1_col] = df['question1'].apply(lambda x, w=word: get_word_freq(x, w))
        df[q2_col] = df['question2'].apply(lambda x, w=word: get_word_freq(x, w))
        df['both_' + word] = (df[q1_col].astype(int) >= 1) & (df[q2_col].astype(int) >= 1)

    # Average amount of characters per word
    df['avg_word_length_q1'] = df['char_counts_q1'] / df['word_counts_q1']
    df['avg_word_length_q2'] = df['char_counts_q2'] / df['word_counts_q2']
    df['avg_word_diff'] = abs(df['avg_word_length_q1'] - df['avg_word_length_q2'])

    # Character count difference and word count difference
    df['char_count_dif'] = char_count_difference(df)
    df['word_count_dif'] = word_count_difference(df)

    # Average word ratio (characters per word over both questions together)
    df['char_word_ratio'] = (df['char_counts_q1'] + df['char_counts_q2']) / (df['word_counts_q1'] + df['word_counts_q2'])

    # Same first and last words
    df['first_word_same'] = first_word_same(df)
    df['last_word_same'] = last_word_same(df)
    df['first_last_same'] = first_last_same(df)

    # Same sentence
    df['same_sentence'] = same_sentence(df)
    df['same_sentence_raw'] = same_sentence_raw(df)

    # Intersections
    df['intersection_raw'] = intersection_raw(df)
    df['intersection'] = intersection(df)
    df['intersection_q1'] = intersection_q1(df)
    df['intersection_q2'] = intersection_q2(df)
    df['intersection_mean'] = intersection_mean(df)

    # Full stops
    df['full_stop_count'] = full_stop_count(df)
    df['q1_stops'] = df['question1'].apply(full_stop_q1)
    df['q2_stops'] = df['question2'].apply(full_stop_q2)

    # Starting capital letters
    df['both_start_capital'] = start_capital_letter(df)
    df['q1_caps'] = start_cap_q1(df)
    df['q2_caps'] = start_cap_q2(df)

    # Ngrams (row-wise because each needs both questions of the row)
    df['grams2_shared'] = df.apply(get_n2_grams, axis=1)
    df['grams3_shared'] = df.apply(get_n3_grams, axis=1)

    return df

# Build the feature columns for the training frame first, then for the
# validation frame.
train_df, validation_data = create_features(train_df), create_features(validation_data)



### Train/Validation Split

In [None]:
# 80/20 split of train/validation data.
# random_state pins the shuffle so the split (and everything trained on it,
# including the early-stopping round) is the same on every run.
model_train, model_test = train_test_split(train_df, test_size=0.2, random_state=42)
# Rows with a missing value in any column are dropped from both parts.
train_data = model_train.dropna()
test_data = model_test.dropna()

In [None]:
# Put independent variables here
independents = ['char_counts_q1', 'char_counts_q2', 'word_counts_q1', 'word_counts_q2', 'avg_word_length_q1',
                'avg_word_length_q2', 'avg_word_diff', 'char_count_dif', 'word_count_dif', 'first_word_same', 
                'last_word_same', 'first_last_same','same_sentence', 'same_sentence_raw', 'intersection',
                'intersection_raw','intersection_q1', 'intersection_q2', 'intersection_mean', 'tf-idf',
               'char_word_ratio', 'full_stop_count', 'q1_stops', 'q2_stops', 'both_start_capital', 'q1_caps',
               'q2_caps', 'q1_what', 'q2_what', 'both_what', 'q1_how', 'q2_how', 'both_how', 'q1_where',
               'q2_where', 'both_where', 'q1_why', 'q2_why', 'both_why', 'q1_when', 'q2_when', 'both_when',
               'q1_who', 'q2_who', 'both_who', 'q1_which', 'q2_which', 'both_which', 'grams2_shared', 'grams3_shared']

# DataFrame.as_matrix() no longer exists in current pandas; .values is the
# equivalent that works across versions. The feature columns mix bool, int and
# float, so they are cast to float first to get one numeric array instead of
# an object-dtype one.
X_train, y_train = train_data[independents].astype(float).values, train_data['is_duplicate'].values
X_test, y_test = test_data[independents].astype(float).values, test_data['is_duplicate'].values

### Define and Train XGB Model

In [None]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 8

d_train = xgb.DMatrix(X_train, label=y_train, feature_names=independents)
d_valid = xgb.DMatrix(X_test, label=y_test, feature_names=independents)

# Evaluation sets reported during training. The cell used to start with
# `watchlist = [(X_test, y_test)]`, which was overwritten here before ever
# being used; that dead assignment is gone.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 1500 boosting rounds, stopping once the last watchlist entry ('valid')
# has not improved for 30 rounds; progress is printed every 50 rounds.
bst = xgb.train(params, d_train, 1500, watchlist, early_stopping_rounds=30, verbose_eval=50)

### Predict Test Labels with XGB-model

In [None]:
# DataFrame.as_matrix() no longer exists in current pandas; cast the mixed
# bool/int/float feature columns to float and take the NumPy values instead.
x_validation = validation_data[independents].astype(float).values
d_test = xgb.DMatrix(x_validation, feature_names=independents)

# Predict with the trees up to the best early-stopping round only.
# `ntree_limit` / `best_ntree_limit` were replaced by `iteration_range` in
# newer xgboost releases; fall back to the old keyword where the new one is
# not accepted.
try:
    p_test = bst.predict(d_test, iteration_range=(0, bst.best_iteration + 1))
except TypeError:
    p_test = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)

# Submission file: hard 0/1 labels using a 0.5 probability threshold.
sub = pd.DataFrame()
sub['test_id'] = validation_data['test_id']
sub['is_duplicate'] = p_test
sub['is_duplicate'] = sub['is_duplicate'].apply(lambda x: 1 if x > 0.5 else 0)
sub.to_csv('Data/xgbBoost.csv', index=False)

### Feature Importance Graph

In [None]:
# Get F-scores of features and put relative importance in df
importance = bst.get_fscore()
importance = sorted(importance.items())
importance_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
importance_df['fscore'] = importance_df['fscore'] / importance_df['fscore'].sum()
importance_df = importance_df.sort_values('fscore', ascending=True)[-10:]

# Plot top 10 most important features.
# The axes are created explicitly and handed to pandas: calling plt.figure()
# first and then DataFrame.plot(figsize=...) opened a second figure and left
# the first one behind as an empty plot.
fig, ax = plt.subplots(figsize=(6, 10))
importance_df.plot(kind='barh', x='feature', y='fscore', legend=False, ax=ax)
ax.set_title('Relative Importance of Features XGBoost')
ax.set_xlabel('Relative Importance')
plt.show()

## Some Statistics and Plots

### Train Data and Test Data Length

In [None]:
# Get number of question pairs in the training CSV and in the test CSV.
# The test CSV is bound to `validation_data` by read_data(); `test_data` (used
# here before) is the 20% hold-out slice of the training frame created by the
# train/validation split, so it reported the wrong set.
question_pairs_train = len(train_df)
print(question_pairs_train, ': Length of train-data')
question_pairs_test = len(validation_data)
print(question_pairs_test, ': Length of test-data')

### Duplicates/Non-Duplicates Plot

In [None]:
# Bar chart of how many training pairs are labelled duplicate vs not.
sns.set(style="darkgrid")
ax = sns.countplot(x="is_duplicate", data=train_labels)
ax.set_title('Number of Duplicates vs Non-Duplicates')
plt.show()