## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [8]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

## Load the data with features (pickled)

In [None]:
train = pd.read_pickle('../features/train.pkl')
test = pd.read_pickle('../features/test.pkl')

In [None]:
train.head()

## Clean Data

In [9]:
import re
from nltk.corpus import stopwords

In [10]:
def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    stops_words = set(stopwords.words("english"))
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in stops_words:
            sentence.remove(word)  
            
    sentence = " ".join(sentence)
    return sentence

def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data.dropna(how="any")
    
    for col in ['question1', 'question2']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

train_clean = clean_dataframe(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Data Processing

In [11]:
import math
import gc

from collections import Counter
from multiprocessing import Pool
from simhash import Simhash

from nltk import tokenize
import nltk

stops = set(stopwords.words("english"))

from gensim.models import word2vec

from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import ADJ, ADJ_SAT, ADV, NOUN, VERB
from nltk.stem import WordNetLemmatizer

In [12]:
def transform_data(data):
    return data.apply(apply_func, axis=1, raw=True)

def chunk(data, num):   
    chunk_size = math.ceil(len(data) / num)
    return [data[i*chunk_size : (i+1)*chunk_size] for i in range(num)]

def pool_apply(data, proc_num=8):
    
    with Pool(processes=proc_num) as pool:
        chunks = chunk(data, proc_num) 
        proccessed_chunks = list(pool.map(transform_data, chunks))
  
    return np.hstack(tuple(proccessed_chunks))

In [13]:
def word_match_share(row):
    stops_words = set(stopwords.words("english"))
    
    q1words = {}
    q2words = {}
    
    for word in str(row['question1']).lower().split():
        if word not in stops_words:
            q1words[word] = 1
            
    for word in str(row['question2']).lower().split():
        if word not in stops_words:
            q2words[word] = 1
            
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def tfidf_word_match_share(row):
    stops_words = set(stopwords.words("english"))
    
    q1words = {}
    q2words = {}
    
    for word in str(row['question1']).lower().split():
        if word not in stops_words:
            q1words[word] = 1
            
    for word in str(row['question2']).lower().split():
        if word not in stops_words:
            q2words[word] = 1
            
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    with np.errstate(invalid='ignore'):
        shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [
            weights.get(w, 0) for w in q2words.keys() if w in q1words]
        
        total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

        R = np.sum(shared_weights) / np.sum(total_weights)

    return R if not math.isnan(R) else 0


def start_with_same_first_word(row):
    if not isinstance(row['question1'], str) or not isinstance(row['question2'], str):
        return 0
    
    first_word_q1 = row['question1'].split()[0].lower()
    first_word_q2 = row['question2'].split()[0].lower()
    
    return 1 if first_word_q1 == first_word_q2 else 0

def question_length(row):
    question = row[feature]
    return len(question) if isinstance(question, str) else 0

def word_count(row):
    question = row[feature]
    return len(question.split()) if isinstance(question, str) else 0


# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def simhash_distance_seq(row):
    q1 = row['question1']
    q2 = row['question2']
    
    if not isinstance(q1, str) or not isinstance(q2, str):
        return 0

    return Simhash(q1).distance(Simhash(q2))

def simhash_distance_shingle(row):
    q1 = row['question1']
    q2 = row['question2']
    
    if not isinstance(q1, str) or not isinstance(q2, str):
        return 0
    
    q1_shingles = get_singles(q1)
    q2_shingles = get_singles(q2)
    
    return Simhash(q1_shingles).distance(Simhash(q2_shingles))

def get_singles(sequence, width = 3):
    sequence = sequence.lower()
    sequence = re.sub(r'[^\w]+', '', sequence)
    return [sequence[i:i + width] for i in range(max(len(sequence) - width + 1, 1))]



def get_common_unigrams(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_unigrams = set([i for i in nltk.ngrams(question1, 1)])
    q2_unigrams = set([i for i in nltk.ngrams(question2, 1)])
    return len( q1_unigrams.intersection(q2_unigrams))

def get_common_unigram_ratio(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_unigrams = set([i for i in nltk.ngrams(question1, 1)])
    q2_unigrams = set([i for i in nltk.ngrams(question2, 1)])
    unigram_count = float(row["unigrams_common_count"])
               
    return  unigram_count / max(len(q1_unigrams.union(q2_unigrams)),1)

def get_common_bigrams(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_bigrams = set([i for i in nltk.ngrams(question1, 2)])
    q2_bigrams = set([i for i in nltk.ngrams(question2, 2)])
    return len(q1_bigrams.intersection(q2_bigrams))

def get_common_bigram_ratio(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_bigrams = set([i for i in nltk.ngrams(question1, 2)])
    q2_bigrams = set([i for i in nltk.ngrams(question2, 2)])
    bigram_count = float(row["bigrams_common_count"])
               
    return  bigram_count / max(len(q1_bigrams.union(q2_bigrams)),1)

def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for col in ['question1', 'question2']:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)
            
    return corpus

def question_to_word2vec(question_string, word2vec_model):
    """
    Given question string, returns word2vec vector of the questions tring
    :param question_string : The given question as a string.
    """
    stops_words = set(stopwords.words("english"))
    
    if not isinstance(question_string, str):
        return 0
    
    words = word_tokenize(question_string)[:-1]
    non_stop_words = []
    for word in words:
        if word.lower().strip('-') not in stops_words:
            word = WordNetLemmatizer().lemmatize(word, NOUN)
            
            if word.lower() in word2vec_model.wv:
                non_stop_words.append(word.lower().strip('-'))
            
    if len(non_stop_words) == 0:
        return 0
    
    vectors = [word2vec_model.wv[word] for word in non_stop_words]
    vector = sum(vectors)/float(len(non_stop_words))
    
    return vector

def numpy_cosine(row):
    """
    Cosine similarity between q1 and q2 question instances using their vectors
    :return: similarity between q1 and q2
    """
    q1, q2 = row['question1'], row['question2']
    q1_vec, q2_vec = question_to_word2vec(q1, word2vec_model), question_to_word2vec(q2, word2vec_model)
    
    with np.errstate(invalid='ignore'):
        cosine_similarity = np.dot(q1_vec, q2_vec) / (np.linalg.norm(q1_vec) * np.linalg.norm(q2_vec))
    
    return cosine_similarity if isinstance(cosine_similarity, np.float32) else 0.0

def mean_word2vec(row):
    question = row[feature]
    return np.mean(question_to_word2vec(question, word2vec_model))

# Run clean data in the first part of the notebook to generate the train_clean dataset
corpus = build_corpus(train_clean)  

In [None]:
word2vec_model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=500, workers=8)
word2vec_model.wv['trump']

In [None]:
eps = 5000 
train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [None]:
train[train['is_duplicate']==1][:20][['question1', 'question2']]

### Generate training features

In [None]:
# Apply_func always MUST be defined above pool_apply, cheers!    
apply_func = word_match_share
train['word_share'] = pool_apply(train)

apply_func = start_with_same_first_word
train['start_with_same_world'] = pool_apply(train)

feature = 'question1'
apply_func = question_length
train['q1_char_num'] = pool_apply(train)

feature = 'question2'
apply_func = question_length
train['q2_char_num'] = pool_apply(train)

feature = 'question1'
apply_func = word_count
train['q1_word_num'] = pool_apply(train)

feature = 'question2'
apply_func = word_count
train['q2_word_num'] = pool_apply(train)

apply_func = tfidf_word_match_share
train['rfidf_share'] = pool_apply(train)

train['char_difference'] = abs(train['q1_char_num'] - train['q2_char_num'])
train['word_difference'] = abs(train['q1_word_num'] - train['q2_word_num'])

apply_func = simhash_distance_seq
train['seq_simhash_distance'] = pool_apply(train)

apply_func = simhash_distance_shingle
train['shingle_simhash_distance'] = pool_apply(train)

train['avg_word_len_q1'] = train['q1_char_num'] / (train['q1_word_num'] + 10e-4)
train['avg_word_len_q2'] = train['q2_char_num'] / (train['q2_word_num'] + 10e-4)
train['avg_word_difference'] = abs(train['avg_word_len_q1'] - train['avg_word_len_q2'])

apply_func = get_common_unigrams
train['unigrams_common_count'] = pool_apply(train)

apply_func = get_common_bigrams
train['bigrams_common_count'] = pool_apply(train)

apply_func = get_common_unigram_ratio
train['unigrams_common_ratio'] = pool_apply(train)

apply_func = get_common_bigram_ratio
train['bigrams_common_ratio'] = pool_apply(train)

apply_func = numpy_cosine
train['cosin_sim'] = pool_apply(train, proc_num=8)

feature = 'question1'
apply_func = mean_word2vec
train['word2vec_q1_mean'] = pool_apply(train, proc_num=4)

feature = 'question2'
apply_func = mean_word2vec
train['word2vec_q2_mean'] = pool_apply(train, proc_num=4)

In [None]:
train.to_pickle('../features/train.pkl')
gc.collect()

### Genearte test features

In [None]:
apply_func = start_with_same_first_word
test['start_with_same_world'] = pool_apply(test)

apply_func = word_match_share
test['word_share'] = pool_apply(test)

feature = 'question1'
apply_func = question_length
test['q1_char_num'] = pool_apply(test)

feature = 'question2'
apply_func = question_length
test['q2_char_num'] = pool_apply(test)

feature = 'question1'
apply_func = word_count
test['q1_word_num'] = pool_apply(test)

feature = 'question2'
apply_func = word_count
test['q2_word_num'] = pool_apply(test)

apply_func = tfidf_word_match_share
test['rfidf_share'] = pool_apply(test)

test['char_difference'] = abs(test['q1_char_num'] - test['q2_char_num'])
test['word_difference'] = abs(test['q1_word_num'] - test['q2_word_num'])

apply_func = simhash_distance_seq
test['seq_simhash_distance'] = pool_apply(test)

apply_func = simhash_distance_shingle
test['shingle_simhash_distance'] = pool_apply(test)

apply_func = simhash_distance_seq
test['seq_simhash_distance'] = pool_apply(test)

apply_func = simhash_distance_shingle
test['shingle_simhash_distance'] = pool_apply(test)

test['avg_word_len_q1'] = test['q1_char_num'] / (test['q1_word_num'] + 10e-4)
test['avg_word_len_q2'] = test['q2_char_num'] / (test['q2_word_num'] + 10e-4)
test['avg_word_difference'] = abs(test['avg_word_len_q1'] - test['avg_word_len_q2'])

apply_func = get_common_unigrams
test['unigrams_common_count'] = pool_apply(test)

apply_func = get_common_bigrams
test['bigrams_common_count'] = pool_apply(test)

apply_func = get_common_unigram_ratio
test['unigrams_common_ratio'] = pool_apply(test)

apply_func = get_common_bigram_ratio
test['bigrams_common_ratio'] = pool_apply(test)

# This function takes a lot of RAM and it scales with the number of processes

apply_func = numpy_cosine
test['cosin_sim'] = pool_apply(test, proc_num=4) 

feature = 'question1'
apply_func = mean_word2vec
test['word2vec_q1_mean'] = pool_apply(test, proc_num=4)

feature = 'question2'
apply_func = mean_word2vec
test['word2vec_q2_mean'] = pool_apply(test, proc_num=4)

In [22]:
test.to_pickle('../features/test.pkl')
gc.collect()