## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
train = pd.read_csv('../input/train.csv')

In [3]:
test = pd.read_csv('../input/test.csv')

In [3]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Clean Data

In [None]:
import re
from nltk.corpus import stopwords

In [None]:
def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    stops_words = set(stopwords.words("english"))
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in stops_words:
            sentence.remove(word)  
            
    sentence = " ".join(sentence)
    return sentence

def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data.dropna(how="any")
    
    for col in ['question1', 'question2']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

train_clean = clean_dataframe(train)

## Data Processing

In [None]:
import math
import gc

from collections import Counter
from multiprocessing import Pool
from simhash import Simhash

from nltk import tokenize
import nltk

stops = set(stopwords.words("english"))

In [None]:
def transform_data(data):
    return data.apply(apply_func, axis=1, raw=True)

def chunk(data, num):   
    chunk_size = math.ceil(len(data) / num)
    return [data[i*chunk_size : (i+1)*chunk_size] for i in range(num)]

def pool_apply(data, proc_num=8):
    
    with Pool(processes=proc_num) as pool:
        chunks = chunk(data, proc_num) 
        proccessed_chunks = list(pool.map(transform_data, chunks))
  
    return np.hstack(tuple(proccessed_chunks))

In [None]:
def word_match_share(row):
    stops_words = set(stopwords.words("english"))
    
    q1words = {}
    q2words = {}
    
    for word in str(row['question1']).lower().split():
        if word not in stops_words:
            q1words[word] = 1
            
    for word in str(row['question2']).lower().split():
        if word not in stops_words:
            q2words[word] = 1
            
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def tfidf_word_match_share(row):
    stops_words = set(stopwords.words("english"))
    
    q1words = {}
    q2words = {}
    
    for word in str(row['question1']).lower().split():
        if word not in stops_words:
            q1words[word] = 1
            
    for word in str(row['question2']).lower().split():
        if word not in stops_words:
            q2words[word] = 1
            
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    with np.errstate(invalid='ignore'):
        shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [
            weights.get(w, 0) for w in q2words.keys() if w in q1words]
        
        total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

        R = np.sum(shared_weights) / np.sum(total_weights)

    return R if not math.isnan(R) else 0


def start_with_same_first_word(row):
    if not isinstance(row['question1'], str) or not isinstance(row['question2'], str):
        return 0
    
    first_word_q1 = row['question1'].split()[0].lower()
    first_word_q2 = row['question2'].split()[0].lower()
    
    return 1 if first_word_q1 == first_word_q2 else 0

def question_length(row):
    question = row[feature]
    return len(question) if isinstance(question, str) else 0

def word_count(row):
    question = row[feature]
    return len(question.split()) if isinstance(question, str) else 0


# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def simhash_distance_seq(row):
    q1 = row['question1']
    q2 = row['question2']
    
    if not isinstance(q1, str) or not isinstance(q2, str):
        return 0

    return Simhash(q1).distance(Simhash(q2))

def simhash_distance_shingle(row):
    q1 = row['question1']
    q2 = row['question2']
    
    if not isinstance(q1, str) or not isinstance(q2, str):
        return 0
    
    q1_shingles = get_singles(q1)
    q2_shingles = get_singles(q2)
    
    return Simhash(q1_shingles).distance(Simhash(q2_shingles))

def get_singles(sequence, width = 3):
    sequence = sequence.lower()
    sequence = re.sub(r'[^\w]+', '', sequence)
    return [sequence[i:i + width] for i in range(max(len(sequence) - width + 1, 1))]



def get_common_unigrams(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_unigrams = set([i for i in nltk.ngrams(question1, 1)])
    q2_unigrams = set([i for i in nltk.ngrams(question2, 1)])
    return len( q1_unigrams.intersection(q2_unigrams))

def get_common_unigram_ratio(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_unigrams = set([i for i in nltk.ngrams(question1, 1)])
    q2_unigrams = set([i for i in nltk.ngrams(question2, 1)])
    unigram_count = float(row["unigrams_common_count"])
               
    return  unigram_count / max(len(q1_unigrams.union(q2_unigrams)),1)

def get_common_bigrams(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_bigrams = set([i for i in nltk.ngrams(question1, 2)])
    q2_bigrams = set([i for i in nltk.ngrams(question2, 2)])
    return len(q1_bigrams.intersection(q2_bigrams))

def get_common_bigram_ratio(row):
    question1 = str(row['question1'])
    question2 = str(row['question2'])
    
    q1_bigrams = set([i for i in nltk.ngrams(question1, 2)])
    q2_bigrams = set([i for i in nltk.ngrams(question2, 2)])
    bigram_count = float(row["bigrams_common_count"])
               
    return  bigram_count / max(len(q1_bigrams.union(q2_bigrams)),1)

In [None]:
eps = 5000 
train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [None]:
train[train['is_duplicate']==1][:20][['question1', 'question2']]

In [None]:
# Apply_func always MUST be defined above pool_apply, cheers!    
apply_func = word_match_share
train['word_share'] = pool_apply(train)

apply_func = start_with_same_first_word
train['start_with_same_world'] = pool_apply(train)

feature = 'question1'
apply_func = question_length
train['q1_char_num'] = pool_apply(train)

feature = 'question2'
apply_func = question_length
train['q2_char_num'] = pool_apply(train)

feature = 'question1'
apply_func = word_count
train['q1_word_num'] = pool_apply(train)

feature = 'question2'
apply_func = word_count
train['q2_word_num'] = pool_apply(train)

apply_func = tfidf_word_match_share
train['rfidf_share'] = pool_apply(train)

train['char_difference'] = abs(train['q1_char_num'] - train['q2_char_num'])
train['word_difference'] = abs(train['q1_word_num'] - train['q2_word_num'])

apply_func = simhash_distance_seq
train['seq_simhash_distance'] = pool_apply(train)

apply_func = simhash_distance_shingle
train['shingle_simhash_distance'] = pool_apply(train)

train['avg_word_len_q1'] = train['q1_char_num'] / (train['q1_word_num'] + 10e-4)
train['avg_word_len_q2'] = train['q2_char_num'] / (train['q2_word_num'] + 10e-4)
train['avg_word_difference'] = abs(train['avg_word_len_q1'] - train['avg_word_len_q2'])

apply_func = get_common_unigrams
train['unigrams_common_count'] = pool_apply(train)

apply_func = get_common_bigrams
train['bigrams_common_count'] = pool_apply(train)

apply_func = get_common_unigram_ratio
train['unigrams_common_ratio'] = pool_apply(train)

apply_func = get_common_bigram_ratio
train['bigrams_common_ratio'] = pool_apply(train)

In [None]:
gc.collect()

In [None]:
apply_func = start_with_same_first_word
test['start_with_same_world'] = pool_apply(test)

apply_func = word_match_share
test['word_share'] = pool_apply(test)

feature = 'question1'
apply_func = question_length
test['q1_char_num'] = pool_apply(test)

feature = 'question2'
apply_func = question_length
test['q2_char_num'] = pool_apply(test)

feature = 'question1'
apply_func = word_count
test['q1_word_num'] = pool_apply(test)

feature = 'question2'
apply_func = word_count
test['q2_word_num'] = pool_apply(test)

apply_func = tfidf_word_match_share
test['rfidf_share'] = pool_apply(test)

test['char_difference'] = abs(test['q1_char_num'] - test['q2_char_num'])
test['word_difference'] = abs(test['q1_word_num'] - test['q2_word_num'])

apply_func = simhash_distance_seq
test['seq_simhash_distance'] = pool_apply(test)

apply_func = simhash_distance_shingle
test['shingle_simhash_distance'] = pool_apply(test)

apply_func = simhash_distance_seq
test['seq_simhash_distance'] = pool_apply(test)

apply_func = simhash_distance_shingle
test['shingle_simhash_distance'] = pool_apply(test)

test['avg_word_len_q1'] = test['q1_char_num'] / (test['q1_word_num'] + 10e-4)
test['avg_word_len_q2'] = test['q2_char_num'] / (test['q2_word_num'] + 10e-4)
test['avg_word_difference'] = abs(test['avg_word_len_q1'] - test['avg_word_len_q2'])

apply_func = get_common_unigrams
test['unigrams_common_count'] = pool_apply(test)

apply_func = get_common_bigrams
test['bigrams_common_count'] = pool_apply(test)

apply_func = get_common_unigram_ratio
test['unigrams_common_ratio'] = pool_apply(test)

apply_func = get_common_bigram_ratio
test['bigrams_common_ratio'] = pool_apply(test)

In [12]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,start_with_same_world,q1_char_num,q2_char_num,q1_word_num,q2_word_num,rfidf_share,char_difference,word_difference,seq_simhash_distance,shingle_simhash_distance,avg_word_len_q1,avg_word_len_q2,avg_word_difference
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.727273,1,66,57,14,12,0.772164,9,2,15,10,4.713949,4.749604,0.035655
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.307692,1,51,88,8,13,0.361758,37,5,22,18,6.374203,6.76871,0.394507
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.363636,1,73,59,14,10,0.355191,14,4,26,23,5.213913,5.89941,0.685497
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,0,50,65,11,9,0.0,15,2,36,28,4.545041,7.22142,2.676378
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0,1,76,39,13,7,0.0,37,6,34,21,5.845704,5.570633,0.275071


In [16]:
test.head()

Unnamed: 0,test_id,question1,question2,start_with_same_world,word_share,q1_char_num,q2_char_num,q1_word_num,q2_word_num,rfidf_share,char_difference,word_difference,seq_simhash_distance,shingle_simhash_distance,avg_word_len_q1,avg_word_len_q2,avg_word_difference
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0,0.266667,57,68,11,14,0.274019,11,3,26,23,5.181347,4.856796,0.324551
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0,0.5,66,43,14,7,0.480962,23,7,27,22,4.713949,6.14198,1.428031
2,2,What but is the best way to send money from Ch...,What you send money to China?,1,0.444444,60,29,14,6,0.468893,31,8,22,27,4.285408,4.832528,0.54712
3,3,Which food not emulsifiers?,What foods fibre?,0,0.0,27,17,4,3,0.0,10,1,32,33,6.748313,5.664778,1.083535
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,1,0.8,32,30,4,6,1.0,2,2,21,23,7.998,4.999167,2.998834


## Word2Vec

In [13]:
from gensim.models import word2vec

from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import ADJ, ADJ_SAT, ADV, NOUN, VERB
from nltk.stem import WordNetLemmatizer

In [14]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for col in ['question1', 'question2']:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)
            
    return corpus

corpus = build_corpus(train_clean)  

In [15]:
word2vec_model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)
word2vec_model.wv['trump']

array([ 1.5423857 , -0.18877243, -0.69895309, -1.62728703, -0.03071801,
        0.92019987,  2.66838503,  0.26456219,  1.01132786,  0.65255654,
        1.51943994,  0.34171048, -0.60262084, -3.05914187, -2.49309993,
        2.32025671,  0.55832887, -2.12868094, -1.48019063,  0.26215437,
        0.87051916,  0.53066379, -1.2863183 , -0.62387747, -2.58575153,
        0.26904136,  2.88699746, -3.03885865,  0.3766177 , -2.54019713,
       -0.25186637,  1.73393941,  2.31052351,  1.37149477,  0.81612736,
        1.0461694 ,  0.00594117, -1.76909316,  0.27269265,  0.53426611,
        0.26753989, -1.60441387, -0.30379802,  2.01264501, -2.93228245,
       -0.12985253,  3.03247499,  2.15053654, -0.55543244,  3.65650511,
       -0.55203521,  4.95813894,  0.16183898, -1.47227442, -1.03181267,
       -2.21922421,  0.92020571,  3.44367146, -3.06644678, -2.15232968,
        1.71454263,  2.65542889,  0.73848563,  1.55730188,  2.12465954,
        2.60739946, -2.69158435, -2.13815713,  1.95312846,  0.75

In [16]:
word2vec_model.wv['question']

'question' in word2vec_model.wv

True

In [17]:
def question_to_word2vec(question_string, word2vec_model):
    """
    Given question string, returns word2vec vector of the questions tring
    :param question_string : The given question as a string.
    """
    stops_words = set(stopwords.words("english"))
    
    if not isinstance(question_string, str):
        return 0
    
    words = word_tokenize(question_string)[:-1]
    non_stop_words = []
    for word in words:
        if word.lower().strip('-') not in stops_words:
            word = WordNetLemmatizer().lemmatize(word, NOUN)
            
            if word.lower() in word2vec_model.wv:
                non_stop_words.append(word.lower().strip('-'))
            
    if len(non_stop_words) == 0:
        return 0
    
    vectors = [word2vec_model.wv[word] for word in non_stop_words]
    vector = sum(vectors)/float(len(non_stop_words))
    
    return vector

def numpy_cosine(row):
    """
    Cosine similarity between q1 and q2 question instances using their vectors
    :return: similarity between q1 and q2
    """
    q1, q2 = row['question1'], row['question2']
    q1_vec, q2_vec = question_to_word2vec(q1, word2vec_model), question_to_word2vec(q2, word2vec_model)
    
    with np.errstate(invalid='ignore'):
        cosine_similarity = np.dot(q1_vec, q2_vec) / (np.linalg.norm(q1_vec) * np.linalg.norm(q2_vec))
    
    return cosine_similarity if isinstance(cosine_similarity, np.float32) else 0.0

In [18]:
apply_func = numpy_cosine
train['cosin_sim'] = pool_apply(train, proc_num=4)

In [23]:
apply_func = numpy_cosine
test['cosin_sim'] = pool_apply(test, proc_num=4)

In [19]:
train[38:41]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,start_with_same_world,q1_char_num,q2_char_num,...,q2_word_num,rfidf_share,char_difference,word_difference,seq_simhash_distance,shingle_simhash_distance,avg_word_len_q1,avg_word_len_q2,avg_word_difference,cosin_sim
38,38,77,78,How do we prepare for UPSC?,How do I prepare for civil service?,1,0.4,1,27,35,...,7,0.328275,8,1,20,21,4.49925,4.999286,0.500036,0.780258
39,39,79,80,What is the stall speed and AOA of an f-14 wit...,Why did aircraft stop using variable-sweep win...,0,0.0,0,71,72,...,12,0.0,1,3,42,35,4.733018,5.9995,1.266482,0.240071
40,40,81,82,Why do Slavs squat?,Will squats make my legs thicker?,0,0.0,0,19,33,...,6,0.0,14,2,21,33,4.748813,5.499083,0.750271,0.0


## Feature picking

In [61]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

#features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
#            'start_with_same_world', 'rfidf_share']

features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference',
           'seq_simhash_distance', 'shingle_simhash_distance', 'avg_word_len_q1', 'avg_word_len_q2',
           'avg_word_difference', 'unigrams_common_count', 'bigrams_common_count', 'unigrams_common_ratio',
           'bigrams_common_ratio']

#features = ['cosin_sim', 'start_with_same_world', 'rfidf_share']

target = 'is_duplicate'

X = train[features]
y = train[target]

## Oversampling

In [62]:
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

del pos_train, neg_train

0.19124366100096607


## Cross validation

In [63]:
X_train, X_vald, y_train, y_vald = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
X_train.head()

Unnamed: 0,cosin_sim,word_share,q1_char_num,q1_word_num,q2_char_num,q2_word_num,start_with_same_world,rfidf_share,char_difference,word_difference,seq_simhash_distance,shingle_simhash_distance,avg_word_len_q1,avg_word_len_q2,avg_word_difference,unigrams_common_count,bigrams_common_count,unigrams_common_ratio,bigrams_common_ratio
109330,0.622394,0.222222,41,6,44,11,0,0.091403,3,5,30,31,6.832195,3.999636,2.832558,11,13,0.52381,0.22807
149537,0.056752,0.0,121,19,67,12,0,0.0,54,7,35,35,6.368086,5.582868,0.785218,17,25,0.586207,0.210084
235896,0.593747,0.162162,181,35,175,33,1,0.190338,6,2,34,25,5.171281,5.30287,0.131589,27,57,0.75,0.360759
346102,0.702948,0.5,33,7,42,8,0,0.526688,9,1,27,26,4.713612,5.249344,0.535731,10,14,0.454545,0.291667
42793,-0.221622,0.0,82,12,48,8,0,0.0,34,4,34,32,6.832764,5.99925,0.833514,17,11,0.62963,0.11828


In [32]:
X_test = test[features]
X_test.head()

Unnamed: 0,cosin_sim,word_share,q1_char_num,q1_word_num,q2_char_num,q2_word_num,start_with_same_world,rfidf_share,char_difference,word_difference,seq_simhash_distance,shingle_simhash_distance,avg_word_len_q1,avg_word_len_q2,avg_word_difference
0,0.686881,0.266667,57,11,68,14,0,0.274019,11,3,26,23,5.181347,4.856796,0.324551
1,0.741746,0.5,66,14,43,7,0,0.480962,23,7,27,22,4.713949,6.14198,1.428031
2,0.837249,0.444444,60,14,29,6,1,0.468893,31,8,22,27,4.285408,4.832528,0.54712
3,1.0,0.0,27,4,17,3,0,0.0,10,1,32,33,6.748313,5.664778,1.083535
4,1.0,0.8,32,4,30,6,1,1.0,2,2,21,23,7.998,4.999167,2.998834


In [None]:
X.head()

## Transofmrations

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Model works fine without scaling

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_vald_scaled = scaler.transform(X_vald)

## Model

In [52]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from xgboost import XGBClassifier
#import lightgbm

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [67]:
#model = RandomForestClassifier(n_estimators=150, n_jobs=8)   # 0.39680 (on public)
#model = ExtraTreesClassifier(n_estimators=62, n_jobs=8) # 0.48183 (on public)
#model = AdaBoostClassifier()
#model = GradientBoostingClassifier(n_estimators=500, max_depth=4, learning_rate=0.2, subsample=0.7) # 0.34721 (on public)
#model = KNeighborsClassifier(n_neighbors=25)
#model = MultinomialNB() # 0.57
#model = SVC()

model = XGBClassifier(n_estimators=500, learning_rate=0.02, max_depth=4, subsample=0.7, gamma=0.5, seed=42,
            colsample_bytree=0.7) # 0.34785 (on public)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_vald, y_vald)],
          early_stopping_rounds=50, verbose=True, eval_metric='logloss')


#model = VotingClassifier(estimators=[('xgb', xgb), ('knn', knn), ('rf', rf)],
#                        voting='soft', weights=[4.5, 1.1, 1.2])

#model.fit(X_train, y_train)

Will train until validation_1 error hasn't decreased in 50 rounds.
[0]	validation_0-logloss:0.684018	validation_1-logloss:0.683936
[1]	validation_0-logloss:0.674575	validation_1-logloss:0.674483
[2]	validation_0-logloss:0.665092	validation_1-logloss:0.665085
[3]	validation_0-logloss:0.656351	validation_1-logloss:0.656311
[4]	validation_0-logloss:0.647597	validation_1-logloss:0.647604
[5]	validation_0-logloss:0.639303	validation_1-logloss:0.639205
[6]	validation_0-logloss:0.631464	validation_1-logloss:0.631371
[7]	validation_0-logloss:0.623993	validation_1-logloss:0.623850
[8]	validation_0-logloss:0.616406	validation_1-logloss:0.616323
[9]	validation_0-logloss:0.609289	validation_1-logloss:0.609031
[10]	validation_0-logloss:0.602391	validation_1-logloss:0.602252
[11]	validation_0-logloss:0.595698	validation_1-logloss:0.595445
[12]	validation_0-logloss:0.589066	validation_1-logloss:0.588886
[13]	validation_0-logloss:0.582652	validation_1-logloss:0.582478
[14]	validation_0-logloss:0.57709

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0.5, learning_rate=0.02, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7)

In [68]:
val_predictions = model.predict(X_vald)
val_prob_predictions = model.predict_proba(X_vald)

for metric_name, metric_func in zip(
    ['F1-score', 'Acc', 'Precision', 'Recall', 'LogLoss'],
    [f1_score, accuracy_score, precision_score, recall_score, log_loss]
):
    
    val_predictions = val_predictions if metric_name not in ['LogLoss'] else val_prob_predictions
    metric_score = metric_func(y_vald, val_predictions)
    print('{0}: {1}'.format(metric_name, metric_score))

F1-score: 0.19921649959672774
Acc: 0.8219067508872632
Precision: 0.6928471248246845
Recall: 0.11633305298570228
LogLoss: 0.3398170983777008


In [45]:
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
#del X, y, X_train, y_train, X_vald, y_vald
gc.collect()

71

In [None]:
X_test.head()

In [None]:
predictions = model.predict_proba(X_test)

In [None]:
predictions[:,1]

## Generate submission

In [None]:
np.savetxt(
    'submission.csv', np.c_[range(len(predictions)), predictions[:,1]],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)

In [None]:
with open('submission.csv', 'w', buffering=1) as submission_file:
    submission_file.write('test_id,is_duplicate')
    
    for test_id, test_row in enumerate(X_test.iterrows()):
        row_prediction = model.predict_proba(X_test[test_id:])
        submission_file.write('%d,%f' % test_id, row_prediction[:,1])