## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
# Read the train and test CSV files from the sibling `input` directory
# (both paths are relative to the notebook's working directory).
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Data Processing

In [4]:
from nltk.corpus import stopwords

In [5]:
# Cache for the NLTK English stopword set; filled on first use so the corpus
# is not re-read for every row.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it only on the first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def _content_words(text, stops):
    """Return the distinct lower-cased, whitespace-separated tokens of `text` not in `stops`.

    A missing value (None or float NaN) yields an empty set instead of being
    stringified into the literal token "nan".
    """
    if text is None or (isinstance(text, float) and text != text):
        return set()
    return {word for word in str(text).lower().split() if word not in stops}


def word_match_share(row, stops=None):
    """Compute the share of non-stopword tokens that two questions have in common.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the labels 'question1' and 'question2'
        (a DataFrame row Series, a dict, ...).
    stops : collection of str, optional
        Tokens to ignore. Defaults to the NLTK English stopword list, which is
        loaded once and reused across calls.

    Returns
    -------
    float or int
        2 * |shared| / (|q1 tokens| + |q2 tokens|), computed over distinct
        tokens. Returns 0 when either question has no tokens left after
        stopword removal, or is missing.
    """
    if stops is None:
        stops = _english_stopwords()

    q1words = _content_words(row['question1'], stops)
    q2words = _content_words(row['question2'], stops)

    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    # Each shared token appears in both questions, hence the factor of two.
    shared_words = q1words & q2words
    return 2 * len(shared_words) / (len(q1words) + len(q2words))

In [6]:
from multiprocessing import Pool

def transform_data(data):
    """Apply `word_match_share` to every row of `data` and return the per-row results.

    Rows are passed as Series (no `raw=True`): `word_match_share` looks the
    questions up by column label, which a raw ndarray row does not support.
    """
    return data.apply(word_match_share, axis=1)

# This takes about 7-8 minutes on an i5 desktop CPU (3.5 GHz). TODO: split the data into more chunks and increase the number of processes.

total = [train, test]

# One worker per frame. The context manager shuts the worker processes down
# as soon as both results are back, instead of leaving them alive in the kernel.
with Pool(processes=2) as pool:
    train['R'], test['R'] = pool.map(transform_data, total)

# Old version without pool
#train['R'] = train.apply(word_match_share, axis=1, raw=True)
#test['R'] = test.apply(word_match_share, axis=1, raw=True)

In [7]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,R
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.727273
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.307692
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.363636
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0


In [8]:
# Character and word counts for both questions, in both frames.
# Each count is mapped over a single column rather than applied row by row,
# and every column gets the same missing-value guard (non-string -> 0).

def _char_count(text):
    """Return the number of characters in `text`, or 0 if it is not a string."""
    return len(text) if isinstance(text, str) else 0


def _word_count(text):
    """Return the number of whitespace-separated words in `text`, or 0 if it is not a string."""
    return len(text.split()) if isinstance(text, str) else 0


for frame in (train, test):
    for prefix, column in (('q1', 'question1'), ('q2', 'question2')):
        frame[prefix + '_char_num'] = frame[column].map(_char_count)
        frame[prefix + '_word_num'] = frame[column].map(_word_count)

In [9]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,R,q1_char_num,q1_word_num,q2_char_num,q2_word_num
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.727273,66,14,57,12
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.307692,51,8,88,13
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.363636,73,14,59,10
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,50,11,65,9
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0,76,13,39,7


In [10]:
from sklearn.model_selection import train_test_split

# Label column to predict and the engineered columns used as model inputs.
target = 'is_duplicate'
features = ['R', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num']

X, y = train[features], train[target]
X_test = test[features]

# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_vald, y_train, y_vald = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
X_test.head()

Unnamed: 0,R,q1_char_num,q1_word_num,q2_char_num,q2_word_num
0,0.266667,57,11,68,14
1,0.5,66,14,43,7
2,0.444444,60,14,29,6
3,0.0,27,4,17,3
4,0.8,32,4,30,6


## Model

In [27]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [83]:
# Alternative kept for reference:
#model = RandomForestClassifier(n_estimators=12, n_jobs=8)

# Hyperparameters of the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'n_estimators': 120,
    'learning_rate': 0.5,
    'max_depth': 5,
}
model = XGBClassifier(**xgb_params)

# Fit on the training part of the split only; the validation part is scored separately.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.5, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=120, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [84]:
# Predicted labels feed the classification metrics; the probability matrix feeds log loss.
val_predictions = model.predict(X_vald)
val_prob_predictions = model.predict_proba(X_vald)

# Each metric is bound to the input it needs, so the report does not depend on
# the order of the entries and `val_predictions` is never overwritten.
validation_metrics = [
    ('F1-score', f1_score, val_predictions),
    ('Acc', accuracy_score, val_predictions),
    ('Precision', precision_score, val_predictions),
    ('Recall', recall_score, val_predictions),
    ('LogLoss', log_loss, val_prob_predictions),
]

for metric_name, metric_func, metric_input in validation_metrics:
    metric_score = metric_func(y_vald, metric_input)
    print('{0}: {1}'.format(metric_name, metric_score))

F1-score: 0.7105889055045218
Acc: 0.7486828761532563
Precision: 0.6211902390438248
Recall: 0.8300449176509732
LogLoss: 0.44446307194446655


In [85]:
# Refit the same model on all of the training data (X, y), not just the
# training part of the split, then compute class probabilities for the test features.
model.fit(X, y)
predictions = model.predict_proba(X_test)

In [96]:
# Second column of the probability matrix -- presumably P(is_duplicate == 1);
# NOTE(review): confirm the column order against model.classes_.
predictions[:,1]

array([  5.38870804e-02,   6.51866317e-01,   5.35193980e-01, ...,
         6.30405964e-04,   1.54330239e-01,   4.57387239e-01], dtype=float32)

## Generate submission

In [95]:
# Pair a running 0-based row number with the second probability column and
# write the two-column table to disk with a plain (uncommented) header line.
duplicate_probability = predictions[:,1]
row_numbers = range(len(predictions))
submission_rows = np.c_[row_numbers, duplicate_probability]

np.savetxt(
    'submission.csv',
    submission_rows,
    fmt='%d,%f',
    delimiter=',',
    header='test_id,is_duplicate',
    comments='',
)