## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Data Processing

In [4]:
from nltk.corpus import stopwords

In [5]:
def word_match_share(row):
    stops_words = set(stopwords.words("english"))
    
    q1words = {}
    q2words = {}
    
    for word in str(row['question1']).lower().split():
        if word not in stops_words:
            q1words[word] = 1
            
    for word in str(row['question2']).lower().split():
        if word not in stops_words:
            q2words[word] = 1
            
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

In [6]:
from multiprocessing import Pool

def transform_data(data):
    return data.apply(word_match_share, axis=1, raw=True)

# This takes about 2-4 minutes on i7 proc (desktop 2.7 GHz)
# TODO chunk more and increase process number, make a function

total = [train, test[:500000], test[500000:1000000], test[1000000:1500000], test[1500000:2000000], test[2000000:]]

pool = Pool(processes=8)
train['word_share'], test_c1, test_c2, test_c3, test_c4, test_c5 = pool.map(transform_data, total)  
test['word_share'] = np.hstack((test_c1, test_c2, test_c3, test_c4, test_c5))

# Old version without pool
#train['word_share'] = train.apply(word_match_share, axis=1, raw=True)
#test['word_share'] = test.apply(word_match_share, axis=1, raw=True)

In [7]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.727273
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.307692
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.363636
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0


In [8]:
train[train['is_duplicate']==1][:20][['question1', 'question2']]

Unnamed: 0,question1,question2
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan..."
7,How can I be a good geologist?,What should I do to be a great geologist?
11,How do I read and find my YouTube comments?,How can I see all my Youtube comments?
12,What can make Physics easy to learn?,How can you make physics easy to learn?
13,What was your first sexual experience like?,What was your first sexual experience?
15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...
16,What does manipulation mean?,What does manipulation means?
18,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...
20,Why do rockets look white?,Why are rockets and boosters painted white?
29,How should I prepare for CA final law?,How one should know that he/she completely pre...


In [9]:
test.head()

Unnamed: 0,test_id,question1,question2,word_share
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0.266667
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0.5
2,2,What but is the best way to send money from Ch...,What you send money to China?,0.444444
3,3,Which food not emulsifiers?,What foods fibre?,0.0
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0.8


In [10]:
def start_with_same_first_word(row):
    if not isinstance(row['question1'], str) or not isinstance(row['question2'], str):
        return 0
    
    first_word_q1 = row['question1'].split()[0].lower()
    first_word_q2 = row['question2'].split()[0].lower()
    
    return 1 if first_word_q1 == first_word_q2 else 0

train['start_with_same_world'] = train.apply(start_with_same_first_word, axis=1, raw=True)
test['start_with_same_world'] = test.apply(start_with_same_first_word, axis=1, raw=True)

In [11]:
# This takes about 3-4 minute on i5 proc (desktop 3.5 GHz)

train['q1_char_num'] = train.apply(lambda x: len(x['question1']), axis=1, raw=True)
train['q1_word_num'] = train.apply(lambda x: len(x['question1'].split()), axis=1, raw=True)

train['q2_char_num'] = train.apply(
    lambda x: len(x['question2']) if isinstance(x['question2'], str) else 0,axis=1, raw=True
)
train['q2_word_num'] = train.apply(
    lambda x: len(x['question2'].split()) if isinstance(x['question2'], str) else 0, axis=1, raw=True
)

test['q1_char_num'] = test.apply(
    lambda x: len(x['question1']) if isinstance(x['question1'], str) else 0,axis=1, raw=True
)
test['q1_word_num'] = test.apply(
    lambda x: len(x['question1'].split()) if isinstance(x['question1'], str) else 0, axis=1, raw=True
)

test['q2_char_num'] = test.apply(
    lambda x: len(x['question2']) if isinstance(x['question2'], str) else 0,axis=1, raw=True
)
test['q2_word_num'] = test.apply(
    lambda x: len(x['question2'].split()) if isinstance(x['question2'], str) else 0, axis=1, raw=True
)

In [12]:
from collections import Counter

train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [13]:
import gc

del train_qs
gc.collect()

329

In [14]:
import math

def tfidf_word_match_share(row):
    stops_words = set(stopwords.words("english"))
    
    q1words = {}
    q2words = {}
    
    for word in str(row['question1']).lower().split():
        if word not in stops_words:
            q1words[word] = 1
            
    for word in str(row['question2']).lower().split():
        if word not in stops_words:
            q2words[word] = 1
            
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    
    return R if not math.isnan(R) else 0

train['rfidf_share'] = train.apply(tfidf_word_match_share, axis=1, raw=True)
test['rfidf_share'] = test.apply(tfidf_word_match_share, axis=1, raw=True)

In [15]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,start_with_same_world,q1_char_num,q1_word_num,q2_char_num,q2_word_num,rfidf_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.727273,1,66,14,57,12,0.772164
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.307692,1,51,8,88,13,0.361758
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.363636,1,73,14,59,10,0.355191
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,0,50,11,65,9,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0,1,76,13,39,7,0.0


## Feature picking

In [16]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

features = ['word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share']
target = 'is_duplicate'

X = train[features]
y = train[target]

## Oversampling

In [17]:
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

0.19124366100096607


## Cross validation

In [20]:
X_train, X_vald, y_train, y_vald = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = test[features]

In [21]:
X_test.head()

Unnamed: 0,word_share,q1_char_num,q1_word_num,q2_char_num,q2_word_num,start_with_same_world,rfidf_share
0,0.266667,57,11,68,14,0,0.274019
1,0.5,66,14,43,7,0,0.480962
2,0.444444,60,14,29,6,1,0.468893
3,0.0,27,4,17,3,0,0.0
4,0.8,32,4,30,6,1,1.0


In [22]:
X.head()

Unnamed: 0,word_share,q1_char_num,q1_word_num,q2_char_num,q2_word_num,start_with_same_world,rfidf_share
5,0.470588,86,16,90,16,0,0.510771
7,0.5,30,7,41,9,0,0.645836
11,0.571429,43,9,38,8,1,0.654071
12,1.0,36,7,39,8,0,1.0
13,0.571429,43,7,38,6,1,0.536457


## Transofmrations

In [23]:
from sklearn.preprocessing import StandardScaler

In [74]:
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_vald_scaled = scaler.transform(X_vald)

## Model

In [24]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [25]:
model = RandomForestClassifier(n_estimators=58, n_jobs=8)

#model = XGBClassifier(n_estimators=500, learning_rate=0.48, max_depth=4)
#model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_vald, y_vald)],
#          early_stopping_rounds=50, verbose=True, eval_metric='logloss')

#knn = KNeighborsClassifier(n_neighbors=25)
model.fit(X_train, y_train)

#model = VotingClassifier(estimators=[('xgb', xgb), ('knn', knn), ('rf', rf)],
#                         voting='soft', weights=[4.5, 1.1, 1.2])
#model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=58, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
val_predictions = model.predict(X_vald)
val_prob_predictions = model.predict_proba(X_vald)

for metric_name, metric_func in zip(
    ['F1-score', 'Acc', 'Precision', 'Recall', 'LogLoss'],
    [f1_score, accuracy_score, precision_score, recall_score, log_loss]
):
    
    val_predictions = val_predictions if metric_name not in ['LogLoss'] else val_prob_predictions
    metric_score = metric_func(y_vald, val_predictions)
    print('{0}: {1}'.format(metric_name, metric_score))

F1-score: 0.6449485062558514
Acc: 0.893105613140463
Precision: 0.8774824850906143
Recall: 0.5098402018502943
LogLoss: 0.24534618634940053


In [27]:
del train, pool

gc.collect()

14

In [None]:
model.fit(X, y)

In [83]:
del X, y, X_train, y_train, X_vald, y_vald

gc.collect()

In [None]:
predictions = model.predict_proba(X_test)

In [84]:
predictions[:,1]

array([ 0.05369232,  0.64482141,  0.52798671, ...,  0.00065555,
        0.10855845,  0.46532747], dtype=float32)

## Generate submission

In [85]:
np.savetxt(
    'submission.csv', np.c_[range(len(predictions)), predictions[:,1]],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)