## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Clean Data

In [4]:
import re
from nltk.corpus import stopwords

In [5]:
def clean_sentence(val):
    """Normalise one question string before it is used to train Word2Vec.

    Every character that is not a word character or whitespace is removed
    (underscores are removed as well), the text is lower-cased, and English
    stop words are dropped.

    :param val: raw question text (``str``).
    :return: the remaining words joined by single spaces.
    """
    stops_words = set(stopwords.words("english"))
    # Raw string: '\s' and '\w' are regex classes, not Python string escapes.
    regex = re.compile(r'([^\s\w]|_)+')
    # Split on a literal space (not arbitrary whitespace) so consecutive
    # spaces keep producing empty tokens exactly as before.
    tokens = regex.sub('', val).lower().split(" ")

    # Single-pass filter instead of repeated list.remove() calls.
    return " ".join(word for word in tokens if word not in stops_words)

def clean_dataframe(data):
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Rows containing any NaN are dropped, then ``clean_sentence`` is applied to
    the ``question1`` and ``question2`` columns.

    :param data: DataFrame with ``question1`` and ``question2`` columns.
    :return: a new DataFrame with the cleaned question columns.
    """
    # Explicit copy: assigning columns on the object returned by dropna()
    # is what triggered pandas' SettingWithCopyWarning.
    cleaned = data.dropna(how="any").copy()

    for col in ['question1', 'question2']:
        cleaned[col] = cleaned[col].apply(clean_sentence)

    return cleaned

train_clean = clean_dataframe(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Data Processing

In [6]:
import math
import gc

from collections import Counter
from multiprocessing import Pool

In [7]:
def transform_data(data):
    """Apply the module-level ``apply_func`` to every row of ``data``.

    ``apply_func`` is read as a global at call time, so it must be assigned
    before this function runs.

    :param data: DataFrame (or a slice of one) whose rows are passed to
        ``apply_func`` one at a time.
    :return: the result of ``DataFrame.apply`` along ``axis=1``.
    """
    # No raw=True: the row functions index by column label (row['question1']),
    # which needs a Series; a raw ndarray row cannot be indexed that way.
    return data.apply(apply_func, axis=1)

def chunk(data, num):
    """Split ``data`` into at most ``num`` consecutive slices of equal size.

    The slice size is ``ceil(len(data) / num)``; the last slice may be shorter.
    No empty slice is produced for non-empty input, so fewer than ``num``
    slices are returned when the data does not fill them all.

    :param data: any sliceable, sized object (list, array, DataFrame, ...).
    :param num: maximum number of slices.
    :return: list of slices that concatenate back to ``data``; a single empty
        slice when ``data`` is empty.
    """
    chunk_size = math.ceil(len(data) / num)
    if chunk_size <= 0:
        # Nothing to distribute: hand back one empty slice of the same type.
        return [data[:0]]
    # Stepping by chunk_size stops at the end of the data instead of emitting
    # empty trailing slices for the unused worker slots.
    return [data[start:start + chunk_size] for start in range(0, len(data), chunk_size)]

def pool_apply(data, proc_num=8):
    """Run ``transform_data`` over ``data`` in parallel worker processes.

    ``data`` is cut into ``proc_num`` pieces with ``chunk``, each piece is
    mapped through ``transform_data`` in a ``Pool`` of ``proc_num`` processes,
    and the per-piece results are stacked back together in order.

    :param data: the object to split and process.
    :param proc_num: number of worker processes and of pieces.
    :return: ``np.hstack`` of the per-piece results.
    """
    pieces = chunk(data, proc_num)

    with Pool(processes=proc_num) as workers:
        results = workers.map(transform_data, pieces)

    return np.hstack(tuple(results))

In [8]:
def word_match_share(row):
    """Share of distinct non-stop words that the two questions have in common.

    Each question is stringified, lower-cased and whitespace-split; English
    stop words are ignored. The score is the number of shared words counted
    once per question, divided by the total number of distinct words in both.

    :param row: mapping with ``question1`` and ``question2`` entries.
    :return: the ratio, or ``0`` when either question has no non-stop words.
    """
    stop_set = set(stopwords.words("english"))

    def distinct_words(text):
        return {token for token in str(text).lower().split() if token not in stop_set}

    first = distinct_words(row['question1'])
    second = distinct_words(row['question2'])

    if not first or not second:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    # The overlap is counted once from each side, hence it appears twice.
    overlap = len(first & second)
    return (overlap + overlap) / (len(first) + len(second))

def tfidf_word_match_share(row):
    """Weighted share of distinct non-stop words common to both questions.

    Works like ``word_match_share`` but each word contributes its entry from
    the module-level ``weights`` mapping (``0`` for unknown words) rather
    than ``1``.

    :param row: mapping with ``question1`` and ``question2`` entries.
    :return: summed weight of shared words over summed weight of all words;
        ``0`` when either question has no non-stop words or the ratio is NaN.
    """
    stop_set = set(stopwords.words("english"))

    # Dicts (not sets) keep first-occurrence order, so the weights are summed
    # in the same order every time.
    q1_vocab = {tok: 1 for tok in str(row['question1']).lower().split() if tok not in stop_set}
    q2_vocab = {tok: 1 for tok in str(row['question2']).lower().split() if tok not in stop_set}

    if not q1_vocab or not q2_vocab:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    def weight_of(word):
        return weights.get(word, 0)

    shared_terms = [weight_of(w) for w in q1_vocab if w in q2_vocab]
    shared_terms += [weight_of(w) for w in q2_vocab if w in q1_vocab]

    all_terms = [weight_of(w) for w in q1_vocab]
    all_terms += [weight_of(w) for w in q2_vocab]

    # 0 / 0 (every weight is zero) yields NaN; silence the warning and map it to 0 below.
    with np.errstate(invalid='ignore'):
        ratio = np.sum(shared_terms) / np.sum(all_terms)

    if math.isnan(ratio):
        return 0
    return ratio


def start_with_same_first_word(row):
    """Flag whether both questions begin with the same word (case-insensitive).

    :param row: mapping with ``question1`` and ``question2`` entries.
    :return: ``1`` when the first whitespace-delimited words match, else ``0``.
        Also ``0`` when either value is not a string or contains no words.
    """
    question1, question2 = row['question1'], row['question2']
    if not isinstance(question1, str) or not isinstance(question2, str):
        return 0

    q1_tokens = question1.split()
    q2_tokens = question2.split()
    # Empty or whitespace-only questions have no first word to compare.
    if not q1_tokens or not q2_tokens:
        return 0

    return 1 if q1_tokens[0].lower() == q2_tokens[0].lower() else 0

def question_length(row):
    """Character count of the question stored under the module-level ``feature`` key.

    :param row: mapping indexed by the column name held in ``feature``.
    :return: ``len`` of the value when it is a string, otherwise ``0``.
    """
    question = row[feature]
    if isinstance(question, str):
        return len(question)
    return 0

def word_count(row):
    """Whitespace-delimited word count of the question under the module-level ``feature`` key.

    :param row: mapping indexed by the column name held in ``feature``.
    :return: number of tokens from ``str.split()`` when the value is a string,
        otherwise ``0``.
    """
    question = row[feature]
    if not isinstance(question, str):
        return 0
    return len(question.split())


# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    """Inverse-frequency weight for a word seen ``count`` times.

    :param count: number of occurrences of the word.
    :param eps: smoothing constant added to ``count`` in the denominator.
    :param min_count: words seen fewer times than this get weight ``0``.
    :return: ``0`` for rare words, otherwise ``1 / (count + eps)``.
    """
    too_rare = count < min_count
    return 0 if too_rare else 1 / (count + eps)

In [9]:
# Build the word -> weight table used by tfidf_word_match_share from the raw
# training questions.
# NOTE(review): `eps` is assigned here but never passed to get_weight(), so the
# weights below are computed with get_weight's default eps=10000, not 5000.
# Confirm which smoothing value is intended before wiring it through, since
# doing so changes every weight.
eps = 5000 
# All question1 values followed by all question2 values, cast to str.
train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
# Lower-cased, whitespace-split tokens of every question joined together.
words = (" ".join(train_qs)).lower().split()
# Occurrence count per token.
counts = Counter(words)
# token -> weight, using get_weight's default eps and min_count.
weights = {word: get_weight(count) for word, count in counts.items()}

In [10]:
train[train['is_duplicate']==1][:20][['question1', 'question2']]

Unnamed: 0,question1,question2
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan..."
7,How can I be a good geologist?,What should I do to be a great geologist?
11,How do I read and find my YouTube comments?,How can I see all my Youtube comments?
12,What can make Physics easy to learn?,How can you make physics easy to learn?
13,What was your first sexual experience like?,What was your first sexual experience?
15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...
16,What does manipulation mean?,What does manipulation means?
18,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...
20,Why do rockets look white?,Why are rockets and boosters painted white?
29,How should I prepare for CA final law?,How one should know that he/she completely pre...


In [66]:
# apply_func must be assigned before pool_apply is called: transform_data reads it as a module-level global.
apply_func = word_match_share
train['word_share'] = pool_apply(train)
test['word_share'] = pool_apply(test)

In [67]:
apply_func = start_with_same_first_word
train['start_with_same_world'] = pool_apply(train)
test['start_with_same_world'] = pool_apply(test)

In [68]:
feature = 'question1'
apply_func = question_length
train['q1_char_num'] = pool_apply(train)
test['q1_char_num'] = pool_apply(test)

feature = 'question2'
apply_func = question_length
train['q2_char_num'] = pool_apply(train)
test['q2_char_num'] = pool_apply(test)

feature = 'question1'
apply_func = word_count
train['q1_word_num'] = pool_apply(train)
test['q1_word_num'] = pool_apply(test)

feature = 'question2'
apply_func = word_count
train['q2_word_num'] = pool_apply(train)
test['q2_word_num'] = pool_apply(test)

In [69]:
apply_func = tfidf_word_match_share
train['rfidf_share'] = pool_apply(train)
test['rfidf_share'] = pool_apply(test)

In [70]:
train['char_difference'] = abs(train['q1_char_num'] - train['q2_char_num'])
train['word_difference'] = abs(train['q1_word_num'] - train['q2_word_num'])
test['char_difference'] = abs(test['q1_char_num'] - test['q2_char_num'])
test['word_difference'] = abs(test['q1_word_num'] - test['q2_word_num'])

In [16]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,start_with_same_world,q1_char_num,q2_char_num,q1_word_num,q2_word_num,rfidf_share,char_difference,word_difference
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.727273,1,66,57,14,12,0.772164,9,2
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.307692,1,51,88,8,13,0.361758,37,5
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.363636,1,73,59,14,10,0.355191,14,4
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,0,50,65,11,9,0.0,15,2
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0,1,76,39,13,7,0.0,37,6


In [17]:
test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


## Word2Vec

In [18]:
from gensim.models import word2vec

from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import ADJ, ADJ_SAT, ADV, NOUN, VERB
from nltk.stem import WordNetLemmatizer

In [19]:
def build_corpus(data):
    """Create a list of word lists, one per question.

    All ``question1`` sentences come first, followed by all ``question2``
    sentences. Each sentence is split on single spaces.

    :param data: DataFrame with string ``question1`` and ``question2`` columns.
    :return: list of lists of words.
    """
    corpus = []
    for col in ['question1', 'question2']:
        # Iterate the values directly; Series.iteritems() no longer exists in
        # pandas 2.x and the index label was never used.
        for sentence in data[col]:
            corpus.append(sentence.split(" "))

    return corpus

corpus = build_corpus(train_clean)  

In [20]:
word2vec_model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)
word2vec_model.wv['trump']

array([ 1.66531444, -2.6076982 , -1.55004895, -0.56047791,  0.02588514,
        0.56109214,  0.24540929, -1.03416407,  0.2651619 ,  0.71056539,
        0.53173745,  2.43410349, -1.82260382, -0.60769838, -4.00416565,
       -1.25950003,  1.968418  , -3.43402267, -2.57230663,  0.44623995,
        0.14124124,  1.73834646, -1.02509892, -0.5528025 , -1.5985986 ,
        0.38852143,  3.15907383,  0.71225041, -2.22475004, -1.36451101,
        2.33302689,  2.45906925, -0.48566511,  1.68039298,  1.26377034,
       -1.72000921,  1.69556677, -3.02244735, -0.72536319, -2.71408653,
       -1.97752225,  0.2182361 ,  1.34421635, -2.55360794,  2.69507337,
        2.64636898,  0.65941912,  2.70007825, -2.19196296, -2.63015628,
       -2.49362659,  1.22135413,  0.3970907 ,  1.09797943, -2.29536223,
       -0.84813648, -1.14950073, -0.79710531,  3.76239395,  0.64569944,
        2.35649991, -1.1662575 , -1.69598496, -0.67901123, -2.23652768,
        1.37600672, -1.18862998,  0.56982374, -0.15774135, -1.52

In [21]:
word2vec_model.wv['question']

'question' in word2vec_model.wv

True

In [24]:
def question_to_word2vec(question_string, word2vec_model):
    """
    Given a question string, return the mean word2vec vector of its words.

    Stop words are skipped. Every other token is lemmatized as a noun,
    lower-cased and stripped of leading/trailing hyphens; tokens missing from
    the model vocabulary are ignored.

    :param question_string: The given question as a string.
    :param word2vec_model: Model exposing a ``wv`` mapping that supports
        ``token in wv`` and ``wv[token]``.
    :return: The average of the matched word vectors, or the integer ``0`` when
        the input is not a string or no token is in the vocabulary.
    """
    if not isinstance(question_string, str):
        return 0

    stops_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()  # one instance per call, not one per word
    vocabulary = word2vec_model.wv

    # NOTE(review): the final token is always discarded, presumably to drop the
    # closing "?"; a question without trailing punctuation loses its last word.
    words = word_tokenize(question_string)[:-1]

    non_stop_words = []
    for word in words:
        if word.lower().strip('-') in stops_words:
            continue
        # Normalise once and use the same form for the vocabulary check and the
        # lookup, so a hyphen-edged token can neither be missed nor raise KeyError.
        token = lemmatizer.lemmatize(word, NOUN).lower().strip('-')
        if token in vocabulary:
            non_stop_words.append(token)

    if len(non_stop_words) == 0:
        return 0

    vectors = [vocabulary[word] for word in non_stop_words]
    return sum(vectors) / float(len(non_stop_words))

def numpy_cosine(row):
    """
    Cosine similarity between the averaged word vectors of question1 and question2.

    :param row: mapping with ``question1`` and ``question2`` entries.
    :return: similarity between q1 and q2 as a float; ``0.0`` when either
        question has no vector (``question_to_word2vec`` returned its scalar
        ``0`` sentinel), when either vector has zero norm, or when the result
        is not finite.
    """
    q1_vec = question_to_word2vec(row['question1'], word2vec_model)
    q2_vec = question_to_word2vec(row['question2'], word2vec_model)

    # question_to_word2vec signals "no usable words" with a scalar 0.
    if np.ndim(q1_vec) == 0 or np.ndim(q2_vec) == 0:
        return 0.0

    denominator = np.linalg.norm(q1_vec) * np.linalg.norm(q2_vec)
    if denominator == 0:
        return 0.0

    # Decide validity from the value itself rather than from its dtype, so the
    # result does not depend on the vectors being float32.
    similarity = float(np.dot(q1_vec, q2_vec) / denominator)
    return similarity if np.isfinite(similarity) else 0.0

In [71]:
apply_func = numpy_cosine
train['cosin_sim'] = pool_apply(train, proc_num=4)
test['cosin_sim'] = pool_apply(test, proc_num=4)

In [72]:
train[38:41]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,word_share,start_with_same_world,q1_char_num,q2_char_num,q1_word_num,q2_word_num,rfidf_share,char_difference,word_difference,cosin_sim
38,38,77,78,How do we prepare for UPSC?,How do I prepare for civil service?,1,0.4,1,27,35,6,7,0.328275,8,1,0.804913
39,39,79,80,What is the stall speed and AOA of an f-14 wit...,Why did aircraft stop using variable-sweep win...,0,0.0,0,71,72,15,12,0.0,1,3,0.281494
40,40,81,82,Why do Slavs squat?,Will squats make my legs thicker?,0,0.0,0,19,33,4,6,0.0,14,2,0.0


## Feature picking

In [125]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

#features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
#            'start_with_same_world', 'rfidf_share']

features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference']

#features = ['cosin_sim', 'start_with_same_world', 'rfidf_share']

target = 'is_duplicate'

X = train[features]
y = train[target]

## Oversampling

In [126]:
# Split the feature matrix by label so the negative rows can be replicated.
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
# p is presumably the share of positives the resampled set should have - TODO confirm.
p = 0.165
# (current positive share / p) - 1
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
# NOTE(review): every pass through this loop doubles neg_train (it is
# concatenated with itself) while scale only drops by 1.
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
# Append the leading `scale` fraction of the (possibly already doubled) negatives.
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
# Positive share after resampling; the recorded run printed ~0.191 rather than p.
print(len(pos_train) / (len(pos_train) + len(neg_train)))

# Rebuild X as positives followed by negatives; y becomes a plain Python list
# of 1.0 / 0.0 labels in the same order.
# NOTE(review): X and y are overwritten here, so re-running this cell without
# re-running the feature-picking cell starts from the resampled data.
X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

del pos_train, neg_train

0.19124366100096607


## Cross validation

In [128]:
X_train, X_vald, y_train, y_vald = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = test[features]

In [129]:
X_test.head()

Unnamed: 0,cosin_sim,word_share,q1_char_num,q1_word_num,q2_char_num,q2_word_num,start_with_same_world,rfidf_share,char_difference,word_difference
0,0.67944,0.266667,57,11,68,14,0,0.274019,11,3
1,0.757762,0.5,66,14,43,7,0,0.480962,23,7
2,0.8474,0.444444,60,14,29,6,1,0.468893,31,8
3,1.0,0.0,27,4,17,3,0,0.0,10,1
4,1.0,0.8,32,4,30,6,1,1.0,2,2


In [130]:
X.head()

Unnamed: 0,cosin_sim,word_share,q1_char_num,q1_word_num,q2_char_num,q2_word_num,start_with_same_world,rfidf_share,char_difference,word_difference
5,0.97941,0.470588,86,16,90,16,0,0.510771,4,0
7,0.536462,0.5,30,7,41,9,0,0.645836,11,2
11,0.808033,0.571429,43,9,38,8,1,0.654071,5,1
12,0.919684,1.0,36,7,39,8,0,1.0,3,1
13,0.938224,0.571429,43,7,38,6,1,0.536457,5,1


## Transformations

In [119]:
from sklearn.preprocessing import StandardScaler

In [120]:
# Model works fine without scaling

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_vald_scaled = scaler.transform(X_vald)

## Model

In [121]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from xgboost import XGBClassifier
#import lightgbm

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [134]:
#model = RandomForestClassifier(n_estimators=87, n_jobs=8)   # 0.39680 (on public)
#model = ExtraTreesClassifier(n_estimators=62, n_jobs=8) # 0.48183 (on public)
#model = AdaBoostClassifier()
#model = GradientBoostingClassifier(n_estimators=500, max_depth=4, learning_rate=0.48, subsample=0.7) # 0.34721 (on public)
#model = KNeighborsClassifier(n_neighbors=25)
#model = MultinomialNB() # 0.57
#model = SVC()

model = XGBClassifier(n_estimators=500, learning_rate=0.48, max_depth=6, subsample=0.65, gamma=0.5) # 0.34785 (on public)
#model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_vald, y_vald)],
#          early_stopping_rounds=50, verbose=True, eval_metric='logloss')


#model = VotingClassifier(estimators=[('xgb', xgb), ('knn', knn), ('rf', rf)],
#                        voting='soft', weights=[4.5, 1.1, 1.2])

# Fit on the training split only, so the validation cell that follows scores a
# fitted model on rows it has not seen. Without this call a top-to-bottom run
# reaches model.predict(X_vald) with an unfitted estimator.
model.fit(X_train, y_train)

In [123]:
# Hard class labels feed the threshold metrics; log-loss needs class probabilities.
val_predictions = model.predict(X_vald)
val_prob_predictions = model.predict_proba(X_vald)

# Each metric is paired explicitly with the predictions it consumes. Previously
# the loop rebound val_predictions to the probability matrix, which was only
# correct because LogLoss happened to be evaluated last.
validation_metrics = [
    ('F1-score', f1_score, val_predictions),
    ('Acc', accuracy_score, val_predictions),
    ('Precision', precision_score, val_predictions),
    ('Recall', recall_score, val_predictions),
    ('LogLoss', log_loss, val_prob_predictions),
]

for metric_name, metric_func, metric_input in validation_metrics:
    metric_score = metric_func(y_vald, metric_input)
    print('{0}: {1}'.format(metric_name, metric_score))

F1-score: 0.7504597456396757
Acc: 0.9174172635139464
Precision: 0.8837421354974013
Recall: 0.6521110176619007
LogLoss: 0.17625651319209343


In [135]:
model.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.5, learning_rate=0.48, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.65)

In [124]:
#del X, y, X_train, y_train, X_vald, y_vald
gc.collect()

523

In [None]:
X_test.head()

In [136]:
predictions = model.predict_proba(X_test)

In [None]:
predictions[:,1]

## Generate submission

In [137]:
np.savetxt(
    'submission.csv', np.c_[range(len(predictions)), predictions[:,1]],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)

In [None]:
# Streaming alternative to the np.savetxt cell: predict and write the
# submission in batches instead of holding one prediction array for the whole
# test set. Row ids are the 0-based positions in X_test.
batch_rows = 100000

with open('submission.csv', 'w', buffering=1) as submission_file:
    # The header and every record need their own line terminator.
    submission_file.write('test_id,is_duplicate\n')

    for batch_start in range(0, len(X_test), batch_rows):
        # Predict only this batch; slicing with an open end (X_test[i:]) would
        # re-predict every remaining row on each iteration.
        batch = X_test.iloc[batch_start:batch_start + batch_rows]
        duplicate_probabilities = model.predict_proba(batch)[:, 1]

        for offset, probability in enumerate(duplicate_probabilities):
            # Format arguments must be passed as one tuple.
            submission_file.write('%d,%f\n' % (batch_start + offset, probability))