## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
# Read both competition splits; the paths are relative to the notebook's working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [3]:
# Show the first rows of the freshly loaded training frame as a sanity check.
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Data Processing

In [4]:
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier

In [5]:
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        # Loading the list is loop-invariant work; keeping the result on the
        # function object stops row-wise callers from reloading it per row.
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


def _content_words(text, stop_words):
    """Return the set of lower-cased, whitespace-split tokens of ``text`` not in ``stop_words``."""
    # str() keeps the original coercion: non-string cells are tokenised via
    # their string representation instead of raising.
    return {word for word in str(text).lower().split() if word not in stop_words}


def word_match_share(row, stop_words=None):
    """Share of non-stop-word tokens that two questions have in common.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'question1'`` and ``'question2'``.
    stop_words : collection of str, optional
        Tokens to ignore. When omitted, the NLTK English stop-word list is
        used (loaded once and cached).

    Returns
    -------
    int or float
        ``2 * |shared| / (|q1 words| + |q2 words|)`` computed on the sets of
        distinct tokens, or ``0`` when either question has no tokens left
        after stop-word removal.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    q1words = _content_words(row['question1'], stop_words)
    q2words = _content_words(row['question2'], stop_words)

    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    # The words of q1 found in q2 and the words of q2 found in q1 are the same
    # set, so the intersection is counted twice in the numerator.
    shared = q1words & q2words
    return 2 * len(shared) / (len(q1words) + len(q2words))

In [None]:
from multiprocessing import Pool

def transform_data(data):
    """Compute the word-match share for every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are passed one by one to ``word_match_share``.

    Returns
    -------
    pandas.Series
        One value per row, aligned with ``data``'s index.
    """
    # No raw=True here: word_match_share looks columns up by label
    # (row['question1']), which needs each row to arrive as a Series.
    # With raw=True the callback receives a plain ndarray instead.
    return data.apply(word_match_share, axis=1)

# This takes about 7-8 minutes on i5 proc (desktop 3.5 GHz) -> TODO chunk more and increase process number

total = [train, test]

# One worker per frame. The context manager shuts the worker processes down
# once the results are back, so re-running this cell does not pile up idle workers.
with Pool(processes=2) as pool:
    train['R'], test['R'] = pool.map(transform_data, total)

# Old version without pool
#train['R'] = train.apply(word_match_share, axis=1, raw=True)
#test['R'] = test.apply(word_match_share, axis=1, raw=True)

In [None]:
# Show the first rows again to check the newly added 'R' column.
train.head()

In [None]:
def _char_count(text):
    """Number of characters in ``text``; 0 when the cell is not a string (e.g. missing)."""
    return len(text) if isinstance(text, str) else 0


def _word_count(text):
    """Number of whitespace-separated tokens in ``text``; 0 when the cell is not a string."""
    return len(text.split()) if isinstance(text, str) else 0


def add_length_features(frame):
    """Add character- and word-count columns for both questions to ``frame`` in place.

    Creates ``q1_char_num``, ``q1_word_num``, ``q2_char_num`` and
    ``q2_word_num`` from the ``question1`` / ``question2`` columns and
    returns the same frame for convenience.
    """
    for prefix, column in (('q1', 'question1'), ('q2', 'question2')):
        # Column-wise map: each feature depends on a single text cell, so
        # there is no need to build a full row object per record.
        frame[prefix + '_char_num'] = frame[column].map(_char_count)
        frame[prefix + '_word_num'] = frame[column].map(_word_count)
    return frame


# The same guarded computation is applied to both frames, so a non-string
# question yields 0 everywhere, including train's question1.
for frame in (train, test):
    add_length_features(frame)

In [None]:
# Show the first rows again to check the new character/word count columns.
train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Column used as the label and the engineered columns used as model inputs.
target = 'is_duplicate'
features = ['R', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num']

# Design matrix / label vector for the labelled data, plus the matching
# feature view of the unlabelled test frame.
X, y = train[features], train[target]
X_test = test[features]

# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_vald, y_train, y_vald = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model

In [22]:
from sklearn.metrics import f1_score, accuracy_score

# Seed the forest so the fitted model and the scores printed below are
# reproducible across runs (same seed as the train/validation split).
model = RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set once and reuse the result for both metrics.
vald_predictions = model.predict(X_vald)

f1score = f1_score(y_vald, vald_predictions)
acc = accuracy_score(y_vald, vald_predictions)

print('F1 Score: {0}'.format(f1score))
print('Acc: {0}'.format(acc))

F1 Score: 0.6476187403200825


In [None]:
# The test feature matrix is named X_test where the features are selected;
# `test_X` was never defined and raised NameError.
predictions = model.predict(X_test)

## Generate submission

In [None]:
# Pair each prediction with its positional row number (0..n-1) and write the
# two integer columns as CSV under a 'test_id,is_duplicate' header.
row_ids = range(len(predictions))
submission_rows = np.c_[row_ids, predictions]

np.savetxt(
    'submission.csv',
    submission_rows,
    fmt='%d,%d',
    delimiter=',',
    header='test_id,is_duplicate',
    comments='',  # write the header without numpy's default '# ' prefix
)