## Libraries

In [6]:
import pandas as pd
import numpy as np

## Load the data with features

In [2]:
train = pd.read_pickle('../features/train.pkl')
test = pd.read_pickle('../features/test.pkl')

## Feature selection

In [3]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [4]:
features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference',
           'seq_simhash_distance', 'shingle_simhash_distance', 'avg_word_len_q1', 'avg_word_len_q2',
           'avg_word_difference', 'unigrams_common_count', 'bigrams_common_count', 'unigrams_common_ratio',
           'bigrams_common_ratio', 'word2vec_q1_mean', 'word2vec_q2_mean']

target = 'is_duplicate'

X, y = train[features], train[target]

## Oversampling

In [7]:
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

del pos_train, neg_train

0.1694470207734739




## Cross validation

In [8]:
X_train, X_vald, y_train, y_vald = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = test[features]

## Transformations

In [9]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Model works fine without scaling

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_vald_scaled = scaler.transform(X_vald)

## Model selection

In [10]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from xgboost import XGBClassifier
#import lightgbm

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#model = RandomForestClassifier(n_estimators=150, n_jobs=8)   # 0.39680 (on public)
#model = ExtraTreesClassifier(n_estimators=62, n_jobs=8) # 0.48183 (on public)
#model = AdaBoostClassifier()
#model = GradientBoostingClassifier(n_estimators=500, max_depth=4, learning_rate=0.2, subsample=0.7) # 0.34721 (on public)
#model = KNeighborsClassifier(n_neighbors=25)
#model = MultinomialNB() # 0.57
#model = SVC()

model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=4, subsample=0.7, gamma=0.5, seed=42,
            colsample_bytree=0.7) # 0.34785 (on public)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_vald, y_vald)],
          early_stopping_rounds=50, verbose=True, eval_metric='logloss')


#model = VotingClassifier(estimators=[('xgb', xgb), ('knn', knn), ('rf', rf)],
#                        voting='soft', weights=[4.5, 1.1, 1.2])

#model.fit(X_train, y_train)

Will train until validation_1 error hasn't decreased in 50 rounds.
[0]	validation_0-logloss:0.641337	validation_1-logloss:0.641384
[1]	validation_0-logloss:0.602217	validation_1-logloss:0.602174
[2]	validation_0-logloss:0.566184	validation_1-logloss:0.566266
[3]	validation_0-logloss:0.535856	validation_1-logloss:0.536030
[4]	validation_0-logloss:0.510439	validation_1-logloss:0.510547
[5]	validation_0-logloss:0.488447	validation_1-logloss:0.488754
[6]	validation_0-logloss:0.469658	validation_1-logloss:0.469912
[7]	validation_0-logloss:0.453445	validation_1-logloss:0.453685
[8]	validation_0-logloss:0.439065	validation_1-logloss:0.439447
[9]	validation_0-logloss:0.427278	validation_1-logloss:0.427665
[10]	validation_0-logloss:0.416377	validation_1-logloss:0.416744
[11]	validation_0-logloss:0.406581	validation_1-logloss:0.407052
[12]	validation_0-logloss:0.398135	validation_1-logloss:0.398607
[13]	validation_0-logloss:0.390881	validation_1-logloss:0.391476
[14]	validation_0-logloss:0.38512

## Evaluation

In [13]:
val_predictions = model.predict(X_vald)
val_prob_predictions = model.predict_proba(X_vald)

for metric_name, metric_func in zip(
    ['F1-score', 'Acc', 'Precision', 'Recall', 'LogLoss'],
    [f1_score, accuracy_score, precision_score, recall_score, log_loss]
):
    
    val_predictions = val_predictions if metric_name not in ['LogLoss'] else val_prob_predictions
    metric_score = metric_func(y_vald, val_predictions)
    print('{0}: {1}'.format(metric_name, metric_score))

F1-score: 0.3614493395321378
Acc: 0.8526538651469828
Precision: 0.6787694013303769
Recall: 0.24630393241476414
LogLoss: 0.3002466427174439


## Prepare for submission

In [14]:
model.fit(X, y)
predictions = model.predict_proba(X_test)

## Generate submission

In [15]:
np.savetxt(
    '../submissions/submission.csv', np.c_[range(len(predictions)), predictions[:,1]],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)