## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data with features

In [5]:
train = pd.read_pickle('../features/train_new.pkl')

In [17]:
test = pd.read_pickle('../features/test_new.pkl')

## Feature selection

In [6]:
from sklearn.feature_selection import VarianceThreshold

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [7]:
# features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
#             'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference',
#            'seq_simhash_distance', 'shingle_simhash_distance', 'avg_word_len_q1', 'avg_word_len_q2',
#            'avg_word_difference', 'unigrams_common_count', 'bigrams_common_count', 'unigrams_common_ratio',
#            'bigrams_common_ratio', 'word2vec_q1_mean', 'word2vec_q2_mean']

features = ['word_share', 'start_with_same_world', 'q1_char_num', 'q2_char_num',
       'q1_word_num', 'q2_word_num', 'rfidf_share', 'char_difference',
       'word_difference', 'seq_simhash_distance', 'shingle_simhash_distance',
       'avg_word_len_q1', 'avg_word_len_q2', 'avg_word_difference',
       'unigrams_common_count', 'bigrams_common_count',
       'unigrams_common_ratio', 'bigrams_common_ratio', 'cosin_sim',
       'word2vec_q1_mean', 'word2vec_q2_mean', 'q1_NN_count', 'q2_NN_count',
       'NN_diff', 'q1_RB_count', 'q2_RB_count', 'RB_diff', 'q1_VB_count',
       'q2_VB_count', 'VB_diff', 'q1_DT_count', 'q2_DT_count', 'DT_diff',
       'q1_JJ_count', 'q2_JJ_count', 'JJ_diff', 'q1_FW_count', 'q2_FW_count',
       'FW_diff', 'q1_RP_count', 'q2_RP_count', 'RP_diff', 'q1_SYM_count',
       'q2_SYM_count', 'SYM_diff']

target = 'is_duplicate'

X = train[features]
y = train[target]

## Oversampling

In [13]:
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

del pos_train, neg_train

0.19124366100096607


## Cross validation

In [14]:
X_train, X_vald, y_train, y_vald = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
X_test = test[features]

## Transformations

In [9]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Model works fine without scaling

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_vald_scaled = scaler.transform(X_vald)
X_scaled = scaler.transform(X)

## Model selection

In [15]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from xgboost import XGBClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron

from sklearn.neural_network import BernoulliRBM

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import SGDClassifier

from sklearn.grid_search import GridSearchCV

In [16]:
#model = RandomForestClassifier(n_jobs=8)   # 0.39680 (on public)
#rfc = RandomForestClassifier(max_depth=40, n_estimators=56, n_jobs=8)
#model = ExtraTreesClassifier(n_estimators=200, n_jobs=8) # 0.48183 (on public)
#model = AdaBoostClassifier()
#gbc = GradientBoostingClassifier(n_estimators=500, max_depth=4, learning_rate=0.2, subsample=0.7) # 0.34721 (on public)
#model = KNeighborsClassifier(n_jobs=8, n_neighbors=25)
#model = BernoulliNB(alpha=0.01) # 0.57
#model = SVC()
#model = LogisticRegression(max_iter=500, C=2, tol=0.01)
#model = LDA()
#model = QDA(reg_param=0.9)
#sgd = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=8, n_iter=50, eta0=0.5, epsilon=0.5)

#model.fit(X_train, y_train, test_data=[(X_test, y_test)])

#model = ExtraTreesClassifier(n_estimators=58, n_jobs=8, max_depth=40)


model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=5, subsample=0.65, gamma=1.5, seed=42,
            colsample_bytree=0.7) # 0.34785 (on public)

# xgb2 = XGBClassifier(n_estimators=700, learning_rate=0.1, max_depth=6, subsample=0.65, gamma=1.5, seed=42, 
#                      colsample_bytree=0.7)

# xgb3 = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=4, subsample=0.7, gamma=0.5, seed=42, 
#                      colsample_bytree=0.7)

# xgb4 = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=5, subsample=0.65, gamma=1.5, seed=1234,
#             colsample_bytree=0.7) # 0.34785 (on public)

# xgb5 = XGBClassifier(n_estimators=700, learning_rate=0.1, max_depth=6, subsample=0.65, gamma=1.5, seed=5552, 
#                      colsample_bytree=0.7)

# xgb6 = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=4, subsample=0.7, gamma=0.5, seed=7121, 
#                      colsample_bytree=0.7)

model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_vald, y_vald)],
         early_stopping_rounds=50, verbose=True, eval_metric='logloss')


# model = VotingClassifier(estimators=[('xgb1', xgb1), ('xgb2', xgb2), ('xgb3', xgb3), ('xgb4', xgb4),
#                                      ('xgb5', xgb5), ('xgb6', xgb6)],
#                          voting='soft', weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

#model.fit(X_train, y_train)

[0]	validation_0-logloss:0.645468	validation_1-logloss:0.64541
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.604841	validation_1-logloss:0.604714
[2]	validation_0-logloss:0.570907	validation_1-logloss:0.570659
[3]	validation_0-logloss:0.543447	validation_1-logloss:0.543081
[4]	validation_0-logloss:0.51907	validation_1-logloss:0.518724
[5]	validation_0-logloss:0.498074	validation_1-logloss:0.497703
[6]	validation_0-logloss:0.481667	validation_1-logloss:0.481271
[7]	validation_0-logloss:0.466593	validation_1-logloss:0.466211
[8]	validation_0-logloss:0.454845	validation_1-logloss:0.45445
[9]	validation_0-logloss:0.442107	validation_1-logloss:0.441694
[10]	validation_0-logloss:0.431037	validation_1-logloss:0.430619
[11]	validation_0-logloss:0.422077	validation_1-logloss:0.421675
[12]	validation_0-logloss:0.413204	validation_1-logloss:0.412826
[13

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=1.5, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.65)

## Cross validation

In [None]:
model = RandomForestClassifier(n_jobs=8) 
parameters = {
    'n_estimators' : [10, 25, 50, 80],
    'max_depth' : [3, 4, 6, 12, 20, 30, 40],
    'min_samples_leaf' : [1, 2, 3],
    'criterion': ['entropy', 'gini'],
    'n_jobs': [8]
}

clf = GridSearchCV(model, parameters, cv=5, scoring="log_loss", verbose=5, n_jobs=1)
clf.fit(X_train, y_train)
best_estimator = clf.best_estimator_
print('Best hyperparameters: ' + str(clf.best_params_))

In [None]:
model = XGBClassifier()
parameters = {
    'n_estimators' : [50],
    'max_depth' : [4, 6, 12, 35],
    'learning_rate' : [0.01, 0.05, 0.1, 0.2],
    'gamma': [0.5, 1.5, 3, 5],
    'subsample': [1, 0.7],
    'colsample_bytree': [0.7, 1]
}

clf = GridSearchCV(model, parameters, cv=5, scoring="log_loss", verbose=5, n_jobs=1)
clf.fit(X_train, y_train)
best_estimator = clf.best_estimator_
print('Best hyperparameters: ' + str(clf.best_params_))

## Evaluation

In [None]:
val_predictions = model.predict(X_vald_scaled)
val_prob_predictions = model.predict_proba(X_vald_scaled)

for metric_name, metric_func in zip(
    ['F1-score', 'Acc', 'Precision', 'Recall', 'LogLoss'],
    [f1_score, accuracy_score, precision_score, recall_score, log_loss]
):
    
    val_predictions = val_predictions if metric_name not in ['LogLoss'] else val_prob_predictions
    metric_score = metric_func(y_vald, val_predictions)
    print('{0}: {1}'.format(metric_name, metric_score))

## Prepare for submission

In [19]:
model.fit(X, y)
predictions = model.predict_proba(X_test)

## Generate submission

In [20]:
np.savetxt(
    '../submissions/submission.csv', np.c_[range(len(predictions)), predictions[:,1]],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)