## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data with features

In [2]:
train = pd.read_pickle('../features/train.pkl')

In [None]:
test = pd.read_pickle('../features/test.pkl')

## Feature selection

In [3]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [4]:
features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference',
           'seq_simhash_distance', 'shingle_simhash_distance', 'avg_word_len_q1', 'avg_word_len_q2',
           'avg_word_difference', 'unigrams_common_count', 'bigrams_common_count', 'unigrams_common_ratio',
           'bigrams_common_ratio', 'word2vec_q1_mean', 'word2vec_q2_mean']

target = 'is_duplicate'

X, y = train[features], train[target]

## Oversampling

In [5]:
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

del pos_train, neg_train

0.19124366100096607


## Cross validation

In [6]:
X_train, X_vald, y_train, y_vald = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_test = test[features]

## Transformations

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Model works fine without scaling

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_vald_scaled = scaler.transform(X_vald)

## Model selection

In [7]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, log_loss

from xgboost import XGBClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron

from sklearn.neural_network import BernoulliRBM

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.grid_search import GridSearchCV

In [None]:
model = RandomForestClassifier(n_jobs=8)   # 0.39680 (on public)
#model = ExtraTreesClassifier(n_estimators=200, n_jobs=8) # 0.48183 (on public)
#model = AdaBoostClassifier()
#model = GradientBoostingClassifier(n_estimators=500, max_depth=4, learning_rate=0.2, subsample=0.7) # 0.34721 (on public)
#model = KNeighborsClassifier(n_jobs=8, n_neighbors=25)
#model = BernoulliNB(alpha=0.01) # 0.57
#model = SVC()
#model = LogisticRegression(max_iter=500, C=2, tol=0.01)

#model.fit(X_train, y_train, test_data=[(X_test, y_test)])

#model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=8, subsample=0.7, gamma=0.5, seed=42,
#            colsample_bytree=0.7) # 0.34785 (on public)
#model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_vald, y_vald)],
#          early_stopping_rounds=50, verbose=True, eval_metric='logloss')


#model = VotingClassifier(estimators=[('xgb', xgb), ('knn', knn), ('rf', rf)],
#                        voting='soft', weights=[4.5, 1.1, 1.2])

#model.fit(X_train, y_train)

In [None]:
model = RandomForestClassifier(n_jobs=8) 
parameters = {
    'n_estimators' : [10, 25, 50, 80],
    'max_depth' : [3, 4, 6, 12, 20, 30, 40],
    'min_samples_leaf' : [1, 2, 3],
    'criterion': ['entropy', 'gini'],
    'n_jobs': [8]
}

clf = GridSearchCV(model, parameters, cv=5, scoring="log_loss", verbose=5, n_jobs=1)
clf.fit(X_train, y_train)
best_estimator = clf.best_estimator_
print('Best hyperparameters: ' + str(clf.best_params_))

In [None]:
model = XGBClassifier()
parameters = {
    'n_estimators' : [500],
    'max_depth' : [3, 4, 6, 12, 20, 30, 40],
    'learning_rate' : [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3],
    'gamma': [0, 0.2, 0.5, 0.7],
    'subsample': [1, 0.7, 0.5]
}

clf = GridSearchCV(model, parameters, cv=5, scoring="log_loss", verbose=5, n_jobs=1)
clf.fit(X_train, y_train)
best_estimator = clf.best_estimator_

In [None]:
model = RandomForestClassifier(max_depth=40, n_estimators=80, n_jobs=8)

## Evaluation

In [None]:
val_predictions = model.predict(X_vald)
val_prob_predictions = model.predict_proba(X_vald)

for metric_name, metric_func in zip(
    ['F1-score', 'Acc', 'Precision', 'Recall', 'LogLoss'],
    [f1_score, accuracy_score, precision_score, recall_score, log_loss]
):
    
    val_predictions = val_predictions if metric_name not in ['LogLoss'] else val_prob_predictions
    metric_score = metric_func(y_vald, val_predictions)
    print('{0}: {1}'.format(metric_name, metric_score))

## Prepare for submission

In [None]:
model.fit(X, y)
predictions = model.predict_proba(X_test)

## Generate submission

In [None]:
np.savetxt(
    '../submissions/submission.csv', np.c_[range(len(predictions)), predictions[:,1]],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)