In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import warnings

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import xgboost as xgb
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("dark")

In [None]:
def plot_real_feature(df, fname):
    """Draw a four-panel overview of one numeric feature.

    Panels, top to bottom:
      1. histogram/density of ``fname`` over the rows with ``id >= 0``
         (the rows this function treats as training data),
      2. overlaid histograms for rows with ``is_duplicate == 1`` and
         ``is_duplicate == 0``,
      3. box plot (left) and violin plot (right) of ``fname`` grouped by
         ``is_duplicate``, again restricted to the ``id >= 0`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``is_duplicate`` and ``fname``.
    fname : str
        Name of the feature column to plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Boolean masks select rows by condition regardless of the frame's index.
    # (Feeding positional indices from np.where to label-based .loc only works
    # when the index happens to be 0..n-1.)
    is_train = df['id'] >= 0
    is_dup = df['is_duplicate'] == 1
    not_dup = df['is_duplicate'] == 0
    train_rows = df.loc[is_train]

    plt.figure(figsize=(16, 12))
    ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((3, 2), (1, 0), colspan=2)
    ax3 = plt.subplot2grid((3, 2), (2, 0))
    ax4 = plt.subplot2grid((3, 2), (2, 1))

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
    # kept here so the rendered figure stays the same -- confirm against the
    # installed seaborn version before migrating to histplot/displot.
    ax1.set_title('Distribution of %s' % fname, fontsize=20)
    sns.distplot(train_rows[fname], bins=50, ax=ax1)

    sns.distplot(df.loc[is_dup, fname], bins=50, ax=ax2, label='is dup')
    sns.distplot(df.loc[not_dup, fname], bins=50, ax=ax2, label='not dup')
    ax2.legend(loc='upper right', prop={'size': 18})

    sns.boxplot(y=fname, x='is_duplicate', data=train_rows, ax=ax3)
    sns.violinplot(y=fname, x='is_duplicate', data=train_rows, ax=ax4)
    plt.show()

In [None]:
# Directory holding the pre-processed question CSVs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

# Alternative inputs that were used at some point (same column slicing unless noted):
#   src + 'df_train_spacylemmat_fullclean.csv'  /  src + 'df_test_spacylemmat_fullclean.csv'
#   'input/train.csv' (.iloc[:, :-1])           /  'input/test.csv' (no slicing)

# Train file: keep every column except the last one.
trdf = pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv')
trdf = trdf.iloc[:, :-1]

# Test file: skip the first four columns.
tedf = pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv')
tedf = tedf.iloc[:, 4:]

# Stack train on top of test under a fresh 0..n-1 index; the first
# len(trdf) rows of `tr` are therefore the training rows.
tr = pd.concat([trdf, tedf], ignore_index=True)

In [None]:
# Undirected graph over question texts: every distinct question1/question2
# value is a node and every row of `tr` contributes one edge between its pair.
g = nx.Graph()
g.add_nodes_from(tr.question1)
g.add_nodes_from(tr.question2)
edges = list(tr[['question1', 'question2']].to_records(index=False))
g.add_edges_from(edges)

# Sanity checks: the pandas-side count and the graph-side count are printed side by side.
# (Edges can be fewer than rows because nx.Graph stores a repeated pair only once.)
print('Number of unique questions:', len(set(tr.question1) | set(tr.question2)), g.number_of_nodes())
print('Number of rows in the data:', len(tr), g.number_of_edges())

# g.degree() is a plain dict in networkx 1.x but a view yielding (node, degree)
# pairs in 2.x+, where `d[k] for k in d` breaks. dict(...) normalises both.
d = dict(g.degree())
print('Mean number of connections:', np.mean(list(d.values())))

In [None]:
def create_q_interactions(name, frame=None):
    """Add pairwise summary columns for a per-question feature, in place.

    Reads the columns ``'q1_' + name`` and ``'q2_' + name`` and writes five new
    columns named ``min_``, ``max_``, ``mean_``, ``sum_`` and ``diff_`` + name
    (``diff_`` is the absolute difference).

    Parameters
    ----------
    name : str
        Feature suffix shared by the ``q1_`` / ``q2_`` columns.
    frame : pandas.DataFrame, optional
        Table to read from and write to. When omitted, the notebook-level
        ``comb`` frame is used, which is what the one-argument call relies on.

    Returns
    -------
    None
        ``frame`` is modified in place.
    """
    if frame is None:
        frame = comb

    q1_col, q2_col = 'q1_' + name, 'q2_' + name
    # Snapshot of the two input columns, taken before any output column is added.
    pair = frame[[q1_col, q2_col]]

    frame['min_' + name] = pair.min(axis=1)
    frame['max_' + name] = pair.max(axis=1)
    frame['mean_' + name] = pair.mean(axis=1)
    frame['sum_' + name] = frame[q1_col] + frame[q2_col]
    frame['diff_' + name] = (frame[q1_col] - frame[q2_col]).abs()

In [None]:
# Feature table, one row per row of `tr` (train rows first, then test rows).
comb = pd.DataFrame()

# Number of distinct neighbours of each question in the graph.
# len(g[q]) measures the adjacency mapping directly; `.map(g.neighbors).map(len)`
# only works when neighbors() returns a list (networkx 1.x) -- in 2.x+ it
# returns an iterator, which has no len().
comb['q1_neighbor_count'] = tr['question1'].map(lambda q: len(g[q]))
comb['q2_neighbor_count'] = tr['question2'].map(lambda q: len(g[q]))

# Derive min/max/mean/sum/diff of the two counts.
create_q_interactions('neighbor_count')

In [None]:
# For every question pair, count the nodes adjacent to both questions.
# nx.common_neighbors yields an iterable, so it is consumed and counted here.
comb['shared_neighbor_count'] = pd.Series(
    [sum(1 for _ in nx.common_neighbors(g, q1, q2))
     for q1, q2 in zip(tr['question1'], tr['question2'])],
    index=tr.index)

In [None]:
# `comb` follows the row order of `tr` (train rows first), so cutting at
# len(trdf) separates the two sets; the test part is re-indexed from 0.
comb_tr = comb.iloc[:len(trdf)]
comb_te = comb.iloc[len(trdf):].reset_index(drop=True)

# Persist both halves without the index column.
comb_tr.to_csv('train_network_neighbors.csv', index=False)
comb_te.to_csv('test_network_neighbors.csv', index=False)

#### Load, drop duplicates and save full feature set

In [None]:
# Reload the two test-side feature tables and align both on a 0..n-1 index
# so the column-wise concat below pairs rows by position.
comb_te = pd.read_csv('test_network_neighbors.csv')
# NOTE: unpickling can execute arbitrary code -- only load pickle files
# produced by this project.
test_networkfeats = pd.read_pickle('test_networkfeats.pkl').reset_index(drop=True)

df = pd.concat([test_networkfeats, comb_te], axis=1)

# Columns are compared on the first 10,000 rows only (cheaper than the full
# frame); columns that agree there but differ later would also be dropped.
dfc = df.iloc[0:10000, :]
# Positional mask: True for every column whose values repeat an earlier column.
# Working by position rather than by name also removes repeated columns that
# share a name, which a name-based set difference cannot see (this happens once
# the pickle already contains the neighbour columns).
is_duplicate_col = dfc.T.duplicated().values
duplicate_cols = sorted(df.columns[is_duplicate_col])
print('Dropping duplicate columns:', duplicate_cols)
df = df.loc[:, ~is_duplicate_col]
print('Final shape:', df.shape)

df.to_csv('test_fullnetworkfeatsTony.csv', index=False)

In [None]:
# Reload the two train-side feature tables and align both on a 0..n-1 index
# so the column-wise concat below pairs rows by position (the pickle's stored
# index is otherwise used for alignment and could introduce NaN rows).
comb_tr = pd.read_csv('train_network_neighbors.csv')
# NOTE: unpickling can execute arbitrary code -- only load pickle files
# produced by this project.
train_networkfeats = pd.read_pickle('train_networkfeats.pkl').reset_index(drop=True)

df = pd.concat([train_networkfeats, comb_tr], axis=1)

# Columns are compared on the first 10,000 rows only (cheaper than the full
# frame); columns that agree there but differ later would also be dropped.
dfc = df.iloc[0:10000, :]
# Positional mask: True for every column whose values repeat an earlier column.
# Working by position rather than by name also removes repeated columns that
# share a name, which a name-based set difference cannot see (this happens once
# the pickle already contains the neighbour columns).
is_duplicate_col = dfc.T.duplicated().values
duplicate_cols = sorted(df.columns[is_duplicate_col])
print('Dropping duplicate columns:', duplicate_cols)
df = df.loc[:, ~is_duplicate_col]
print('Final shape:', df.shape)

df.to_csv('train_fullnetworkfeatsTony.csv', index=False)

In [None]:
# Remove the raw count columns from both network-feature frames.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects;
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
train_networkfeats = train_networkfeats.drop(
    columns=['q1_counts', 'q2_counts', 'sum_counts', 'diff_counts'], errors='ignore')
test_networkfeats = test_networkfeats.drop(
    columns=['q1_counts', 'q2_counts', 'sum_counts', 'diff_counts'], errors='ignore')

In [None]:
def quick_xgb(train_feats, test_feats, train=train_af, test=test_af, pred_trans=True,
              train_labels=train_labels, weights=np.ones(len(train_af))):
    """Fit one XGBoost model on base + candidate features and report on it.

    The candidate frames are appended column-wise to the base ``train`` /
    ``test`` frames. A single booster is trained on the first split of a
    shuffled 10-fold ``KFold`` (the other nine folds are never run -- this is a
    quick check, not a cross-validation). The function then

    * prints validation log loss and accuracy,
    * writes gzipped test predictions under ``model_out/``,
    * plots the 50 features with the highest f-score.

    Parameters
    ----------
    train_feats, test_feats : pandas.DataFrame
        Candidate features. They are joined with ``axis=1``, so their index
        must match the index of ``train`` / ``test``.
    train, test : pandas.DataFrame
        Base feature frames. The defaults are bound when the function is
        defined, not when it is called.
    pred_trans : bool
        If true, test predictions go through ``pred_transform`` before saving.
    train_labels : array-like
        One binary label per training row.
    weights : array-like
        One sample weight per training row.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    train = pd.concat([train, train_feats], axis=1)
    test = pd.concat([test, test_feats], axis=1)
    # Ids are sized from the frame actually predicted on, so a caller-supplied
    # `test` of a different length than the default still yields a valid frame.
    test_id = np.arange(len(test))

    # Plain arrays make the fold indices below strictly positional.
    labels = np.asarray(train_labels)
    weights = np.asarray(weights)

    # NOTE(review): 'silent', ntree_limit and best_ntree_limit are legacy
    # xgboost names (newer releases use verbosity / iteration_range /
    # best_iteration) -- confirm against the installed version before upgrading.
    params = {
        "objective": "binary:logistic",
        "eval_metric": ['logloss'],
        "eta": 0.2,
        "subsample": 0.7,
        "min_child_weight": 5,
        "colsample_bytree": 0.5,
        # "max_delta_step": 5.0,
        # "gamma": 10.0,
        "max_depth": 10,
        "silent": 1,
        "seed": 1001,
    }

    # Only the first of the ten shuffled splits is used.
    folds = KFold(n_splits=10, shuffle=True, random_state=1001).split(labels)
    idx_train, idx_val = next(folds)

    d_train = xgb.DMatrix(train.iloc[idx_train, :], label=labels[idx_train], weight=weights[idx_train])
    d_valid = xgb.DMatrix(train.iloc[idx_val, :], label=labels[idx_val], weight=weights[idx_val])
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # The round cap is effectively unlimited; early stopping on the last
    # watchlist entry ('valid') decides when training ends.
    bst = xgb.train(params, d_train, 500000, watchlist, early_stopping_rounds=10, verbose_eval=25)
    val_preds = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)
    test_preds = bst.predict(xgb.DMatrix(test), ntree_limit=bst.best_ntree_limit)

    loss = log_loss(labels[idx_val], val_preds)

    def pred_transform(preds):
        # Re-weights the odds of each prediction by a fixed pair of ratios.
        # Presumably 0.369... is the positive rate of the training labels and
        # 0.165 the assumed positive rate of the test set -- TODO confirm.
        a = 0.165 / 0.369191399096
        b = (1 - 0.165) / (1 - 0.369191399096)
        return a * preds / (a * preds + b * (1 - preds))

    if pred_trans:
        print(test_id.shape)
        print(test_preds.shape)
        test_df = pd.DataFrame({"test_id": test_id, "is_duplicate": pred_transform(test_preds)})
    else:
        test_df = pd.DataFrame({"test_id": test_id, "is_duplicate": test_preds})
    print('Log Loss:', loss)
    print('Accuracy:', (labels[idx_val] == np.round(val_preds)).mean())

    now = datetime.datetime.now()

    # Create the output directory up front so a long training run is not lost
    # to a missing folder at write time.
    os.makedirs('model_out', exist_ok=True)
    test_pred_filename = "model_out/quick_preds_xgb_{:.4f}_{:%Y%m%d_%H%M}.csv.gz".format(loss, now)
    test_df.to_csv(test_pred_filename, index=False, compression='gzip')

    # Top 50 features by f-score, normalised so the shown scores sum to 1.
    importance = bst.get_fscore()
    importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:50]

    importance_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    importance_df['fscore'] = importance_df['fscore'] / importance_df['fscore'].sum()

    # Line plot of the normalised scores by rank, then the labelled bar chart.
    importance_df.plot()
    importance_df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')

In [None]:
# Baseline run: the only added feature is a constant column 'i' of ones, so the
# result effectively measures the base train/test frames on their own.
quick_xgb(pd.DataFrame({'i': np.ones(len(train_af))}), pd.DataFrame({'i': np.ones(len(test_af))}))

In [None]:
# Evaluate the neighbour-count features (comb_tr / comb_te) on top of the base frames.
quick_xgb(comb_tr, comb_te)

In [None]:
# Evaluate the network features as currently held in train_networkfeats / test_networkfeats.
quick_xgb(train_networkfeats, test_networkfeats)

In [None]:
# Append the neighbour-count columns to the network features (row-aligned by index).
# axis is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# NOTE: re-running this cell appends the same columns again.
train_networkfeats = pd.concat([train_networkfeats, comb_tr], axis=1)
test_networkfeats = pd.concat([test_networkfeats, comb_te], axis=1)

In [None]:
# Evaluate the network features again, now that the neighbour-count columns have been appended.
quick_xgb(train_networkfeats, test_networkfeats)

In [None]:
# Persist the current network-feature frames.
# NOTE(review): these are the same file names that the loading cells read with
# pd.read_pickle, so this overwrites the inputs with whatever columns the frames
# hold now -- a later re-run of the notebook starts from the modified files.
train_networkfeats.to_pickle('train_networkfeats.pkl')
test_networkfeats.to_pickle('test_networkfeats.pkl')