In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import warnings

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import xgboost as xgb
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("dark")


In [3]:
def plot_real_feature(df, fname):
    
    ix_train = np.where(df['id'] >= 0)[0]
    ix_test = np.where(df['id'] == -1)[0]
    ix_is_dup = np.where(df['is_duplicate'] == 1)[0]
    ix_not_dup = np.where(df['is_duplicate'] == 0)[0]

    fig = plt.figure(figsize=(16, 12))
    ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((3, 2), (1, 0), colspan=2)
    ax3 = plt.subplot2grid((3, 2), (2, 0))
    ax4 = plt.subplot2grid((3, 2), (2, 1))
    ax1.set_title('Distribution of %s' % fname, fontsize=20)
    sns.distplot(df.loc[ix_train][fname], 
                 bins=50, 
                 ax=ax1)    
    sns.distplot(df.loc[ix_is_dup][fname], 
                 bins=50, 
                 ax=ax2,
                 label='is dup')    
    sns.distplot(df.loc[ix_not_dup][fname], 
                 bins=50, 
                 ax=ax2,
                 label='not dup')
    ax2.legend(loc='upper right', prop={'size': 18})
    sns.boxplot(y=fname, 
                x='is_duplicate', 
                data=df.loc[ix_train], 
                ax=ax3)
    sns.violinplot(y=fname, 
                   x='is_duplicate', 
                   data=df.loc[ix_train], 
                   ax=ax4)
    plt.show()

In [4]:
# src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

# trdf =  pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
# tedf =  pd.read_csv(src + 'df_test_spacylemmat_fullclean.csv').iloc[:, 4:]

#tr =  pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv').iloc[:, :-1]
#te =  pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv').iloc[:, 4:]

trdf = pd.read_csv('input/train.csv').iloc[:, :-1]
tedf = pd.read_csv('input/test.csv')

tr = pd.concat([trdf, tedf], ignore_index = True)

In [5]:
g = nx.Graph()
g.add_nodes_from(tr.question1)
g.add_nodes_from(tr.question2)
edges = list(tr[['question1', 'question2']].to_records(index=False))
g.add_edges_from(edges)

print('Number of unique questions:', len(set(tr.question1) | set(tr.question2)), g.number_of_nodes())
print('Number of rows in the data:', len(tr), g.number_of_edges())

d = g.degree()
print('Mean number of connections:', np.mean([d[k] for k in d]))

Number of unique questions: 4789032 4789032
Number of rows in the data: 2750086 2743415
Mean number of connections: 1.14570752503


http://networkx.readthedocs.io/en/stable/reference/algorithms.clustering.html

In [24]:
comb = pd.DataFrame()
#comb['id'] = tr['id']
#comb['is_duplicate'] = tr['is_duplicate']

degrees_dict = g.degree()
comb['q1_degrees'] = tr['question1'].map(degrees_dict)
comb['q2_degrees'] = tr['question2'].map(degrees_dict)

cluster_dict = nx.cluster.clustering(g)
comb['q1_cluster'] = tr['question1'].map(cluster_dict)
comb['q2_cluster'] = tr['question2'].map(cluster_dict)

cluster_square_dict = nx.cluster.square_clustering(g)
comb['q1_squared_cluster'] = tr['question1'].map(cluster_square_dict)
comb['q2_squared_cluster'] = tr['question2'].map(cluster_square_dict)

cluster_triangles_dict = nx.triangles(g)
comb['q1_triangles_cluster'] = tr['question1'].map(cluster_triangles_dict)
comb['q2_triangles_cluster'] = tr['question2'].map(cluster_triangles_dict)

In [25]:
comb['degrees_max'] = comb[['q1_degrees', 'q2_degrees']].max(1)
comb['degrees_min'] = comb[['q1_degrees', 'q2_degrees']].min(1)
comb['sum_degrees'] = comb.q1_degrees + comb.q2_degrees
comb['diff_degrees'] = abs(comb.q1_degrees - comb.q2_degrees)

comb['cluster_max'] = comb[['q1_cluster', 'q2_cluster']].max(1)
comb['cluster_min'] = comb[['q1_cluster', 'q2_cluster']].min(1)
comb['sum_cluster'] = comb.q1_cluster + comb.q2_cluster
comb['diff_cluster'] = abs(comb.q1_cluster - comb.q2_cluster)

comb['squared_cluster_max'] = comb[['q1_squared_cluster', 'q2_squared_cluster']].max(1)
comb['squared_cluster_min'] = comb[['q1_squared_cluster', 'q2_squared_cluster']].min(1)
comb['sum_squared_cluster'] = comb.q1_squared_cluster + comb.q2_squared_cluster
comb['diff_squared_cluster'] = abs(comb.q1_squared_cluster - comb.q2_squared_cluster)

comb['triangles_cluster_max'] = comb[['q1_triangles_cluster', 'q2_triangles_cluster']].max(1)
comb['triangles_cluster_min'] = comb[['q1_triangles_cluster', 'q2_triangles_cluster']].min(1)
comb['sum_triangles_cluster'] = comb.q1_triangles_cluster + comb.q2_triangles_cluster
comb['diff_triangles_cluster'] = abs(comb.q1_triangles_cluster - comb.q2_triangles_cluster)

In [26]:
cluster_neighbors = nx.average_neighbor_degree(g)
comb['q1_neighbors'] = tr['question1'].map(cluster_neighbors)
comb['q2_neighbors'] = tr['question2'].map(cluster_neighbors)

In [27]:
comb['neighbors_max'] = comb[['q1_neighbors', 'q2_neighbors']].max(1)
comb['neighbors_min'] = comb[['q1_neighbors', 'q2_neighbors']].min(1)
comb['sum_neighbors'] = comb.q1_neighbors + comb.q2_neighbors
comb['diff_neighbors'] = abs(comb.q1_neighbors - comb.q2_neighbors)

In [34]:
cluster_closeness_centrality = nx.closeness_centrality(g)
comb['q1_closeness_centrality'] = tr['question1'].map(cluster_closeness_centrality)
comb['q2_closeness_centrality'] = tr['question2'].map(cluster_closeness_centrality)

In [37]:
comb['closeness_centrality_max'] = comb[['q1_closeness_centrality', 'q2_closeness_centrality']].max(1)
comb['closeness_centrality_min'] = comb[['q1_closeness_centrality', 'q2_closeness_centrality']].min(1)
comb['sum_closeness_centrality'] = comb.q1_closeness_centrality + comb.q2_closeness_centrality
comb['diff_closeness_centrality'] = abs(comb.q1_closeness_centrality - comb.q2_closeness_centrality)

In [41]:
cluster_degree_centrality = nx.degree_centrality(g)
print('Starting map')
comb['q1_degree_centrality'] = tr['question1'].map(cluster_degree_centrality)
comb['q2_degree_centrality'] = tr['question2'].map(cluster_degree_centrality)

Starting map


In [42]:
comb['degree_centrality_max'] = comb[['q1_degree_centrality', 'q2_degree_centrality']].max(1)
comb['degree_centrality_min'] = comb[['q1_degree_centrality', 'q2_degree_centrality']].min(1)
comb['sum_degree_centrality'] = comb.q1_degree_centrality + comb.q2_degree_centrality
comb['diff_degree_centrality'] = abs(comb.q1_degree_centrality - comb.q2_degree_centrality)

In [45]:
cluster_pagerank_sp = nx.pagerank_scipy(g)
print('Starting map')
comb['q1_pagerank_sp'] = tr['question1'].map(cluster_pagerank_sp)
comb['q2_pagerank_sp'] = tr['question2'].map(cluster_pagerank_sp)

Starting map


In [46]:
comb['pagerank_sp_max'] = comb[['q1_pagerank_sp', 'q2_pagerank_sp']].max(1)
comb['pagerank_sp_min'] = comb[['q1_pagerank_sp', 'q2_pagerank_sp']].min(1)
comb['sum_pagerank_sp'] = comb.q1_pagerank_sp + comb.q2_pagerank_sp
comb['diff_pagerank_sp'] = abs(comb.q1_pagerank_sp - comb.q2_pagerank_sp)

In [50]:
cluster_eigenvector_centrality_np = nx.eigenvector_centrality_numpy(g)
comb['q1_eigenvector_centrality_np'] = tr['question1'].map(cluster_eigenvector_centrality_np)
comb['q2_eigenvector_centrality_np'] = tr['question2'].map(cluster_eigenvector_centrality_np)

In [51]:
comb['eigenvector_centrality_np_max'] = comb[['q1_eigenvector_centrality_np', 'q2_eigenvector_centrality_np']].max(1)
comb['eigenvector_centrality_np_min'] = comb[['q1_eigenvector_centrality_np', 'q2_eigenvector_centrality_np']].min(1)
comb['sum_eigenvector_centrality_np'] = comb.q1_eigenvector_centrality_np + comb.q2_eigenvector_centrality_np
comb['diff_eigenvector_centrality_np'] = abs(comb.q1_eigenvector_centrality_np - comb.q2_eigenvector_centrality_np)

In [60]:
train_counts = pd.read_csv('input/train_comb_feats.csv', usecols=['q1_counts', 'q2_counts'])
test_counts = pd.read_csv('input/test_comb_feats.csv', usecols=['q1_counts', 'q2_counts'])
comb_counts = pd.concat([train_counts, test_counts], ignore_index=True)
comb = pd.concat([comb, comb_counts], 1)

In [63]:
comb['counts_max'] = comb[['q1_counts', 'q2_counts']].max(1)
comb['counts_min'] = comb[['q1_counts', 'q2_counts']].min(1)
comb['sum_counts'] = comb.q1_counts + comb.q2_counts
comb['diff_counts'] = abs(comb.q1_counts - comb.q2_counts)
comb['mult_counts'] = comb.q1_counts * comb.q2_counts

In [65]:
features = ['counts', 'degrees', 'cluster', 'squared_cluster', 'triangles_cluster', 'neighbors', 'closeness_centrality',
             'degree_centrality', 'pagerank_sp', 'eigenvector_centrality_np']
for feat1 in features:
    for feat2 in features:
        comb['q1_'+feat1+'_by_q1_'+feat2] = comb.groupby(['q1_'+feat2])['q1_'+feat1].transform('mean')
        comb['q1_'+feat1+'_by_q2_'+feat2] = comb.groupby(['q2_'+feat2])['q1_'+feat1].transform('mean')
        comb['q2_'+feat1+'_by_q1_'+feat2] = comb.groupby(['q1_'+feat2])['q2_'+feat1].transform('mean')
        comb['q2_'+feat1+'_by_q2_'+feat2] = comb.groupby(['q2_'+feat2])['q2_'+feat1].transform('mean')

In [67]:
comb.head(15)

Unnamed: 0,q1_degrees,q2_degrees,q1_cluster,q2_cluster,q1_squared_cluster,q2_squared_cluster,q1_triangles_cluster,q2_triangles_cluster,degrees_max,degrees_min,...,q2_eigenvector_centrality_np_by_q1_degree_centrality,q2_eigenvector_centrality_np_by_q2_degree_centrality,q1_eigenvector_centrality_np_by_q1_pagerank_sp,q1_eigenvector_centrality_np_by_q2_pagerank_sp,q2_eigenvector_centrality_np_by_q1_pagerank_sp,q2_eigenvector_centrality_np_by_q2_pagerank_sp,q1_eigenvector_centrality_np_by_q1_eigenvector_centrality_np,q1_eigenvector_centrality_np_by_q2_eigenvector_centrality_np,q2_eigenvector_centrality_np_by_q1_eigenvector_centrality_np,q2_eigenvector_centrality_np_by_q2_eigenvector_centrality_np
0,1,2,0.0,0.0,0.0,0.0,0,0,2,1,...,4e-06,2.540147e-07,9.137852e-10,1.924594e-09,4.799199e-08,7.268954e-11,1.523951e-20,1.523951e-20,-1.957451e-20,-1.957451e-20
1,8,3,0.0,0.0,0.0,0.0,0,0,8,3,...,3e-06,6.221181e-07,8.615669e-21,4.406944e-09,4.820338e-21,1.627923e-10,-9.906106e-20,-9.906106e-20,-2.325737e-20,-2.9201719999999996e-20
2,2,1,0.0,0.0,0.0,0.0,0,0,2,1,...,1.8e-05,6.397188e-08,3.133882e-11,2.342248e-07,3.7569e-11,4.733866e-09,-1.86646e-20,-1.86646e-20,-2.6161969999999997e-20,-2.6161969999999997e-20
3,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,5.883408e-21,5.883408e-21,-1.523763e-20,-1.523763e-20
4,3,1,0.0,0.0,0.0,0.0,0,0,3,1,...,2.7e-05,6.397188e-08,9.7622e-09,5.385854e-07,3.660229e-10,9.940754e-09,-2.9017419999999996e-20,-2.9017419999999996e-20,2.2752439999999998e-20,1.808821e-20
5,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,7.598932e-21,7.598932e-21,-1.430768e-20,-1.430768e-20
6,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,-2.078459e-20,-2.078459e-20,2.831839e-20,2.831839e-20
7,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,3.7579429999999996e-20,3.7579429999999996e-20,-3.147881e-20,-3.147881e-20
8,2,3,1.0,0.666667,1.0,0.25,1,2,3,2,...,1.8e-05,6.221181e-07,6.766396e-10,6.806594e-13,1.222425e-08,1.359074e-12,2.0131179999999998e-20,3.200266e-20,-8.584788e-22,-8.584788e-22
9,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,-1.374517e-20,-1.374517e-20,9.277722999999999e-21,9.277722999999999e-21


In [68]:
comb.tail(15)

Unnamed: 0,q1_degrees,q2_degrees,q1_cluster,q2_cluster,q1_squared_cluster,q2_squared_cluster,q1_triangles_cluster,q2_triangles_cluster,degrees_max,degrees_min,...,q2_eigenvector_centrality_np_by_q1_degree_centrality,q2_eigenvector_centrality_np_by_q2_degree_centrality,q1_eigenvector_centrality_np_by_q1_pagerank_sp,q1_eigenvector_centrality_np_by_q2_pagerank_sp,q2_eigenvector_centrality_np_by_q1_pagerank_sp,q2_eigenvector_centrality_np_by_q2_pagerank_sp,q1_eigenvector_centrality_np_by_q1_eigenvector_centrality_np,q1_eigenvector_centrality_np_by_q2_eigenvector_centrality_np,q2_eigenvector_centrality_np_by_q1_eigenvector_centrality_np,q2_eigenvector_centrality_np_by_q2_eigenvector_centrality_np
2750071,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,2.708738e-21,2.708738e-21,1.359236e-21,1.359236e-21
2750072,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,-6.966384e-21,-6.966384e-21,7.366024e-21,7.366024e-21
2750073,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,-3.053938e-20,-3.053938e-20,3.252443e-20,3.252443e-20
2750074,1,3,0.0,0.0,0.0,0.0,0,0,3,1,...,4e-06,6.221181e-07,4.927419e-09,6.38292e-07,2.669653e-07,2.357428e-08,1.184743e-20,1.184743e-20,-7.171014e-20,-7.171014e-20
2750075,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,-2.2046689999999998e-20,-2.2046689999999998e-20,2.183819e-20,2.183819e-20
2750076,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,1.6750419999999998e-20,1.6750419999999998e-20,-1.839643e-20,-1.839643e-20
2750077,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,1.9520629999999998e-20,1.9520629999999998e-20,-2.2874459999999998e-20,-2.2874459999999998e-20
2750078,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,8.013391e-21,8.013391e-21,-1.2696039999999999e-20,-1.2696039999999999e-20
2750079,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,-9.514065e-21,-9.514065e-21,1.622228e-20,1.622228e-20
2750080,1,1,0.0,0.0,0.0,0.0,0,0,1,1,...,4e-06,6.397188e-08,8.316762e-13,1.259265e-09,4.321457e-13,4.566923e-11,9.230238e-21,9.230238e-21,6.412759e-21,6.412759e-21


In [None]:
comb_tr = comb.iloc[:trdf.shape[0], :]
comb_te = comb.iloc[trdf.shape[0]:, :]

comb_tr.to_pickle('train_networkfeats.pkl')
comb_te.to_pickle('test_networkfeats.pkl')

In [72]:
train_af = pd.read_csv('input/train_comb_feats.csv')
test_af = pd.read_csv('input/test_comb_feats.csv')
train_labels = pd.read_csv('train1.csv', usecols=['is_duplicate'], squeeze=True)

train_af.drop(['q1_counts', 'q2_counts', 'q_count_sum', 'q_count_diff'], 1, inplace=True)
test_af.drop(['q1_counts', 'q2_counts', 'q_count_sum', 'q_count_diff'], 1, inplace=True)

In [None]:
def quick_xgb(train_feats, test_feats, train=train_af, test=test_af, pred_trans=True,
              train_labels=train_labels, weights=np.ones(len(train_af))):
    train_id = np.arange(len(train_af))
    test_id = np.arange(len(test_af))
    
    train = pd.concat([train, train_feats], 1)
    test = pd.concat([test, test_feats], 1)
    
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = ['logloss']
    params["eta"] = 0.2
    params["subsample"] = 0.7
    params["min_child_weight"] = 5
    params["colsample_bytree"] = 0.5
    #params["max_delta_step"] = 5.0
    #params["gamma"] = 10.0
    params["max_depth"] = 10
    params["silent"] = 1
    params["seed"] = 1001
    
    skf = KFold(n_splits=10, shuffle=True, random_state=1001).split(train_labels)
    test_preds = np.zeros(len(test))
    for i, (idx_train, idx_val) in enumerate(skf):
        val_preds = np.zeros(len(train.iloc[idx_val, :]))
        d_train = xgb.DMatrix(train.iloc[idx_train, :], label=train_labels[idx_train], weight=weights[idx_train])
        d_valid = xgb.DMatrix(train.iloc[idx_val, :], label=train_labels[idx_val], weight=weights[idx_val])
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        bst = xgb.train(params, d_train, 500000, watchlist, early_stopping_rounds=10, verbose_eval=25)
        val_preds = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)
        test_preds = bst.predict(xgb.DMatrix(test), ntree_limit=bst.best_ntree_limit)
        break
    
    loss = log_loss(train_labels[idx_val], val_preds)
    def pred_transform(preds):
        a = 0.165 / 0.369191399096
        b = (1 - 0.165) / (1 - 0.369191399096)
        return a * preds / (a * preds + b * (1 - preds))
    if pred_trans:
        test_df = pd.DataFrame({"test_id": test_id, "is_duplicate": pred_transform(test_preds)})
    else:
        test_df = pd.DataFrame({"test_id": test_id, "is_duplicate": test_preds})
    print('Log Loss:', loss)
    print('Accuracy:', (train_labels[idx_val] == np.round(val_preds)).mean())
    
    now = datetime.datetime.now()
    
    test_pred_filename = "model_out/quick_preds_xgb_{:.4f}_{:%Y%m%d_%H%M}.csv.gz".format(loss, now)
    test_df.to_csv(test_pred_filename, index=False, compression='gzip')
    
    importance = bst.get_fscore()
    importance = sorted(importance.items(), key=lambda x:x[1], reverse=True)[:50]
    
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    
    plt.figure()
    df.plot()
    df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')

In [80]:
quick_xgb(comb_tr, comb_te)

[0]	train-logloss:0.575662	valid-logloss:0.576261
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 10 rounds.
[25]	train-logloss:0.224162	valid-logloss:0.244436
[50]	train-logloss:0.202643	valid-logloss:0.238037
[75]	train-logloss:0.185633	valid-logloss:0.237436
[100]	train-logloss:0.173258	valid-logloss:0.237613
Stopping. Best iteration:
[90]	train-logloss:0.177721	valid-logloss:0.23731



ValueError: arrays must all be same length

In [81]:
comb_tr.to_pickle('train_networkfeats.pkl')
comb_te.to_pickle('test_networkfeats.pkl')