In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import warnings

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("dark")
import community

In [2]:
def plot_real_feature(df, fname):
    """Plot the distribution of one numeric feature, overall and by label.

    Draws a 3-row figure:
      * row 1 - distribution of ``fname`` over the training rows;
      * row 2 - overlaid distributions for ``is_duplicate == 1`` and
        ``is_duplicate == 0``;
      * row 3 - box plot (left) and violin plot (right) of ``fname`` grouped
        by ``is_duplicate``, training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'``, ``'is_duplicate'`` and ``fname``.
        Rows with ``id >= 0`` are treated as training rows.
    fname : str
        Name of the feature column to plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Select rows with boolean masks. The previous implementation fed the
    # *positional* output of np.where(...)[0] into the *label-based* df.loc,
    # which only picks the right rows when df has a default 0..n-1 index.
    train = df[df['id'] >= 0]
    dup_values = df.loc[df['is_duplicate'] == 1, fname]
    not_dup_values = df.loc[df['is_duplicate'] == 0, fname]

    plt.figure(figsize=(16, 12))
    ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((3, 2), (1, 0), colspan=2)
    ax3 = plt.subplot2grid((3, 2), (2, 0))
    ax4 = plt.subplot2grid((3, 2), (2, 1))

    ax1.set_title('Distribution of %s' % fname, fontsize=20)
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; switch to
    # histplot/kdeplot once the seaborn version in use is confirmed.
    sns.distplot(train[fname], bins=50, ax=ax1)

    sns.distplot(dup_values, bins=50, ax=ax2, label='is dup')
    sns.distplot(not_dup_values, bins=50, ax=ax2, label='not dup')
    ax2.legend(loc='upper right', prop={'size': 18})

    sns.boxplot(y=fname, x='is_duplicate', data=train, ax=ax3)
    sns.violinplot(y=fname, x='is_duplicate', data=train, ax=ax4)
    plt.show()

In [3]:
# Folder holding the precomputed feature CSVs. File names are appended with
# plain string concatenation, so the trailing slash is required.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

# Earlier input variants, kept disabled for reference:
#trdf =  pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
#tedf =  pd.read_csv(src + 'df_test_spacylemmat_fullclean.csv').iloc[:, 4:]
#tr =  pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv').iloc[:, :-1]
#te =  pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv').iloc[:, 4:]

# Active inputs: the NER train and test files, read in that order.
trdf, tedf = (
    pd.read_csv(src + csv_name)
    for csv_name in ('df_train_NER.csv', 'df_test_NER.csv')
)

# Stack train on top of test under a fresh 0..n-1 index, so the first
# len(trdf) rows of `tr` are the training rows.
tr = pd.concat([trdf, tedf], ignore_index=True)

In [4]:
# Build an undirected graph: every distinct question text is a node and every
# (question1, question2) row of `tr` contributes an edge.
g = nx.Graph()
g.add_nodes_from(tr.question1)
g.add_nodes_from(tr.question2)
# Pair the two columns directly instead of round-tripping through a numpy
# record array; the edges are plain (question1, question2) tuples.
edges = list(zip(tr.question1, tr.question2))
g.add_edges_from(edges)

print('Number of unique questions:', len(set(tr.question1) | set(tr.question2)), g.number_of_nodes())
# nx.Graph stores each node pair at most once, so repeated pairs collapse and
# the edge count can be lower than the row count.
print('Number of rows in the data:', len(tr), g.number_of_edges())

# g.degree() is a dict in networkx 1.x but a DegreeView (iterating as
# (node, degree) pairs) in 2.x+; dict(...) gives a node -> degree mapping on
# both, so the mean below no longer depends on the installed version.
d = dict(g.degree())
print('Mean number of connections:', np.mean(list(d.values())))

Number of unique questions: 4693234 4693234
Number of rows in the data: 2750086 2721224
Mean number of connections: 1.15963704345


Reference — NetworkX clustering algorithms (used for the clustering features below): http://networkx.readthedocs.io/en/stable/reference/algorithms.clustering.html

In [6]:
# Per-row graph features: each node-level metric is looked up once for
# question1 and once for question2 of every row in `tr`.
comb = pd.DataFrame()
# Disabled: carrying the identifier/label columns over from `tr`.
#comb['id'] = tr['id']
#comb['is_duplicate'] = tr['is_duplicate']

# Node -> value mappings. dict(...) around g.degree() keeps this a real dict
# on networkx 2.x+, where degree() returns a DegreeView rather than a dict.
degrees_dict = dict(g.degree())
cluster_dict = nx.clustering(g)
cluster_square_dict = nx.square_clustering(g)
cluster_triangles_dict = nx.triangles(g)

# Column names and their order (q1 then q2 for each metric) match the
# original hand-written assignments exactly.
for metric_name, node_values in (
    ('degrees', degrees_dict),
    ('cluster', cluster_dict),
    ('squared_cluster', cluster_square_dict),
    ('triangles_cluster', cluster_triangles_dict),
):
    comb['q1_NER_' + metric_name] = tr['question1'].map(node_values)
    comb['q2_NER_' + metric_name] = tr['question2'].map(node_values)

In [7]:
# `comb` has one row per row of the concatenated train+test frame, train
# first, so the first len(trdf) rows are train and the remainder are test.
comb_tr, comb_te = comb.iloc[:len(trdf)], comb.iloc[len(trdf):]

# Write both splits to the working directory without the index column.
comb_tr.to_csv('train_networkfeats_NER.csv', index=False)
comb_te.to_csv('test_networkfeats_NER.csv', index=False)

In [None]:
def _draw_component(component):
    """Draw one connected component in a circular layout with node labels.

    Warnings raised while drawing are suppressed for the duration of the call.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        nx.draw_circular(component, with_labels=True, alpha=0.5, font_size=8)
        plt.show()


# Lazily yield the connected components that have between 4 and 9 nodes.
# connected_components + subgraph is used instead of
# nx.connected_component_subgraphs, which was removed in networkx 2.4;
# filtering on the node set also avoids building subgraphs that are discarded.
cc = (
    g.subgraph(nodes)
    for nodes in nx.connected_components(g)
    if 3 < len(nodes) < 10
)

g1 = next(cc)
_draw_component(g1)

# The original cell fetched g2 but then drew g1 a second time (copy-paste
# slip); the second figure now shows the second component.
g2 = next(cc)
_draw_component(g2)

In [None]:
# Node and edge counts per connected component, gathered in a single pass.
# The original built every component subgraph twice (once per count) via
# nx.connected_component_subgraphs, which was removed in networkx 2.4.
node_cts, edge_cts = [], []
for component_nodes in nx.connected_components(g):
    component = g.subgraph(component_nodes)
    node_cts.append(component.number_of_nodes())
    edge_cts.append(component.number_of_edges())

cts = pd.DataFrame({'nodes': node_cts, 'edges': edge_cts})
# Mean degree of a component = 2 * edges / nodes (each edge has two ends).
cts['mean_deg'] = 2 * cts.edges / cts.nodes
# Component-size histogram with sizes above 10 pooled into the "10" bucket.
# Series.clip_upper was removed in pandas 1.0; clip(upper=...) is equivalent.
cts.nodes.clip(upper=10).value_counts().sort_index()

In [None]:
# Scatter plot of the 'edges' column against the 'nodes' column of `cts`.
cts.plot(kind='scatter', x='nodes', y='edges')
plt.show()