In [3]:
import networkx as nx
import numpy as np
import pandas as pd

src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

df_train = pd.read_csv(src + "df_train_spacylemmat_fullclean.csv", usecols=["qid1", "qid2"])
df_test = pd.read_csv(src + "df_test_spacylemmat_fullclean_QUESTIONIDS.csv", usecols=["qid1", "qid2"])
df = pd.concat([df_train, df_test])
print("df.shape:", df.shape) # df.shape: (2750086, 2)

df.shape: (2750086, 2)


In [None]:
g = nx.Graph()
g.add_nodes_from(df.qid1)
edges = list(df[['qid1', 'qid2']].to_records(index=False))
g.add_edges_from(edges)
g.remove_edges_from(g.selfloop_edges())
print(len(set(df.qid1)), g.number_of_nodes()) # 4789604
print(len(df), g.number_of_edges()) # 2743365 (after self-edges)

df_output = pd.DataFrame(data=g.nodes(), columns=["qid"])
print("df_output.shape:", df_output.shape)

NB_CORES = 20
for k in range(2, NB_CORES + 1):
    fieldname = "kcore{}".format(k)
    print("fieldname = ", fieldname)
    ck = nx.k_core(g, k=k).nodes()
    print("len(ck) = ", len(ck))
    df_output[fieldname] = 0
    df_output.ix[df_output.qid.isin(ck), fieldname] = k
    
df_output.to_csv("question_kcores.csv", index=None)
df_cores = pd.read_csv("question_kcores.csv", index_col="qid")
df_cores.index.names = ["qid"]
df_cores['max_kcore'] = df_cores.apply(lambda row: max(row), axis=1)
df_cores[['max_kcore']].to_csv("question_max_kcores.csv") # with index

In [6]:
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

df_train = pd.read_csv(src + "df_train_spacylemmat_fullclean.csv", usecols=["question1", "question2"])
df_test = pd.read_csv(src + "df_test_spacylemmat_fullclean.csv", usecols=["question1", "question2"])
df_train.fillna('NULL', inplace = True)
df_test.fillna('NULL', inplace = True)

dfs = (df_train, df_test)
questions = []
for df in dfs:
    df['question1'] = df['question1'].str.lower()
    df['question2'] = df['question2'].str.lower()
    questions += df['question1'].tolist()
    questions += df['question2'].tolist()

graph = nx.Graph()
graph.add_nodes_from(questions)

for df in [df_train, df_test]:
    edges = list(df[['question1', 'question2']].to_records(index=False))
    graph.add_edges_from(edges)

graph.remove_edges_from(graph.selfloop_edges())

df = pd.DataFrame(data=graph.nodes(), columns=["question"])
df['kcores'] = 1
n_cores = 30
for k in tqdm(range(2, n_cores + 1)):
    ck = nx.k_core(graph, k=k).nodes()
    df['kcores'][df.question.isin(ck)] = k
print(df['kcores'].value_counts())


  0%|          | 0/29 [00:00<?, ?it/s][A
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 29/29 [24:57<00:00, 50.24s/it]

1     4637331
2       65639
3       15945
4        6811
5        3502
6        2299
7        1406
8        1018
9         932
11        624
10        496
12        414
14        395
13        306
30        236
17        201
16        164
19        140
20        131
15        123
22        120
21        110
18        107
25         86
23         82
28         70
29         33
26         30
24         22
27         13
Name: kcores, dtype: int64





In [16]:
comb = pd.concat([df_train, df_test])
questions_dict = pd.Series(df.kcores.values,index=df.question.values).to_dict()

kcores_df = pd.DataFrame()
kcores_df['q1_kcores_string'] = comb['question1'].map(questions_dict)
kcores_df['q2_kcores_string'] = comb['question2'].map(questions_dict)
kcores_df['kcores_string_sum'] = kcores_df['q1_kcores_string'] + kcores_df['q2_kcores_string']
kcores_df['kcores_string_diff'] = np.abs(kcores_df['q1_kcores_string'] - kcores_df['q2_kcores_string'])
kcores_df['kcores_string_product'] = kcores_df['q1_kcores_string'] * kcores_df['q2_kcores_string']
kcores_df['kcores_string_div'] = kcores_df['q1_kcores_string'] / kcores_df['q2_kcores_string']

train_kcores = kcores_df.iloc[:df_train.shape[0], :]
test_kcores = kcores_df.iloc[df_train.shape[0]:, :]

train_kcores.to_pickle('train_Kcores_string.pkl')
test_kcores.to_pickle('test_Kcores_string.pkl')