In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from multiprocessing import Pool
import time
import pickle

In [3]:
train = pd.read_csv("../data/open_data/sample_train.txt", delimiter="\t")
valid = pd.read_csv("../data/open_data/valid_id.txt", delimiter="\t")
test = pd.read_csv("../data/open_data/test_id.txt", delimiter="\t")
df = pd.concat([train, test, valid], axis=0)

# 构建图

In [6]:
def build_graph():
    edge = pd.read_csv("../data/open_data/dat_edge/dat_edge_1", delimiter="\t")
    from_id = []
    to_id = []
    dates = []
    nums = []
    weights = []
    for i, row in edge.iterrows():
        for t in row.info.split(","):
            from_id.append(row.from_id)
            to_id.append(row.to_id)

            date, nums_weight = t.split(":")
            num, weight = nums_weight.split("_")

            dates.append(date)
            nums.append(num)
            weights.append(weight)
    graph = pd.DataFrame({"from_id": from_id, "to_id": to_id, "date": dates, "num":nums, "weight":weights})
    graph.to_csv("graph", index=False)
    graph.weight = graph.weight + 1
    a = graph.groupby(["from_id", "to_id"]).weight.max()
    a = a.reset_index()

    with open("graph_for_emb.txt", "w") as f:
        for i, row in a.iterrows(): 
            f.write("%d %d %d\n" % (row.from_id, row.to_id, row.weight))

    graph_filter = graph[graph.from_id.isin(df.id) | graph.to_id.isin(df.id)]
    graph_filter.to_csv("graph_filter.csv", index=False)

In [8]:
edge = pd.read_csv("../data/open_data/dat_edge/dat_edge_1", delimiter="\t")

In [9]:
from_id = []
to_id = []
dates = []
nums = []
weights = []
for i, row in edge.iterrows():
    for t in row.info.split(","):
        from_id.append(row.from_id)
        to_id.append(row.to_id)

        date, nums_weight = t.split(":")
        num, weight = nums_weight.split("_")

        dates.append(date)
        nums.append(num)
        weights.append(weight)

In [10]:
graph = pd.DataFrame({"from_id": from_id, "to_id": to_id, "date": dates, "num":nums, "weight":weights})

In [16]:
graph['weight'] = graph['weight'].astype(int)

In [17]:
graph['weight'] = graph['weight']+1

In [19]:
a =graph.groupby(["from_id", "to_id"]).weight.max()
a = a.reset_index()

In [20]:
with open("graph_for_emb.txt", "w") as f:
    for i, row in a.iterrows(): 
        f.write("%d %d %d\n" % (row.from_id, row.to_id, row.weight))


In [21]:
graph_filter = graph[graph.from_id.isin(df.id) | graph.to_id.isin(df.id)]
graph_filter.to_csv("graph_filter.csv", index=False)

# 链路分析

In [22]:
import networkx as nx
import pickle

In [23]:
def link_analysis():
    graph = pd.read_csv("graph")

    a = graph.groupby(["from_id", "to_id"]).weight.sum()
    a = a.reset_index()

    with open("graph_for_pagerank.txt", "w") as f:
        for i, row in a.iterrows(): 
            f.write("%d %d %d\n" % (row.from_id, row.to_id, row.weight))

    G = nx.DiGraph()
    with open("graph_for_pagerank.txt", "r") as f:
        for line in f:
            from_id, to_id, weight = line.strip().split()
            G.add_edge(int(from_id), int(to_id), weight=int(weight))

    pr = nx.pagerank(G)

    with open("pagerank.plk", "wb") as f:
        pickle.dump(pr, f)

    graph_filter = pd.read_csv("graph_filter.csv")
    graph_filter_ids = set(graph_filter.to_id.tolist()) | set(graph_filter.from_id.tolist())

    h,a=nx.hits(G)

    with open("dh.plk", "wb") as f:
        pickle.dump(h, f)

    with open("a.plk", "wb") as f:
        pickle.dump(a, f)
        
    dc = degree_centrality(G)
    with open("degree_centrality.plk", "wb") as f:
        pickle.dump(dc, f)

In [24]:
link_analysis()

NameError: name 'degree_centrality' is not defined

# 筛选embedding

In [4]:
graph = pd.read_csv("data/graph")
graph_filter = graph[graph.from_id.isin(df.id) | graph.to_id.isin(df.id)]
graph_filter.to_csv("data/graph_filter.csv", index=False)

graph_emb = pd.read_csv("data/deepwalk_192.emb", delimiter=" ", names=["id"] + ["dp_%d" % i for i in range(192)], skiprows=1)
graph_emb[graph_emb.id.isin(df.id)].to_csv("features/graph/deepwalk_192_filtered.emb", index=False)

graph_emb = pd.read_csv("data/deepwalk_128.emb", delimiter=" ", names=["id"] + ["dp_%d" % i for i in range(128)], skiprows=1)
graph_emb[graph_emb.id.isin(df.id)].to_csv("features/graph/deepwalk_128_filtered.emb", index=False)

graph_emb = pd.read_csv("data/deepwalk_256.emb", delimiter=" ", names=["id"] + ["dp_%d" % i for i in range(256)], skiprows=1)
graph_emb[graph_emb.id.isin(df.id)].to_csv("features/graph/deepwalk_256_filtered.emb", index=False)

# 基本特征

In [5]:
def risk_feature():
    risk = pd.read_csv("data/dat_risk.txt", delimiter="\t")
    risk['total'] = risk[["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt"]].sum(axis=1)
    for c in ["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt"]:
        risk[c + "_ratio"] = risk[c] / risk.total
    risk.to_csv("features/risk/risk.csv", index=False)

In [6]:
def symbol_feature():
    dat_symbol = pd.read_csv("data/dat_symbol.txt", delimiter="\t")
    dat_symbol['cat_count'] = dat_symbol.symbol.apply(lambda x: len(x.split(",")))
    dat_symbol['symbol'] = dat_symbol.symbol.apply(lambda x:" ".join([i for i in x.split(",")]))
    dat_symbol['symbol_1'] = dat_symbol.symbol.apply(lambda x:" ".join([i.split("_")[0] for i in x.split(" ")]))
    
    vectorizer = CountVectorizer()
    a = vectorizer.fit_transform(dat_symbol.symbol)
    lev2 = pd.DataFrame(a.toarray(), columns=["lev_2_" + str(i) for i in range(44)])
    b = vectorizer.fit_transform(dat_symbol.symbol_1)
    lev1 = pd.DataFrame(b.toarray(), columns=["lev_1_" + str(i) for i in range(24)])

    dat_symbol = dat_symbol.join(lev1)
    dat_symbol = dat_symbol.join(lev2)

    dat_symbol.drop(["symbol", "symbol_1"], axis=1, inplace=True)
    dat_symbol.to_csv("features/symbol/symbol.csv", index=False)

In [7]:
def graph_feature(df, graph_filter):
    graph_filter.date = pd.to_datetime(graph_filter.date)

    out_degree = graph_filter.groupby("from_id").to_id.count().reset_index().rename(columns={"from_id":"id", "to_id": "out_degree"})
    in_degree = graph_filter.groupby("to_id").from_id.count().reset_index().rename(columns={"to_id": "id", "from_id":"in_degree"})

    out_num = graph_filter.groupby("from_id").num.sum().reset_index().rename(columns={"from_id":"id", "num": "out_sum"})
    in_num = graph_filter.groupby("to_id").num.sum().reset_index().rename(columns={"to_id": "id", "num":"in_sum"})

    in_weight = graph_filter.groupby("to_id").weight.sum().reset_index().rename(columns={"to_id": "id", "weight":"in_weight"})
    out_weight = graph_filter.groupby("from_id").weight.sum().reset_index().rename(columns={"from_id":"id", "weight": "out_weight"})

#     graph_filter = graph_filter.sort_values("date")

#     in_span = (graph_filter.groupby("to_id").date.last().dt.year - graph_filter.groupby("to_id").date.first().dt.year) * 12 + (graph_filter.groupby("to_id").date.last().dt.month - graph_filter.groupby("to_id").date.first().dt.month)
#     out_span = (graph_filter.groupby("from_id").date.last().dt.year - graph_filter.groupby("from_id").date.first().dt.year) * 12 + (graph_filter.groupby("from_id").date.last().dt.month - graph_filter.groupby("from_id").date.first().dt.month)
#     in_span = in_span.reset_index().rename(columns={"to_id":"id", "date": "in_span"})
#     out_span = out_span.reset_index().rename(columns={"from_id":"id", "date": "out_span"})

    in_unique = graph_filter.groupby("to_id").from_id.nunique().reset_index().rename(columns={"to_id":"id", "from_id": "in_nunique"})
    out_unique = graph_filter.groupby("from_id").to_id.nunique().reset_index().rename(columns={"from_id":"id", "to_id": "out_nunique"})

    graph_info = df[['id']]
    graph_info = graph_info.merge(out_degree, on="id")
    graph_info = graph_info.merge(in_degree, on="id")

    graph_info = graph_info.merge(out_num, on="id")
    graph_info = graph_info.merge(in_num, on="id")

    graph_info = graph_info.merge(out_weight, on="id")
    graph_info = graph_info.merge(in_weight, on="id")

    graph_info = graph_info.merge(out_unique, on="id")
    graph_info = graph_info.merge(in_unique, on="id")
    
    common_id = set(graph_filter.from_id.tolist()) & set(graph_filter.to_id.tolist())

    from_dict = {}
    for i in df.id:
        from_dict[i] = set(graph_filter[graph_filter.from_id == i].to_id.values)

    to_dict = {}
    for i in df.id:
        to_dict[i] = set(graph_filter[graph_filter.to_id == i].from_id.values)

    common_id = {}
    for i in df.id:
        common_id[i] = from_dict[i] & to_dict[i]

    graph_info['common_num'] = graph_info.id.apply(lambda x: len(common_id[x]))

    graph_info.to_csv("features/graph/graph_info.csv", index=False)

In [8]:
def app_feature(graph_filter):
    app = pd.read_csv("data/dat_app.txt", delimiter="\t", header=None, names=["id", "app_list"])
    app = app[app.id.isin(graph_filter.from_id) | app.id.isin(graph_filter.to_id)]
    # app = app[app.id.isin(graph2_id & app_id)]
    # app = app[app.id.isin(df.id)]

    app['apps'] = app.app_list.apply(lambda x: " ".join(x.split(",")))

    d = {}
    def count(x):
        for i in x.split(" "):
            d[i] = d.get(i, 0) + 1

    _ = app.apps.apply(count)

    app['app_num'] = app.apps.apply(lambda x: len(x.split(" ")))
    app["app_freq_sum"] = app.apps.apply(lambda x: sum([d[i] for i in x.split(" ")]))
    app['app_num_mean'] = app.app_freq_sum / app.app_num

    app['app_freq_max'] = app.apps.apply(lambda x: max([d[i] for i in x.split(" ")]))
    app['app_freq_min'] = app.apps.apply(lambda x: min([d[i] for i in x.split(" ")]))
    app['app_freq_median'] = app.apps.apply(lambda x: np.median([d[i] for i in x.split(" ")]))
    app['app_freq_var'] = app.apps.apply(lambda x: np.var([d[i] for i in x.split(" ")]))

    app_info = app[app.id.isin(df.id)]
    app_info[["id", "app_num", "app_freq_sum", "app_num_mean", "app_freq_median", "app_freq_var"]].to_csv("features/app/app_info.csv", index=False)

    a = pd.DataFrame({"app":list(d.keys()), "count": list(d.values())})
    vocab = a.sort_values("count", ascending=False).head(4000).app.tolist()
    vectorizer = CountVectorizer(vocabulary=vocab)
    vector = vectorizer.fit_transform(app.apps)

    dim = 16
    pca = PCA(n_components=dim)
    pca_res = pca.fit_transform(vector.toarray())
    app_pca = pd.DataFrame(pca_res, columns=["pca_%d" % i for i in range(dim)])
    app_pca["id"] = app.id.values
    app_pca.to_csv("features/app/app_pca_%d.csv" % dim, index=False)

    dim = 16
    lda = LatentDirichletAllocation(n_components=dim, n_jobs=32)
    lda_res = lda.fit_transform(vector.toarray())
    app_lda = pd.DataFrame(lda_res, columns=["lda_%d" % i for i in range(dim)])
    app_lda["id"] = app.id.values
    app_lda.to_csv("features/app/app_lda_%d.csv" % dim, index=False)

    dim = 16
    nmf = NMF(n_components=dim, init='random', random_state=0)
    nmf_res = nmf.fit_transform(vector.toarray())
    app_nmf = pd.DataFrame(nmf_res, columns=["nmf_%d" % i for i in range(dim)])
    app_nmf["id"] = app.id.values
    app_nmf.to_csv("features/app/app_nmf_%d.csv" % dim, index=False)

In [7]:
graph_filter = pd.read_csv("data/graph_filter.csv")

In [12]:
graph_feature(df, graph_filter)
app_feature(graph_filter)
symbol_feature()
risk_feature()



# 联系特征

In [14]:
def feature_with_graph(graph_filter, other_df, feature_cols, to_dir, new_col_name, func, weight_type, ids=df.id):
    task_name = "%s%s_%s" % (new_col_name, weight_type, func)
    start = time.time()
    to_df = graph_filter.rename(columns={"to_id": "id"}).merge(other_df, on="id", how="left").drop("id", axis=1).rename(columns={"from_id":"id"})
    from_df = graph_filter.rename(columns={"from_id": "id"}).merge(other_df, on="id", how="left").drop("id", axis=1).rename(columns={"to_id":"id"})
    
    to_df = to_df.merge(to_df.groupby("id")["num", "weight"].sum().reset_index().rename(columns={"num":"num_sum_total", "weight":"weight_sum_total"}), on="id", how="left")
    from_df = from_df.merge(from_df.groupby("id")["num", "weight"].sum().reset_index().rename(columns={"num":"num_sum_total", "weight":"weight_sum_total"}), on="id", how="left")
    
    if weight_type == "_num":
        for f in feature_cols:
            to_df[f] = to_df[f] * to_df["num"]
            from_df[f] = from_df[f] * from_df["num"]
    elif weight_type == "_weight":
        for f in feature_cols:
            to_df[f] = to_df[f] * to_df["weight"]
            from_df[f] = from_df[f] * from_df["weight"]

    if weight_type in ["_num", "_weight"] and func == "mean":
        to_df[f] /= to_df[weight_type[1:] + "_sum_total"]
        from_df[f] /= from_df[weight_type[1:] + "_sum_total"]
        a = to_df.groupby("id")[feature_cols].agg("sum").reset_index()
        b = from_df.groupby("id")[feature_cols].agg("sum").reset_index()
    else:
        a = to_df.groupby("id")[feature_cols].agg(func).reset_index()
        b = from_df.groupby("id")[feature_cols].agg(func).reset_index()
    
    if new_col_name == "symbol":
        a['to_%s%s_count' % (new_col_name, weight_type)] = a[[c for c in a.columns if c != "id"]].sum(axis=1)
        b['from_%s%s_count' % (new_col_name, weight_type)] = b[[c for c in b.columns if c != "id"]].sum(axis=1)

    a.columns = ["id"] + ["to_%s%s_%s_%d" % (new_col_name, weight_type, func, i) for i in range(1, len(a.columns))]
    b.columns = ["id"] + ["from_%s%s_%s_%d" % (new_col_name, weight_type, func, i) for i in range(1, len(a.columns))]

    a[a.id.isin(ids)].to_csv("features/%s/to_%s%s_%s.csv" % (to_dir, new_col_name, weight_type, func), index=False)
    b[b.id.isin(ids)].to_csv("features/%s/from_%s%s_%s.csv" % (to_dir, new_col_name, weight_type, func), index=False)
    
    end = time.time()
    print('Task %s runs %0.2f seconds.' % (task_name, (end - start)))

一度联系人

symbol_graph

In [14]:
dat_symbol = pd.read_csv("features/symbol/symbol.csv")
lev_f = []
for f in dat_symbol.columns:
    if f[:5] == "lev_1":
        lev_f.append(f)

In [15]:
symbol_args_list = [
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", "sum", ""),
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", "sum", "_num"),
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", "sum", "_weight"),
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", "mean", ""),
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", "mean", "_num"),
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", "mean", "_weight"),
]

app_graph

In [16]:
app_pca = pd.read_csv("features/app/app_pca_16.csv")
app_lda = pd.read_csv("features/app/app_lda_16.csv")
app_nmf = pd.read_csv("features/app/app_nmf_16.csv")

In [17]:
app_args_list = [
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", ""),
    (graph_filter, app_lda, ["lda_%d" % i for i in range(16)], "app_graph", "app_lda", "mean", ""),
    (graph_filter, app_nmf, ["nmf_%d" % i for i in range(16)], "app_graph", "app_nmf", "mean", ""),
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", "_num"),
    (graph_filter, app_lda, ["lda_%d" % i for i in range(16)], "app_graph", "app_lda", "mean", "_num"),
    (graph_filter, app_nmf, ["nmf_%d" % i for i in range(16)], "app_graph", "app_nmf", "mean", "_num"),
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", "_weight"),
    (graph_filter, app_lda, ["lda_%d" % i for i in range(16)], "app_graph", "app_lda", "mean", "_weight"),
    (graph_filter, app_nmf, ["nmf_%d" % i for i in range(16)], "app_graph", "app_nmf", "mean", "_weight")
]

risk_graph

In [18]:
risk = pd.read_csv("features/risk/risk.csv")
risk_f = ["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt", "total"]
risk_ratio_f = ["a_cnt_ratio", "b_cnt_ratio", "c_cnt_ratio", "d_cnt_ratio", "e_cnt_ratio"]

In [19]:
risk_args_list = [
    (graph_filter, risk, risk_f, "risk_graph", "risk", "mean", ""),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "mean", "_num"),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "mean", "_weight"),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "sum", ""),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "sum", "_num"),
    (graph_filter, risk, risk_f, "risk_graph", "risk", "sum", "_weight"),
    (graph_filter, risk, risk_ratio_f, "risk_graph", "risk_ratio", "mean", ""),
    (graph_filter, risk, risk_ratio_f, "risk_graph", "risk_ratio", "mean", "_num"),
    (graph_filter, risk, risk_ratio_f, "risk_graph", "risk_ratio", "mean", "_weight")
]

In [20]:
p = Pool(10)
for args in symbol_args_list + app_args_list + risk_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task symbol_sum runs 63.57 seconds.
Task app_pca_mean runs 32.21 seconds.
Task app_lda_mean runs 31.75 seconds.
Task app_nmf_mean runs 33.76 seconds.
Task app_pca_num_mean runs 36.23 seconds.
Task symbol_weight_sum runs 66.53 seconds.
Task symbol_num_sum runs 75.36 seconds.
Task app_lda_num_mean runs 30.99 seconds.
Task app_nmf_num_mean runs 30.29 seconds.
Task app_pca_weight_mean runs 31.69 seconds.
Task symbol_mean runs 78.49 seconds.
Task app_lda_weight_mean runs 31.54 seconds.
Task symbol_num_mean runs 76.20 seconds.
Task app_nmf_weight_mean runs 32.91 seconds.
Task symbol_weight_mean runs 77.83 seconds.
Task risk_mean runs 31.79 seconds.
Task risk_num_mean runs 36.27 seconds.
Task risk_weight_mean runs 33.93 seconds.
Task risk_sum runs 33.67 seconds.
Task risk_weight_sum runs 28.49 seconds.
Task risk_num_sum runs 33.47 seconds.
Task risk_ratio_mean runs 34.42 seconds.
Task risk_ratio_num_mean runs 35.89 seconds.
Task risk_ratio_weight_mean runs 32.52 seconds.
All subprocesses done

In [21]:
graph = pd.read_csv("data/graph")

graph_filter_ids = set(graph_filter.to_id.tolist()) | set(graph_filter.from_id.tolist())

In [22]:
from_filtered = graph[graph.from_id.isin(graph_filter_ids)]
to_filtered = graph[graph.to_id.isin(graph_filter_ids)]

d1_to = from_filtered.groupby("from_id").to_id.count()
d1_from = to_filtered.groupby("to_id").from_id.count()

d1_to_sum = from_filtered.groupby("from_id")["num", "weight"].sum()
d1_from_sum = to_filtered.groupby("to_id")["num", "weight"].sum()

d1_to = d1_to.reset_index().merge(d1_to_sum.reset_index(), on="from_id", how="left").rename(columns={"to_id":"count"})
d1_from = d1_from.reset_index().merge(d1_from_sum.reset_index(), on="to_id", how="left").rename(columns={"from_id":"count"})

d1_to = d1_to.rename(columns={"from_id":"id", "num": "num_sum", "weight":"weight_sum"})
d1_from = d1_from.rename(columns={"to_id":"id", "num": "num_sum", "weight":"weight_sum"})

In [23]:
d1_to["num_mean"] = d1_to.num_sum / d1_to["count"]
d1_to["weight_mean"] = d1_to.weight_sum / d1_to["count"]

d1_from["num_mean"] = d1_from.num_sum / d1_from["count"]
d1_from["weight_mean"] = d1_from.weight_sum / d1_from["count"]

In [24]:
# d2_f = ["count", "num_mean", "weight_mean"]
d2_f = ["count", "num_mean", "weight_mean"]
d2_args_list = [
    (graph_filter, d1_to, d2_f, "graph", "d2_to", "sum", ""),
    (graph_filter, d1_to, d2_f, "graph", "d2_to", "sum", "_weight"),
    (graph_filter, d1_to, d2_f, "graph", "d2_to", "sum", "_num"),
    (graph_filter, d1_to, d2_f, "graph", "d2_to", "mean", ""),
    (graph_filter, d1_to, d2_f, "graph", "d2_to", "mean", "_weight"),
    (graph_filter, d1_to, d2_f, "graph", "d2_to", "mean", "_num"),
    
    (graph_filter, d1_from, d2_f, "graph", "d2_from", "sum", ""),
    (graph_filter, d1_from, d2_f, "graph", "d2_from", "sum", "_weight"),
    (graph_filter, d1_from, d2_f, "graph", "d2_from", "sum", "_num"),
    (graph_filter, d1_from, d2_f, "graph", "d2_from", "mean", ""),
    (graph_filter, d1_from, d2_f, "graph", "d2_from", "mean", "_num"),
    (graph_filter, d1_from, d2_f, "graph", "d2_from", "mean", "_weight"),
]

In [25]:
p = Pool(10)
for args in d2_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task d2_to_sum runs 24.58 seconds.
Task d2_to_weight_sum runs 27.43 seconds.
Task d2_to_mean runs 26.62 seconds.
Task d2_to_num_sum runs 29.36 seconds.
Task d2_to_weight_mean runs 28.06 seconds.
Task d2_to_num_mean runs 28.30 seconds.
Task d2_from_sum runs 29.14 seconds.
Task d2_from_weight_sum runs 28.73 seconds.
Task d2_from_num_sum runs 28.76 seconds.
Task d2_from_mean runs 27.15 seconds.
Task d2_from_num_mean runs 19.73 seconds.
Task d2_from_weight_mean runs 22.14 seconds.
All subprocesses done.


PageRank

In [26]:
with open("data/edge/pagerank.plk", "rb") as f:
    pr = pickle.load(f)

pr_df = pd.DataFrame({"id": list(pr.keys()), "pr":list(pr.values())})
pr_df[pr_df.id.isin(df.id)].to_csv("features/graph/pagerank.csv", index=False)

In [27]:
pr_f = ['pr']
pr_args_list = [
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "", ),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "_weight", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "_num", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "_weight", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "_num", graph_filter_ids),
]

In [28]:
p = Pool(6)
for args in pr_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task pagerank_sum runs 65.19 seconds.
Task pagerank_weight_sum runs 77.54 seconds.
Task pagerank_num_sum runs 76.52 seconds.
Task pagerank_mean runs 75.12 seconds.
Task pagerank_weight_mean runs 75.42 seconds.
Task pagerank_num_mean runs 77.08 seconds.
All subprocesses done.


PageRank 二度

In [29]:
gf = graph[graph.from_id.isin(graph_filter_ids) | graph.to_id.isin(graph_filter_ids)]

In [30]:
pr_f = ['pr']
feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", "sum", "", graph_filter_ids)
feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", "sum", "_weight", graph_filter_ids)
feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", "sum", "_num", graph_filter_ids)
feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", "mean", "", graph_filter_ids)
feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", "mean", "_weight",graph_filter_ids)
feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", "mean", "_num", graph_filter_ids)

Task pagerank_sum runs 1045.24 seconds.
Task pagerank_weight_sum runs 1064.47 seconds.
Task pagerank_num_sum runs 1053.86 seconds.
Task pagerank_mean runs 1052.96 seconds.
Task pagerank_weight_mean runs 1063.16 seconds.
Task pagerank_num_mean runs 1066.43 seconds.


In [31]:
to_num_mean = pd.read_csv("features/temp/to_pagerank_num_mean.csv")
from_num_mean = pd.read_csv("features/temp/from_pagerank_num_mean.csv")

to_weight_mean = pd.read_csv("features/temp/to_pagerank_weight_mean.csv")
from_weight_mean = pd.read_csv("features/temp/from_pagerank_weight_mean.csv")

to_num_sum = pd.read_csv("features/temp/to_pagerank_num_sum.csv")
from_num_sum = pd.read_csv("features/temp/from_pagerank_num_sum.csv")

to_weight_sum = pd.read_csv("features/temp/to_pagerank_weight_sum.csv")
from_weight_sum = pd.read_csv("features/temp/from_pagerank_weight_sum.csv")

In [32]:
from_num_mean.head()

Unnamed: 0,id,from_pagerank_num_mean_1
0,22,2.940365e-08
1,24,8.918203e-07
2,80,9.372466e-07
3,91,1.578345e-08
4,117,9.312466e-06


In [33]:
pr2_args_list = [
    (graph_filter, to_num_mean, ['to_pagerank_num_mean_1'], "graph", "pg_to_num_mean", "mean", "_num"),
    (graph_filter, from_num_mean, ["from_pagerank_num_mean_1"], "graph", "pg_from_num_mean", "mean", "_num"),
    (graph_filter, to_weight_mean, ['to_pagerank_weight_mean_1'], "graph", "pg_to_weight_mean", "mean", "_weight"),
    (graph_filter, from_weight_mean, ["from_pagerank_weight_mean_1"], "graph", "pg_from_weight_mean", "mean", "_weight"),
    
    (graph_filter, to_num_sum, ['to_pagerank_num_sum_1'], "graph", "pg_to_num_sum", "sum", "_num"),
    (graph_filter, from_num_sum, ["from_pagerank_num_sum_1"], "graph", "pg_from_num_sum", "sum", "_num"),
    (graph_filter, to_weight_sum, ['to_pagerank_weight_sum_1'], "graph", "pg_to_weight_sum", "sum", "_weight"),
    (graph_filter, from_weight_sum, ["from_pagerank_weight_sum_1"], "graph", "pg_from_weight_sum", "sum", "_weight"),
]

In [34]:
p = Pool(4)
for args in pr2_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task pg_to_num_mean_num_mean runs 19.40 seconds.
Task pg_from_num_mean_num_mean runs 18.77 seconds.
Task pg_to_weight_mean_weight_mean runs 19.28 seconds.
Task pg_from_weight_mean_weight_mean runs 20.31 seconds.
Task pg_from_num_sum_num_sum runs 16.08 seconds.
Task pg_to_num_sum_num_sum runs 16.96 seconds.
Task pg_to_weight_sum_weight_sum runs 16.75 seconds.
Task pg_from_weight_sum_weight_sum runs 16.41 seconds.
All subprocesses done.


hits

In [35]:
with open("data/edge/a.plk", "rb") as f:
    a = pickle.load(f)

a_df = pd.DataFrame({"id": list(a.keys()), "a":list(a.values())})

with open("data/edge/h.plk", "rb") as f:
    h = pickle.load(f)

h_df = pd.DataFrame({"id": list(a.keys()), "h":list(h.values())})


hits = a_df.merge(h_df, on="id")
hits[a_df.id.isin(df.id)].to_csv("features/graph/hits.csv", index=False)

In [36]:
hits.head()

Unnamed: 0,id,a,h
0,2,0.0,2.297083e-21
1,16872051,4.5465880000000004e-17,5.140984e-21
2,3,0.0,1.163187e-24
3,6907348,4.136847e-21,2.879042e-16
4,7911933,7.092638e-23,8.419130000000001e-18


In [37]:
hits_f = ['a', "h"]
hits_args_list = [
    (graph_filter, hits, hits_f, "graph", "hits", "sum", ""),
    (graph_filter, hits, hits_f, "graph", "hits", "sum", "_weight"),
    (graph_filter, hits, hits_f, "graph", "hits", "sum", "_num"),
    (graph_filter, hits, hits_f, "graph", "hits", "mean", ""),
    (graph_filter, hits, hits_f, "graph", "hits", "mean", "_weight"),
    (graph_filter, hits, hits_f, "graph", "hits", "mean", "_num"),
]

In [38]:
p = Pool(10)
for args in hits_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task hits_sum runs 65.34 seconds.
Task hits_weight_sum runs 66.21 seconds.
Task hits_num_sum runs 68.90 seconds.
Task hits_mean runs 64.11 seconds.
Task hits_weight_mean runs 63.83 seconds.
Task hits_num_mean runs 62.85 seconds.
All subprocesses done.


hits 二度

In [39]:
hits_f = ['a', "h"]
feature_with_graph(gf, hits, hits_f, "temp", "hits", "sum", "", graph_filter_ids)
feature_with_graph(gf, hits, hits_f, "temp", "hits", "sum", "_weight", graph_filter_ids)
feature_with_graph(gf, hits, hits_f, "temp", "hits", "sum", "_num", graph_filter_ids)
feature_with_graph(gf, hits, hits_f, "temp", "hits", "mean", "", graph_filter_ids)
feature_with_graph(gf, hits, hits_f, "temp", "hits", "mean", "_weight",graph_filter_ids)
feature_with_graph(gf, hits, hits_f, "temp", "hits", "mean", "_num", graph_filter_ids)

Task hits_sum runs 1122.12 seconds.
Task hits_weight_sum runs 1138.46 seconds.
Task hits_num_sum runs 1144.98 seconds.
Task hits_mean runs 1152.57 seconds.
Task hits_weight_mean runs 1169.23 seconds.
Task hits_num_mean runs 1146.93 seconds.


In [40]:
to_num_mean = pd.read_csv("features/temp/to_hits_num_mean.csv")
from_num_mean = pd.read_csv("features/temp/from_hits_num_mean.csv")

to_weight_mean = pd.read_csv("features/temp/to_hits_weight_mean.csv")
from_weight_mean = pd.read_csv("features/temp/from_hits_weight_mean.csv")

to_num_sum = pd.read_csv("features/temp/to_hits_num_sum.csv")
from_num_sum = pd.read_csv("features/temp/from_hits_num_sum.csv")

to_weight_sum = pd.read_csv("features/temp/to_hits_weight_sum.csv")
from_weight_sum = pd.read_csv("features/temp/from_hits_weight_sum.csv")

In [41]:
hits2_args_list = [
    (graph_filter, to_num_mean, ['to_hits_num_mean_1', 'to_hits_num_mean_2'], "graph", "hits_to_num_mean", "mean", "_num"),
    (graph_filter, from_num_mean, ["from_hits_num_mean_1", "from_hits_num_mean_2"], "graph", "hits_from_num_mean", "mean", "_num"),
    (graph_filter, to_weight_mean, ['to_hits_weight_mean_1', 'to_hits_weight_mean_2'], "graph", "hits_to_weight_mean", "mean", "_weight"),
    (graph_filter, from_weight_mean, ["from_hits_weight_mean_1", "from_hits_weight_mean_2"], "graph", "hits_from_weight_mean", "mean", "_weight"),
    
    (graph_filter, to_num_sum, ['to_hits_num_sum_1', 'to_hits_num_sum_2'], "graph", "hits_to_num_sum", "sum", "_num"),
    (graph_filter, from_num_sum, ["from_hits_num_sum_1", "from_hits_num_sum_2"], "graph", "hits_from_num_sum", "sum", "_num"),
    (graph_filter, to_weight_sum, ['to_hits_weight_sum_1', 'to_hits_weight_sum_2'], "graph", "hits_to_weight_sum", "sum", "_weight"),
    (graph_filter, from_weight_sum, ["from_hits_weight_sum_1", "from_hits_weight_sum_2"], "graph", "hits_from_weight_sum", "sum", "_weight"),
]

In [42]:
p = Pool(10)
for args in hits2_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task hits_to_num_mean_num_mean runs 21.48 seconds.
Task hits_from_num_mean_num_mean runs 23.92 seconds.
Task hits_from_weight_mean_weight_mean runs 23.51 seconds.
Task hits_to_weight_mean_weight_mean runs 25.11 seconds.
Task hits_to_num_sum_num_sum runs 23.70 seconds.
Task hits_from_num_sum_num_sum runs 23.28 seconds.
Task hits_to_weight_sum_weight_sum runs 22.55 seconds.
Task hits_from_weight_sum_weight_sum runs 22.57 seconds.
All subprocesses done.


In [3]:
import pickle

dc

In [8]:
with open("data/edge/degree_centrality.plk", "rb") as f:
    a = pickle.load(f)

a_df = pd.DataFrame({"id": list(a.keys()), "a":list(a.values())})

In [12]:
a_df[a_df.id.isin(df.id)].to_csv("features/graph/dc.csv", index=False)

In [15]:
f = ['a']
a_df_args_list = [
    (graph_filter, a_df, f, "graph", "dc", "sum", ""),
    (graph_filter, a_df, f, "graph", "dc", "sum", "_weight"),
    (graph_filter, a_df, f, "graph", "dc", "sum", "_num"),
    (graph_filter, a_df, f, "graph", "dc", "mean", ""),
    (graph_filter, a_df, f, "graph", "dc", "mean", "_weight"),
    (graph_filter, a_df, f, "graph", "dc", "mean", "_num"),
]

p = Pool(10)
for args in a_df_args_list:
    p.apply_async(feature_with_graph, args)
p.close()
p.join()
print('All subprocesses done.')

Task dc_sum runs 62.91 seconds.
Task dc_weight_sum runs 64.62 seconds.
Task dc_num_sum runs 62.13 seconds.
Task dc_mean runs 61.85 seconds.
Task dc_weight_mean runs 61.26 seconds.
Task dc_num_mean runs 62.25 seconds.
All subprocesses done.
