# SGCN用データセット作り

In [1]:
import numpy as np
import pandas as pd
import pickle
import networkx as nx
import matplotlib
%matplotlib inline

  (fname, cnt))
  (fname, cnt))


In [2]:
def get_dist(df,col):
    """Return, for each value of ``col``, the share of its rows in each rating.

    Rows of ``df`` are grouped by ``col`` and ``'rating'``, the non-null
    ``'time'`` entries of every group are counted, and each ``col`` row of the
    resulting count table is divided by its own total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``col``, ``'rating'`` and ``'time'``.
    col : str
        Column whose distinct values become the index of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``col`` with one column per distinct rating.
        (col, rating) combinations that never occur are 0.
    """
    rating_counts = (
        df.groupby([col, 'rating'])['time']
        .count()
        .unstack(level=1, fill_value=0)
    )
    counts = rating_counts.values
    # keepdims leaves the totals as an (n, 1) column so they broadcast across
    # the rating columns.
    row_totals = counts.sum(axis=1, keepdims=True)
    return pd.DataFrame(counts / row_totals,
                        index=rating_counts.index,
                        columns=rating_counts.columns)

## Amazon → SGCN

### user-product network

#### network

In [27]:
# Raw Amazon user-product rating edges; the CSV is read without a header row.
amazon_network = pd.read_csv('raw_data/amazon/amazon_network.csv', header=None)
amazon_network.columns = ['user_id', 'product_id', 'rating', 'time']

# Edge weight derived from the rating: (rating - 3) / 2, rounded to an integer
# value. Exact halves round to the even neighbour, so +/-0.5 ends up as 0.
amazon_network['weight'] = ((amazon_network['rating'] - 3) / 2).round()

# Ground-truth labels per user, also stored without a header row.
amazon_gt = pd.read_csv('raw_data/amazon/amazon_gt.csv', header=None)
amazon_gt.columns = ['user_id', 'label']

In [28]:
# Keep only the edges that carry a sign (weight != 0) and the columns needed
# for the edge list.
# .copy() makes the result an independent frame: the encoded id columns that
# are assigned onto it afterwards would otherwise trigger pandas'
# SettingWithCopyWarning.
truncated_amazon_network = amazon_network.loc[
    amazon_network.weight != 0, ['user_id', 'product_id', 'weight']
].copy()

In [29]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is fitted on every identifier that can occur as a node
# (users and products of the signed edges plus the ground-truth users), so all
# of them share one integer id space.
label_encoder = LabelEncoder()
label_encoder.fit(np.hstack([
    truncated_amazon_network['user_id'],
    truncated_amazon_network['product_id'],
    amazon_gt['user_id'],
]))

LabelEncoder()

In [30]:
# Replace the raw identifiers by their encoded integer ids:
# id1 = user end of the edge, id2 = product end of the edge.
truncated_amazon_network['id1'] = label_encoder.transform(
    truncated_amazon_network['user_id'])
truncated_amazon_network['id2'] = label_encoder.transform(
    truncated_amazon_network['product_id'])

# The ground-truth users get ids from the same encoder.
amazon_gt['node_id'] = label_encoder.transform(amazon_gt['user_id'])

#### node features

In [None]:
# Rating distribution of every user and of every product, computed on the
# full (untruncated) Amazon network.
user_dist = get_dist(amazon_network,'user_id')

product_dist = get_dist(amazon_network,'product_id')

# user_product_dist = user_dist.append(product_dist)

# Join the two tables side by side on the node identifier; a node that is
# missing from one table gets zeros in that table's columns.
# `axis` is passed by keyword because pd.concat accepts it as keyword-only
# from pandas 2.0 on (positional use was deprecated before that).
user_product_dist = pd.concat([user_dist, product_dist], axis=1).fillna(0)

In [None]:
# Order the feature rows like the encoder's classes, so that row i holds the
# features of the node encoded as i.
# reindex is used instead of .loc because .loc raises KeyError as soon as one
# encoded identifier has no rating rows (e.g. a user that occurs only in the
# ground truth); such nodes get all-zero features instead.
node_features_df = user_product_dist.reindex(label_encoder.classes_, fill_value=0)

#### ファイル出力

In [None]:
# Signed edge list with encoded node ids (no index column in the file).
amazon_edges = truncated_amazon_network[['id1', 'id2', 'weight']]
amazon_edges.to_csv('input/amazon/amazon_network.csv', index=False)

# Ground-truth labels keyed by encoded node id.
amazon_labels = amazon_gt[['node_id', 'label']]
amazon_labels.to_csv('input/amazon/amazon_gt.csv', index=False)

# Encoder classes: position i holds the raw identifier of node i.
np.save('input/amazon/amazon_label_encoder.npy', label_encoder.classes_)

# Node features, written without the index.
node_features_df.to_csv('input/amazon/amazon_node_feature.csv', index=False)

## epinions

### network

In [3]:
# Raw Epinions rating edges; the CSV is read without a header row.
epinions_network = pd.read_csv('raw_data/epinions/epinions_network.csv', header=None)
epinions_network.columns = ['id1', 'id2', 'rating', 'time']

# Turn the rating into an edge sign: -1 when rating - 3.5 is negative, +1 otherwise.
epinions_network['weight'] = (
    (epinions_network['rating'] - 3.5 < 0).map({True: -1, False: 1})
)

In [4]:
# Ground-truth labels for Epinions; the CSV is read without a header row and
# the two columns are named afterwards.
epinions_gt = pd.read_csv(
    'raw_data/epinions/epinions_gt.csv',
    header=None,
)
epinions_gt.columns = ['user_id', 'label']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on both edge endpoints and the ground-truth users so that
# all of them share one integer id space.
label_encoder = LabelEncoder()
label_encoder.fit(np.hstack([
    epinions_network['id1'],
    epinions_network['id2'],
    epinions_gt['user_id'],
]))

In [None]:
# Encoded integer ids of both edge endpoints, stored next to the raw ids.
epinions_network['id1_'] = label_encoder.transform(epinions_network['id1'])
epinions_network['id2_'] = label_encoder.transform(epinions_network['id2'])

# The ground-truth users get ids from the same encoder.
epinions_gt['node_id'] = label_encoder.transform(epinions_gt['user_id'])

### node features

In [None]:
# Node features: the rating distribution of each node as `id1_` endpoint next
# to its rating distribution as `id2_` endpoint.
# * `axis` is passed by keyword because pd.concat accepts it as keyword-only
#   from pandas 2.0 on.
# * The table is reindexed to the full encoded id range 0..n-1. The feature
#   file is written without its index, so a row's position is the only link to
#   its node id; a node with no edge (e.g. one that occurs only in the ground
#   truth) must therefore still get a row. Such rows are all zeros.
node_features_df = (
    pd.concat(
        [get_dist(epinions_network, 'id1_'), get_dist(epinions_network, 'id2_')],
        axis=1,
    )
    .reindex(np.arange(len(label_encoder.classes_)))
    .fillna(0)
)

### ファイル出力

In [None]:
# Signed edge list with encoded node ids (no index column in the file).
epinions_edges = epinions_network[['id1_', 'id2_', 'weight']]
epinions_edges.to_csv('input/epinions/epinions_network.csv', index=False)

# Ground-truth labels keyed by encoded node id.
epinions_labels = epinions_gt[['node_id', 'label']]
epinions_labels.to_csv('input/epinions/epinions_gt.csv', index=False)

# Encoder classes: position i holds the raw identifier of node i.
np.save('input/epinions/epinions_label_encoder.npy', label_encoder.classes_)

# Node features, written without the index.
node_features_df.to_csv('input/epinions/epinions_node_feature.csv', index=False)

## epinions_sub

In [None]:
# NOTE(review): np.random.choice is called without arguments here, but it needs
# at least the population to draw from, so this cell raises TypeError as written.
# TODO: presumably meant to sample a subset of the epinions nodes -- supply the
# node pool and the sample size.
sampled_nodes = np.random.choice()

## alpha

In [None]:
# Raw alpha rating edges; the CSV is read without a header row.
alpha_network = pd.read_csv('raw_data/alpha/alpha_network.csv', header=None)
alpha_network.columns = ['id1', 'id2', 'rating', 'time']

# Edge sign: +1 for a strictly positive rating, -1 otherwise.
alpha_network['weight'] = (alpha_network['rating'] > 0).map({True: 1, False: -1})

In [None]:
# Ground-truth labels for alpha; the CSV is read without a header row and the
# two columns are named afterwards.
alpha_gt = pd.read_csv(
    'raw_data/alpha/alpha_gt.csv',
    header=None,
)
alpha_gt.columns = ['user_id', 'label']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on both edge endpoints and the ground-truth users so that
# all of them share one integer id space.
label_encoder = LabelEncoder()
label_encoder.fit(np.hstack([
    alpha_network['id1'],
    alpha_network['id2'],
    alpha_gt['user_id'],
]))

In [None]:
# Encoded integer ids of both edge endpoints and of the ground-truth users.
alpha_network['id1_'] = label_encoder.transform(alpha_network.id1)

alpha_network['id2_'] = label_encoder.transform(alpha_network.id2)

alpha_gt['node_id'] = label_encoder.transform(alpha_gt.user_id)

# Node features: the rating distribution of each node as `id1_` endpoint next
# to its rating distribution as `id2_` endpoint.
# * `axis` is passed by keyword because pd.concat accepts it as keyword-only
#   from pandas 2.0 on.
# * The table is reindexed to the full encoded id range 0..n-1. The feature
#   file is written without its index, so a row's position is the only link to
#   its node id; a node with no edge (e.g. one that occurs only in the ground
#   truth) must therefore still get a row. Such rows are all zeros.
node_features_df = (
    pd.concat(
        [get_dist(alpha_network, 'id1_'), get_dist(alpha_network, 'id2_')],
        axis=1,
    )
    .reindex(np.arange(len(label_encoder.classes_)))
    .fillna(0)
)

In [None]:
# Signed edge list with encoded node ids (no index column in the file).
alpha_edges = alpha_network[['id1_', 'id2_', 'weight']]
alpha_edges.to_csv('input/alpha/alpha_network.csv', index=False)

# Ground-truth labels keyed by encoded node id.
alpha_labels = alpha_gt[['node_id', 'label']]
alpha_labels.to_csv('input/alpha/alpha_gt.csv', index=False)

# Encoder classes: position i holds the raw identifier of node i.
np.save('input/alpha/alpha_label_encoder.npy', label_encoder.classes_)

# Node features, written without the index.
node_features_df.to_csv('input/alpha/alpha_node_feature.csv', index=False)

## otc

In [None]:
# Raw otc rating edges; the CSV is read without a header row.
otc_network = pd.read_csv('raw_data/otc/otc_network.csv', header=None)
otc_network.columns = ['id1', 'id2', 'rating', 'time']

# Edge sign: +1 for a strictly positive rating, -1 otherwise.
otc_network['weight'] = (otc_network['rating'] > 0).map({True: 1, False: -1})

In [None]:
# Ground-truth labels for otc; the CSV is read without a header row and the
# two columns are named afterwards.
otc_gt = pd.read_csv(
    'raw_data/otc/otc_gt.csv',
    header=None,
)
otc_gt.columns = ['user_id', 'label']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on both edge endpoints and the ground-truth users so that
# all of them share one integer id space.
label_encoder = LabelEncoder()
label_encoder.fit(np.hstack([
    otc_network['id1'],
    otc_network['id2'],
    otc_gt['user_id'],
]))

In [None]:
# Encoded integer ids of both edge endpoints and of the ground-truth users.
otc_network['id1_'] = label_encoder.transform(otc_network.id1)

otc_network['id2_'] = label_encoder.transform(otc_network.id2)

otc_gt['node_id'] = label_encoder.transform(otc_gt.user_id)

# Node features: the rating distribution of each node as `id1_` endpoint next
# to its rating distribution as `id2_` endpoint.
# * `axis` is passed by keyword because pd.concat accepts it as keyword-only
#   from pandas 2.0 on.
# * The table is reindexed to the full encoded id range 0..n-1. The feature
#   file is written without its index, so a row's position is the only link to
#   its node id; a node with no edge (e.g. one that occurs only in the ground
#   truth) must therefore still get a row. Such rows are all zeros.
node_features_df = (
    pd.concat(
        [get_dist(otc_network, 'id1_'), get_dist(otc_network, 'id2_')],
        axis=1,
    )
    .reindex(np.arange(len(label_encoder.classes_)))
    .fillna(0)
)

In [None]:
# Signed edge list with encoded node ids (no index column in the file).
otc_edges = otc_network[['id1_', 'id2_', 'weight']]
otc_edges.to_csv('input/otc/otc_network.csv', index=False)

# Ground-truth labels keyed by encoded node id.
otc_labels = otc_gt[['node_id', 'label']]
otc_labels.to_csv('input/otc/otc_gt.csv', index=False)

# Encoder classes: position i holds the raw identifier of node i.
np.save('input/otc/otc_label_encoder.npy', label_encoder.classes_)

# Node features, written without the index.
node_features_df.to_csv('input/otc/otc_node_feature.csv', index=False)

# Appendix

## amazon user networkを作る

In [None]:
# Bare expression: as the last statement of a notebook cell it only shows the
# Amazon edge frame for inspection; nothing is assigned or modified.
amazon_network

In [None]:
# Pair up every two ratings of the same product by joining the network with
# itself on product_id (columns of the two sides get the _x / _y suffixes).
self_joined = pd.merge(
    amazon_network,
    amazon_network,
    how='right',
    on='product_id',
)

# Drop the pairs in which a user is matched with themselves.
is_other_user = self_joined['user_id_x'] != self_joined['user_id_y']
self_joined = self_joined.loc[is_other_user]

# Product of the two edge weights: positive when both weights have the same
# sign, negative when they differ, 0 when either weight is 0.
self_joined['sign'] = self_joined['weight_x'] * self_joined['weight_y']

In [None]:
# Keep only the user pairs whose co-rating produced a sign.
user_network = self_joined.loc[self_joined.sign!=0,['user_id_x','user_id_y','sign']]

# One row per user pair: average the signs over all shared products and round
# the result.
user_network = user_network.groupby(['user_id_x','user_id_y'],as_index=False)['sign'].mean().round()

# Drop pairs whose rounded mean sign is 0.
# .copy() makes the result an independent frame: the encoded id columns that
# are assigned onto it afterwards would otherwise trigger pandas'
# SettingWithCopyWarning.
user_network = user_network.loc[user_network.sign!=0].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on both ends of the user-user edges and the ground-truth
# users so that all of them share one integer id space.
label_encoder = LabelEncoder()
label_encoder.fit(np.hstack([
    user_network['user_id_x'],
    user_network['user_id_y'],
    amazon_gt['user_id'],
]))

In [None]:
# Encoded integer ids of both ends of every user-user edge.
user_network['id1'] = label_encoder.transform(user_network['user_id_x'])
user_network['id2'] = label_encoder.transform(user_network['user_id_y'])

In [None]:
# Edge frame using the column names networkx looks for by default.
for_nx_network = user_network[['id1', 'id2', 'sign']].copy()
for_nx_network.columns = ['source', 'target', 'weight']

# Build the graph from the edge frame; with edge_attr=True every remaining
# column (here `weight`) is stored as an edge attribute.
G = nx.from_pandas_edgelist(
    for_nx_network,
    source='source',
    target='target',
    edge_attr=True,
)

In [None]:
# Convert the graph back into an edge-list frame (columns source, target and
# the edge attributes).
amazon_user_network = nx.to_pandas_edgelist(G, source='source', target='target')

In [None]:
# Re-encode the ground-truth users with the encoder fitted for the user-user
# network; this overwrites any previously stored node_id column.
amazon_gt['node_id'] = label_encoder.transform(amazon_gt['user_id'])

In [None]:
# User-user signed edge list (no index column in the file).
user_edges = amazon_user_network[['source', 'target', 'weight']]
user_edges.to_csv('input/amazon/user_network.csv', index=False)

# Ground-truth labels keyed by encoded node id.
user_labels = amazon_gt[['node_id', 'label']]
user_labels.to_csv('input/amazon/user_gt.csv', index=False)

# Encoder classes: position i holds the raw identifier of node i.
np.save('input/amazon/user_label_encoder.npy', label_encoder.classes_)