In [1]:
import pandas as pd
import sys
import os
sys.path.insert(0, '../..')

In [2]:
from utils import _ALEXA_DATA_PATH, ModelWrapper

2021-12-26 10:06:40.705577: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Both 2018 audience-overlap CSVs (node features and edge list) sit directly under
# _ALEXA_DATA_PATH; build the two full paths in one pass.
node_features_file, edge_file = (
    os.path.join(_ALEXA_DATA_PATH, csv_name)
    for csv_name in (
        "corpus_2018_audience_overlap_level_0_and_1_node_features.csv",
        "corpus_2018_audience_overlap_level_0_and_1_edges.csv",
    )
)

In [4]:
# Load the per-site features; the first CSV column becomes the index, so each row is
# addressed by its site URL.
node_features_df = pd.read_csv(node_features_file, index_col=0)
# Preview the first rows (note the missing values in several columns).
node_features_df.head()

Unnamed: 0_level_0,alexa_ranks,daily_pageviews_per_visitors,daily_time_on_sites,total_sites_linking_ins,bounce_rate
urls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
crotonmusiccenter.com,,,,,
gracefuneralhome.com,,,,4.0,
haaretz.com,11522.0,1.5,152.0,13974.0,0.65
osubeavers.com,422967.0,2.5,299.0,430.0,0.471
prophezine.com,8782212.0,1.0,,209.0,


In [5]:
# Column dtypes and non-null counts before any missing-value handling.
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11865 entries, crotonmusiccenter.com to yorkregion.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   8658 non-null   float64
 1   daily_pageviews_per_visitors  8659 non-null   float64
 2   daily_time_on_sites           6459 non-null   float64
 3   total_sites_linking_ins       11470 non-null  float64
 4   bounce_rate                   6023 non-null   float64
dtypes: float64(5)
memory usage: 556.2+ KB


In [6]:
# Replace missing values with 0 in the two columns used as graph features; the
# normalisation cells further down map a 0 input to a 0 feature value.
_zero_filled_columns = ['alexa_ranks', 'total_sites_linking_ins']
node_features_df[_zero_filled_columns] = node_features_df[_zero_filled_columns].fillna(0)

In [7]:
# Re-check the non-null counts: alexa_ranks and total_sites_linking_ins should now be complete.
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11865 entries, crotonmusiccenter.com to yorkregion.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   11865 non-null  float64
 1   daily_pageviews_per_visitors  8659 non-null   float64
 2   daily_time_on_sites           6459 non-null   float64
 3   total_sites_linking_ins       11865 non-null  float64
 4   bounce_rate                   6023 non-null   float64
dtypes: float64(5)
memory usage: 556.2+ KB


# Normalizing features

In [8]:
def _inverse_rank(rank):
    """Return 1/rank, or 0 when the rank is 0 (the fill value used for missing ranks)."""
    if not rank:
        return 0
    return 1 / rank


# Inverting the rank gives larger values to better-ranked sites.
node_features_df['normalized_alexa_rank'] = node_features_df['alexa_ranks'].map(_inverse_rank)

In [9]:
import math

# Log-scale the inbound-link counts; a count of 0 (the fill value for missing data)
# is kept at 0 because log2(0) is undefined.
_linking_in_counts = node_features_df['total_sites_linking_ins']
node_features_df['normalized_total_sites_linked_in'] = [
    math.log2(count) if count else 0 for count in _linking_in_counts
]

# Load edges
---

In [10]:
# Load the audience-overlap edge list (one row per source/target site pair).
edge_df = pd.read_csv(edge_file)

# Preview the first edges.
edge_df.head()

Unnamed: 0,source,target
0,villagevoice.com,nylon.com
1,villagevoice.com,slantmagazine.com
2,villagevoice.com,gothamist.com
3,villagevoice.com,screendaily.com
4,villagevoice.com,amny.com


In [11]:
# Confirm the edge list has no missing endpoints.
edge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20399 entries, 0 to 20398
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  20399 non-null  object
 1   target  20399 non-null  object
dtypes: object(2)
memory usage: 318.9+ KB


In [12]:
import stellargraph as sg

# Only the two normalised columns are attached to the nodes as features.
_feature_columns = ['normalized_alexa_rank', 'normalized_total_sites_linked_in']
_node_feature_frame = node_features_df[_feature_columns]

# Build the graph from the node-feature frame and the source/target edge list.
G = sg.StellarGraph(nodes=_node_feature_frame, edges=edge_df)
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 11865, Edges: 20399

 Node types:
  default: [11865]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [20399]
        Weights: all 1 (default)
        Features: none


# Unsupervised Attri2Vec

In [13]:
from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator
from stellargraph.layer import Attri2Vec, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

1. Specify the optional random-walk parameters for the unsupervised sampler:
* root nodes
* the number of walks to take per node
* the length of each walk
* random seed

In [14]:
# Root nodes for the random walks: every node in the graph.
nodes = list(G.nodes())
# Number of walks to take per root node and the length of each walk; both are handed
# to UnsupervisedSampler.
# NOTE(review): the training logs recorded in this notebook show 1899 steps per epoch,
# which looks small for walks this long and this numerous — confirm these values are
# the ones intended for a full run.
number_of_walks = 10
length = 100

2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.

In [15]:
# Sampler that draws random walks from every root node to produce training node pairs.
# A fixed seed makes the sampled walks repeatable across kernel restarts; 42 matches
# the random_state used for the classifiers later in this notebook.
unsupervised_samples = UnsupervisedSampler(
    G, nodes=nodes, length=length, number_of_walks=number_of_walks, seed=42
)

3. Create a node pair generator:

In [16]:
# Batch size handed to both the link generator (training) and the node generator (inference).
batch_size = 50
# The epoch count is not configured here; the training cell hard-codes epochs=8.
# epochs = 4
# NOTE(review): num_samples is not referenced by any other cell visible in this notebook —
# presumably a leftover from a neighbour-sampling model; confirm before removing.
num_samples = [10, 5]

In [17]:
# Link generator over the graph with the configured batch size.
generator = Attri2VecLinkGenerator(G, batch_size)
# Bind the generator to the unsupervised sampler; the result is what model.fit consumes.
train_gen = generator.flow(unsupervised_samples)

In [18]:
# A single layer of size 128 — presumably this is also the dimensionality of the
# resulting node embeddings; confirm against the StellarGraph Attri2Vec documentation.
layer_sizes = [128]
# Bias terms are disabled and no output normalisation is applied.
attri2vec = Attri2Vec(layer_sizes=layer_sizes, generator=generator, bias=False, normalize=None)

In [19]:
# Build the model and expose input and output sockets of attri2vec, for node pair inputs.
# x_inp and x_out are indexable; element 0 of each is reused later to build the
# stand-alone embedding model.
x_inp, x_out = attri2vec.in_out_tensors()

In [20]:
# Link-classification head: combines the two node embeddings of a pair with the 'ip'
# edge-embedding method and emits a single sigmoid output per pair.
_link_head = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="ip",
)
prediction = _link_head(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [21]:
# Wrap the Attri2Vec inputs and the link-classification output into one trainable model.
model = keras.Model(inputs=x_inp, outputs=prediction)

# Binary cross-entropy on the link label, tracking binary accuracy.
# `learning_rate` is the supported keyword; the legacy `lr` alias triggers a
# deprecation warning from the optimizer constructor.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

  super(Adam, self).__init__(name, **kwargs)


In [22]:
# Train the link-prediction model on the sampled node pairs for 8 epochs, single worker,
# no multiprocessing. The per-epoch history is kept in `history`.
history = model.fit(
    train_gen,
    epochs=8,
    verbose=2,
    use_multiprocessing=False,
    workers=1,
    shuffle=True,
)

# Reference logs from two earlier runs, kept as comments rather than as a bare string
# literal so they are not echoed as this cell's output.
# Both runs: 8 epochs, 1899 steps per epoch, 43-52 s per epoch, and both printed
# "WARNING:tensorflow:sample_weight modes were coerced from ... to ['...']".
#
#          run 1: before normalization     run 2: after normalization,
#                                          without parameter tuning
#  epoch   loss     binary_accuracy        loss     binary_accuracy
#    1     0.7380   0.5427                 0.6926   0.5304
#    2     0.6368   0.6424                 0.6343   0.5872
#    3     0.5929   0.6680                 0.6147   0.6016
#    4     0.5694   0.6800                 0.5955   0.6457
#    5     0.5564   0.6865                 0.5755   0.6763
#    6     0.5442   0.6933                 0.5604   0.6914
#    7     0.5399   0.6941                 0.5486   0.7005
#    8     0.5272   0.7013                 0.5390   0.7082

Epoch 1/8


KeyboardInterrupt: 

In [None]:
# Take the first input/output tensor of the pair model (named *_src here, i.e. the
# source-node side of each pair) ...
x_inp_src = x_inp[0]
x_out_src = x_out[0]
# ... and wrap it as a stand-alone model that maps one node's input to its embedding.
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [None]:
# Feed every node to the embedding model in node_features_df.index order. The dict built
# from these embeddings later pairs row i with node_features_df.index[i], so this
# ordering must be preserved (presumably predict() keeps the generator order — confirm).
node_gen = Attri2VecNodeGenerator(G, batch_size).flow(node_features_df.index)
node_embeddings = embedding_model.predict(node_gen, workers=1, verbose=1)

In [None]:
# Spot-check a single embedding vector (row 213 looks like an arbitrary pick).
node_embeddings[213]

In [None]:
# Lookup table from site URL to its embedding vector, pairing the index entries with the
# embedding rows position by position.
embeddings_wv = {
    site_url: embedding
    for site_url, embedding in zip(node_features_df.index.tolist(), node_embeddings)
}

In [None]:
# Sanity check: look up one known site by URL.
embeddings_wv['villagevoice.com']

In [None]:
# Year tag passed to train_model in the evaluation cell.
data_year = '2018'
# Wrap the URL -> embedding lookup under a display name for the evaluation cell.
# The variable keeps the name node2vec_model even though these embeddings come from Attri2Vec.
node2vec_model = ModelWrapper('Unsupervised Attrib2Vec', embeddings_wv)

In [None]:
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

In [None]:
# Downstream evaluation: fit several classifiers on the node embeddings and tabulate
# the metrics returned by train_model.
# NOTE(review): train_model is not defined or imported in any cell visible here — confirm
# it is brought into scope (e.g. alongside ModelWrapper) before running on a fresh kernel.
clf = LogisticRegressionCV(Cs=10, cv=5, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
clf2 = LogisticRegressionCV(Cs=10, cv=10, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
tree_clf = GradientBoostingClassifier(random_state=42)
svm_clf = svm.SVC(decision_function_shape='ovo', probability=True, random_state=42)

# (report label, estimator) pairs, evaluated in this order.
_labelled_classifiers = (
    ('LogisticRegression CV = 5', clf),
    ('LogisticRegression CV = 10', clf2),
    ('GradientBoostingClassifier', tree_clf),
    ('SVC ovo', svm_clf),
)

# One row per classifier: feature name, classifier label, then every value of the
# mapping returned by train_model (presumably in the order of the metric columns below).
result_report = [
    [
        str(node2vec_model),
        classifier_label,
        *train_model(estimator, node2vec_model=node2vec_model, data_year=data_year).values(),
    ]
    for classifier_label, estimator in _labelled_classifiers
]

_report_columns = [
    "Feature", "Classifier", "Accuracy", "Balanced Accuracy score",
    "F1 micro score", "F1 macro score", "F1 weighted score", "MAE", "Confusion matrix",
]
model_res = pd.DataFrame(result_report, columns=_report_columns)

In [None]:
# Show the metric table, one row per classifier.
model_res.head()