In [2]:
import pandas as pd

In [3]:
%pwd

'/media/panayot/cf151fc4-0692-4c72-9d73-892b9c408127/home/panayot/Documents/site_similarity/notebooks/node_features_graphs/corpus 2020'

In [4]:
node_features_file = "../../generate_node_features/corpus_2020_audience_overlap_level_0_and_1_node_features.csv"
edge_file = "../../generate_node_features/combined_data_corpus_2020_level_0_1_df_edges.csv"

In [5]:
node_features_df = pd.read_csv(node_features_file, index_col=0)

In [6]:
node_features_df.head()

Unnamed: 0,alexa_ranks,daily_pageviews_per_visitors,daily_time_on_sites,total_sites_linking_ins,bounce_rate
gradescope.com,11014.0,4.7,296.0,103.0,0.222
parentlink.net,151438.0,3.0,203.0,93.0,0.301
nationalpartnership.org,604522.0,1.3,156.0,811.0,0.765
sharondraper.com,1209734.0,2.0,109.0,209.0,0.615
trade.gov,55944.0,1.9,137.0,2392.0,0.654


In [7]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12303 entries, gradescope.com to growveg.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   9128 non-null   float64
 1   daily_pageviews_per_visitors  9129 non-null   float64
 2   daily_time_on_sites           6780 non-null   float64
 3   total_sites_linking_ins       11966 non-null  float64
 4   bounce_rate                   6300 non-null   float64
dtypes: float64(5)
memory usage: 576.7+ KB


In [8]:
node_features_df.alexa_ranks = node_features_df.alexa_ranks.fillna(0)
node_features_df.total_sites_linking_ins = node_features_df.total_sites_linking_ins.fillna(0)

In [9]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12303 entries, gradescope.com to growveg.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   12303 non-null  float64
 1   daily_pageviews_per_visitors  9129 non-null   float64
 2   daily_time_on_sites           6780 non-null   float64
 3   total_sites_linking_ins       12303 non-null  float64
 4   bounce_rate                   6300 non-null   float64
dtypes: float64(5)
memory usage: 576.7+ KB


# Normalizing features

In [10]:
node_features_df['normalized_alexa_rank'] = node_features_df['alexa_ranks'].apply(lambda x: 1/x if x else 0)

In [11]:
import math

node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_ins'].apply(lambda x: math.log2(x) if x else 0)

---

In [12]:
edge_df = pd.read_csv(edge_file)

edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


In [13]:
edge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28381 entries, 0 to 28380
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  28381 non-null  object
 1   target  28381 non-null  object
dtypes: object(2)
memory usage: 443.6+ KB


In [14]:
import stellargraph as sg

In [15]:
G = sg.StellarGraph(node_features_df[['normalized_alexa_rank', 'normalized_total_sites_linked_in']], edge_df)
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 12303, Edges: 28381

 Node types:
  default: [12303]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [28381]
        Weights: all 1 (default)
        Features: none


# Unsupervised Attrib2Vec

In [16]:
from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator
from stellargraph.layer import Attri2Vec, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

1. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.

In [17]:
nodes = list(G.nodes())
number_of_walks = 1
length = 5

2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.

In [18]:
unsupervised_samples = UnsupervisedSampler(
    G, nodes=nodes, length=length, number_of_walks=number_of_walks
)

3. Create a node pair generator:

In [19]:
batch_size = 50
epochs = 4
num_samples = [10, 5]

In [20]:
generator = Attri2VecLinkGenerator(G, batch_size)
train_gen = generator.flow(unsupervised_samples)

In [21]:
layer_sizes = [128]
attri2vec = Attri2Vec(
    layer_sizes=layer_sizes, generator=generator, bias=False, normalize=None
)

In [22]:
# Build the model and expose input and output sockets of attri2vec, for node pair inputs:
x_inp, x_out = attri2vec.in_out_tensors()

In [23]:
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [24]:
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [26]:
history = model.fit(
    train_gen,
    epochs=32
    verbose=2,
    use_multiprocessing=False,
    workers=1,
    shuffle=True,
)

# """
# previous before normalization

# WARNING:tensorflow:sample_weight modes were coerced from
#   ...
#     to  
#   ['...']
# Train for 1899 steps
# Epoch 1/8
# 1899/1899 - 47s - loss: 0.7380 - binary_accuracy: 0.5427
# Epoch 2/8
# 1899/1899 - 46s - loss: 0.6368 - binary_accuracy: 0.6424
# Epoch 3/8
# 1899/1899 - 47s - loss: 0.5929 - binary_accuracy: 0.6680
# Epoch 4/8
# 1899/1899 - 48s - loss: 0.5694 - binary_accuracy: 0.6800
# Epoch 5/8
# 1899/1899 - 52s - loss: 0.5564 - binary_accuracy: 0.6865
# Epoch 6/8
# 1899/1899 - 47s - loss: 0.5442 - binary_accuracy: 0.6933
# Epoch 7/8
# 1899/1899 - 48s - loss: 0.5399 - binary_accuracy: 0.6941
# Epoch 8/8
# 1899/1899 - 47s - loss: 0.5272 - binary_accuracy: 0.7013

# """

Epoch 1/32
1969/1969 - 27s - loss: 0.6562 - binary_accuracy: 0.5589
Epoch 2/32
1969/1969 - 26s - loss: 0.6530 - binary_accuracy: 0.5655
Epoch 3/32
1969/1969 - 27s - loss: 0.6534 - binary_accuracy: 0.5657
Epoch 4/32
1969/1969 - 22s - loss: 0.6538 - binary_accuracy: 0.5700
Epoch 5/32
1969/1969 - 22s - loss: 0.6526 - binary_accuracy: 0.5698
Epoch 6/32
1969/1969 - 23s - loss: 0.6549 - binary_accuracy: 0.5746
Epoch 7/32
1969/1969 - 22s - loss: 0.6529 - binary_accuracy: 0.5786
Epoch 8/32
1969/1969 - 25s - loss: 0.6512 - binary_accuracy: 0.5834
Epoch 9/32
1969/1969 - 22s - loss: 0.6514 - binary_accuracy: 0.5840
Epoch 10/32
1969/1969 - 23s - loss: 0.6523 - binary_accuracy: 0.5898
Epoch 11/32
1969/1969 - 22s - loss: 0.6547 - binary_accuracy: 0.5903
Epoch 12/32
1969/1969 - 23s - loss: 0.6520 - binary_accuracy: 0.5958
Epoch 13/32
1969/1969 - 24s - loss: 0.6538 - binary_accuracy: 0.5966
Epoch 14/32
1969/1969 - 27s - loss: 0.6542 - binary_accuracy: 0.6030
Epoch 15/32
1969/1969 - 22s - loss: 0.6493 

In [27]:
x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [28]:
node_gen = Attri2VecNodeGenerator(G, batch_size).flow(node_features_df.index)
node_embeddings = embedding_model.predict(node_gen, workers=1, verbose=1)



In [29]:
# node_embeddings[213]

In [42]:
embeddings_wv = dict(zip(node_features_df.index.tolist(), node_embeddings.tolist()))

In [43]:
embeddings_wv['crooked.com']

[0.0003732442855834961,
 4.30028330811183e-06,
 2.5482699129497632e-05,
 3.3911762287175407e-10,
 1.558484490635123e-23,
 0.00019913911819458008,
 0.00023236870765686035,
 5.7033391342997675e-09,
 0.05188429355621338,
 0.029672235250473022,
 5.8983729012140884e-09,
 4.743696546682941e-10,
 0.043988555669784546,
 2.054232572845649e-05,
 0.00021141767501831055,
 0.04522329568862915,
 0.00013944506645202637,
 2.6334089852753095e-05,
 1.3350941117096227e-05,
 4.7838506489483734e-09,
 1.7735639346039278e-10,
 5.188054183712715e-22,
 2.2235178601205473e-24,
 8.901382386738987e-08,
 1.5084044449280723e-23,
 0.0004639625549316406,
 3.986377123510465e-06,
 9.197851914321486e-25,
 0.03558462858200073,
 2.815746938722441e-06,
 0.042846739292144775,
 0.031478822231292725,
 0.0415743887424469,
 0.052382439374923706,
 0.000202864408493042,
 2.745870875912741e-21,
 2.3928846751286592e-11,
 0.0001748204231262207,
 2.2130896013550228e-06,
 7.707193105943588e-08,
 1.4789678125742662e-18,
 0.054405093193

In [45]:
class ModelWrapper:
    def __init__(self, embeddings_wv):
        self.wv = embeddings_wv
        
    def __str__(self):
        return 'Unsupervised Attrib2Vec'

In [36]:
%run ../../../utils/notebook_utils.py

ModuleNotFoundError: No module named 'dataprep'

In [34]:
from utils.notebook_utils import train_model

ModuleNotFoundError: No module named 'utils'

In [46]:
data_year = '2020'
node2vec_model = ModelWrapper(embeddings_wv)

In [33]:
! cd

C:\Users\Paco\Documents\site_similarity\notebooks\node_features_graphs


In [34]:
%run ../../utils/notebook_utils.py

In [42]:
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

In [43]:
result_report = []

clf = LogisticRegressionCV(Cs=10, cv=5, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
result_report.append([
    str(model),
    'LogisticRegression CV = 5',
    *list(train_model(clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

clf2 = LogisticRegressionCV(Cs=10, cv=10, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
result_report.append([
    str(model),
    'LogisticRegression CV = 10',
    *list(train_model(clf2, node2vec_model=node2vec_model, data_year=data_year).values())
]);

tree_clf = GradientBoostingClassifier(random_state=42)
result_report.append([
    str(model),
    'GradientBoostingClassifier',
    *list(train_model(tree_clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

svm_clf = svm.SVC(decision_function_shape='ovo', probability=True, random_state=42)
result_report.append([
    str(model),
    'SVC ovo',
    *list(train_model(svm_clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

model_res = pd.DataFrame(result_report,
                    columns=["Feature", "Classifier", "Accuracy", "Balanced Accuracy score",
                             "F1 micro score", "F1 macro score", "F1 weighted score", "MAE", "Confusion matrix"])

Start training...
Start training...
Start training...
Start training...


In [44]:
model_res.head()

Unnamed: 0,Feature,Classifier,Accuracy,Balanced Accuracy score,F1 micro score,F1 macro score,F1 weighted score,MAE,Confusion matrix
0,<tensorflow.python.keras.engine.functional.Fun...,LogisticRegression CV = 5,0.54482,0.369032,0.54482,0.313669,0.437061,0.597206,"[[3, 39, 120], [5, 32, 208], [2, 17, 433]]"
1,<tensorflow.python.keras.engine.functional.Fun...,LogisticRegression CV = 10,0.541327,0.364253,0.541327,0.305611,0.430852,0.601863,"[[2, 39, 121], [5, 30, 210], [2, 17, 433]]"
2,<tensorflow.python.keras.engine.functional.Fun...,GradientBoostingClassifier,0.513388,0.392241,0.513388,0.380391,0.468926,0.636787,"[[30, 38, 94], [42, 44, 159], [35, 50, 367]]"
3,<tensorflow.python.keras.engine.functional.Fun...,SVC ovo,0.543655,0.383852,0.543655,0.327749,0.42968,0.61234,"[[28, 13, 121], [32, 4, 209], [13, 4, 435]]"


In [48]:
import json

with open('attrib2vec.json', 'w') as f:
    json.dump(node2vec_model.wv, f)