In [1]:
import pandas as pd

In [2]:
%pwd

'/media/panayot/cf151fc4-0692-4c72-9d73-892b9c408127/home/panayot/Documents/site_similarity/notebooks/node_features_graphs/corpus 2020'

In [3]:
node_features_file = "corpus_2020_mixed_level_0_to_3_node_features.csv"
edge_file = "corpus_2020_mixed_level_0_to_3_labeded_edges.csv"

In [4]:
node_features_df = pd.read_csv(node_features_file, index_col=0)

In [5]:
node_features_df.head()

Unnamed: 0,alexa_ranks,daily_pageviews_per_visitors,daily_time_on_sites,total_sites_linking_ins,bounce_rate
militarypay.org,3679491.0,1.0,,29.0,
nodownpaymentlandloan.com,,,,,
radicalorange.tv,8574492.0,1.0,,1.0,
mattvanderhoff.com,,,,3.0,
cateringdc.com,,,,3.0,


In [6]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80752 entries, militarypay.org to unsceb.org
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   49619 non-null  float64
 1   daily_pageviews_per_visitors  49633 non-null  float64
 2   daily_time_on_sites           30032 non-null  float64
 3   total_sites_linking_ins       76914 non-null  float64
 4   bounce_rate                   26043 non-null  float64
dtypes: float64(5)
memory usage: 3.7+ MB


In [7]:
node_features_df.alexa_ranks = node_features_df.alexa_ranks.fillna(0)
node_features_df.total_sites_linking_ins = node_features_df.total_sites_linking_ins.fillna(0)

In [8]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80752 entries, militarypay.org to unsceb.org
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   80752 non-null  float64
 1   daily_pageviews_per_visitors  49633 non-null  float64
 2   daily_time_on_sites           30032 non-null  float64
 3   total_sites_linking_ins       80752 non-null  float64
 4   bounce_rate                   26043 non-null  float64
dtypes: float64(5)
memory usage: 3.7+ MB


# Normalizing features

In [9]:
node_features_df['normalized_alexa_rank'] = node_features_df['alexa_ranks'].apply(lambda x: 1/x if x else 0)

In [10]:
import math

node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_ins'].apply(lambda x: math.log2(x) if x else 0)

---

In [11]:
edge_df = pd.read_csv(edge_file)

edge_df.head()

Unnamed: 0,source,target,label
0,crooked.com,votesaveamerica.com,similar_by_audience_overlap_to
1,crooked.com,art19.com,similar_by_audience_overlap_to
2,crooked.com,promocodeportal.com,similar_by_audience_overlap_to
3,crooked.com,mediamatters.org,similar_by_audience_overlap_to
4,crooked.com,actblue.com,similar_by_audience_overlap_to


In [12]:
edge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186379 entries, 0 to 186378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   source  186379 non-null  object
 1   target  186379 non-null  object
 2   label   186379 non-null  object
dtypes: object(3)
memory usage: 4.3+ MB


In [13]:
import stellargraph as sg

In [14]:
G = sg.StellarGraph(node_features_df[['normalized_alexa_rank', 'normalized_total_sites_linked_in']], edge_df, edge_type_column='label')
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 80752, Edges: 186379

 Node types:
  default: [80752]
    Features: float32 vector, length 2
    Edge types: default-referral_site_to->default, default-similar_by_audience_overlap_to->default

 Edge types:
    default-similar_by_audience_overlap_to->default: [128582]
        Weights: all 1 (default)
        Features: none
    default-referral_site_to->default: [57797]
        Weights: all 1 (default)
        Features: none


# Unsupervised GraphSAGE

In [15]:
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

1. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.

In [16]:
nodes = list(G.nodes())
number_of_walks = 1
length = 5

2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.

In [17]:
unsupervised_samples = UnsupervisedSampler(
    G, nodes=nodes, length=length, number_of_walks=number_of_walks
)

3. Create a node pair generator:

In [18]:
batch_size = 128
epochs = 4
num_samples = [10, 5]

In [19]:
graphsage_link_generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_graphsage_link_gen = graphsage_link_generator.flow(unsupervised_samples)

In [20]:
layer_sizes = [128, 512]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=graphsage_link_generator, bias=True, dropout=0.0, normalize="l2"
)

In [22]:
# Build the model and expose input and output sockets of graphsage, for node pair inputs:
x_inp, x_out = graphsage.in_out_tensors()

In [23]:
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [26]:
graphsage_model = keras.Model(inputs=x_inp, outputs=prediction)

graphsage_model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.categorical_crossentropy,
    metrics=['acc'],
)

In [27]:
history = graphsage_model.fit(
    train_graphsage_link_gen,
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)

"""
Epoch 1/4
   2/5047 [..............................] - ETA: 3:09 - loss: 0.7847 - binary_accuracy: 0.515 - ETA: 9:20 - loss: 0.7903 - binary_accuracy: 0.5039WARNING:tensorflow:Callbacks method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0129s vs `on_train_batch_end` time: 0.2059s). Check your callbacks.
5047/5047 [==============================] - 548s 109ms/step - loss: 0.5881 - binary_accuracy: 0.7075 8:59 - ETA: 8:3 - ETA: 8:01 - - ETA: 7:50 - loss: 0.6312  - ETA: 7:35 - loss: 0.6251 - bin - ETA: 7:13 - loss: 0.6196 - binary_acc - ETA: 7:11 - loss: 0.6192 -  - ETA: 7:08 - loss: 0.6188 -  - ETA: 7:04 - loss: 0.6183 - binary_accu - ETA: 7:04 - loss: 0.6179 - binary_a - ETA: 7:01 - loss: 0.6174 -  - ETA: 6:52 - los - ETA: 6:48 - loss: 0.6148 - binary_accuracy: 0. - ETA: 6:47 - loss: 0.6146 - - ETA: 6:45 - loss: 0.6 - ETA: 6:41  - ETA: 6:31 - loss: 0.6121 - binary_accuracy: 0.6 - ETA: 6:30 - loss: 0. - ETA: 6:26 - los - ETA: 6:02 - loss: 0.608 - ETA: 5:58 -  - ETA: 5:54  - ETA: 4:32 - loss: 0.6004 -  - ETA: 4:30 - loss: 0.6001 - binary_accuracy:  - ETA: 4:29 - loss: 0.6001 - binary_accuracy: - ETA: 4:28 - loss: 0.6001 - binary_accu - ETA: 10s - loss: 0.5883 - bina - ETA: 9s - loss: 0.5883 - binary_accuracy: 0.7 - ETA: 9s - loss: 0.5883 - binary 
Epoch 2/4
5047/5047 [==============================] - 547s 108ms/step - loss: 0.5730 - binary_accuracy: 0.7293 9:22 - loss: 0.5795 - binary_accur - ETA: 8:40 - loss: 0.5779 - binary_accuracy - ETA: 8:38 - loss: 0.5780 - binary_accuracy: 0. - ETA: 8:37 - loss: 0.5778 - binary_accuracy - ETA: 8:36 - loss: 0.5779 - b - ETA:  - ETA: 5:54 - loss: 0.57 - ETA: 5:31 - lo - ETA: 5:02 - loss: 0.57 - ETA: 4:58 - loss: 0.5736 - binary_accuracy: 0.7 - ETA: 4:58 - loss: 0.5737 - - ETA: 4:48 - loss: 0.5738 - binary_accurac - ETA: 4:48 - l - ETA:  - ETA: 4:32 - loss: - ETA: 4:28 - loss: 0.5738 - binary_accuracy: 0.7 - ETA: 4:28 - loss: 0.5738  -  - ETA: 3:46 - loss: 0.5737 - binary_accuracy: 0.726 - ET - ETA: 3:41 - loss: 0.5737 - binary_ac - ETA: 3:39 -  - ETA: 3:34 - loss: 0.5737 - binary_ac - E - ETA: 2:50 - loss: 0.5736 - binary_accuracy: 0. - ETA: 2:24 - loss: 0.5735 - binary_accuracy: 0. - ETA: 2:23 - -
Epoch 3/4
5047/5047 [==============================] - 550s 109ms/step - loss: 0.5698 - binary_accuracy: 0.7398
Epoch 4/4
5047/5047 [==============================] - 546s 108ms/step - loss: 0.5698 - binary_accuracy: 0.7398 10:00 - loss:  - ETA: 4:31 - lo - ETA: 4:27 - loss: 0.5700 - binary_accu - - ETA: 4:07 - loss: 0.5699 - binary_accura - ETA: 3:59 - ETA: 3:55 - loss - ETA:  - ETA: 3:39 - loss: 0.5701 - binary_accuracy - ETA: 3:38 - loss: 0.5701 - binary_ac - ETA: 3: - ETA: 3:31 - loss: 0.5701 - binary_acc - ETA: 3:30 - loss: 0.5702 - bin - ETA: 3:27 - loss: 0 - ETA:  - ETA: 2:53 - lo - E - ETA: 2:37 - l - ETA: 2:32 - loss:  - ETA: 2:28 - loss: 0.5703 - binary_accuracy: 0. - ETA: 2:28 - loss: 0.5703 - binary_accura - ETA: 2:27 - loss: 0.57 - ETA: 2:23 - loss: 0.5702 - b - ETA: 2:21 -  - E - ETA: 54s - loss: 0.5698 - binary_a - ETA: 4s - loss: 0.5
"""

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4




In [28]:
embedding_model = keras.Model(inputs=x_inp[0::2], outputs=x_out[0])

In [29]:
from stellargraph.mapper import GraphSAGENodeGenerator

In [30]:
node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(node_features_df.index)
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)



In [31]:
embeddings_wv = dict(zip(node_features_df.index.tolist(), node_embeddings.tolist()))

In [35]:
embeddings_wv['crooked.com'][:10]

[0.09117964655160904,
 -0.0014781644567847252,
 -0.05536714196205139,
 0.08759300410747528,
 0.024940982460975647,
 0.041420988738536835,
 0.03607349470257759,
 -0.003150839125737548,
 0.014768715016543865,
 0.04115275666117668]

In [33]:
class ModelWrapper:
    def __init__(self, embeddings_wv):
        self.wv = embeddings_wv
        
    def __str__(self):
        return 'Unsupervised GraphSAGE'

In [31]:
import sys,os
sys.path.append("C:\\Users\\Paco\\Documents\\site_similarity")

In [37]:
from utils.notebook_utils import train_model

In [34]:
data_year = '2020'
node2vec_model = ModelWrapper(embeddings_wv)

C:\Users\Paco\Documents\site_similarity\notebooks\node_features_graphs


In [65]:
%run ../../utils/notebook_utils.py

In [39]:
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

In [57]:
result_report = []

clf = LogisticRegressionCV(Cs=10, cv=5, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
result_report.append([
    str(node2vec_model),
    'LogisticRegression CV = 5',
    *list(train_model(clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

clf2 = LogisticRegressionCV(Cs=10, cv=10, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
result_report.append([
    str(node2vec_model),
    'LogisticRegression CV = 10',
    *list(train_model(clf2, node2vec_model=node2vec_model, data_year=data_year).values())
]);

tree_clf = GradientBoostingClassifier(random_state=42)
result_report.append([
    str(node2vec_model),
    'GradientBoostingClassifier',
    *list(train_model(tree_clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

svm_clf = svm.SVC(decision_function_shape='ovo', probability=True, random_state=42)
result_report.append([
    str(node2vec_model),
    'SVC ovo',
    *list(train_model(svm_clf, node2vec_model=node2vec_model, task='bias', data_year=data_year).values())
]);

model_res = pd.DataFrame(result_report,
                    columns=["Feature", "Classifier", "Accuracy", "Balanced Accuracy score",
                             "F1 micro score", "F1 macro score", "F1 weighted score", "MAE", "Confusion matrix"])

Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Start training...
Start training...


In [42]:
model_res.head()

Unnamed: 0,Feature,Classifier,Accuracy,Balanced Accuracy score,F1 micro score,F1 macro score,F1 weighted score,MAE,Confusion matrix
0,Unsupervised GraphSAGE,LogisticRegression CV = 5,0.602817,0.517979,0.602817,0.44296,0.519788,0.540845,"[[177, 2, 76], [108, 1, 159], [77, 1, 464]]"
1,Unsupervised GraphSAGE,LogisticRegression CV = 10,0.583099,0.480394,0.583099,0.432259,0.511468,0.562441,"[[135, 26, 94], [88, 8, 172], [61, 3, 478]]"
2,Unsupervised GraphSAGE,GradientBoostingClassifier,0.552113,0.473805,0.552113,0.45545,0.521618,0.571831,"[[133, 50, 72], [89, 32, 147], [60, 59, 423]]"
3,Unsupervised GraphSAGE,SVC ovo,0.612207,0.52973,0.612207,0.448777,0.526555,0.523944,"[[186, 0, 69], [115, 0, 153], [76, 0, 466]]"


In [43]:
model_res.head()

Unnamed: 0,Feature,Classifier,Accuracy,Balanced Accuracy score,F1 micro score,F1 macro score,F1 weighted score,MAE,Confusion matrix
0,Unsupervised GraphSAGE,LogisticRegression CV = 5,0.529686,0.394836,0.529686,0.363607,0.454971,0.6461,"[[35, 20, 107], [38, 21, 186], [44, 9, 399]]"
1,Unsupervised GraphSAGE,LogisticRegression CV = 10,0.532014,0.379328,0.532014,0.349408,0.453807,0.633295,"[[16, 29, 117], [23, 34, 188], [25, 20, 407]]"
2,Unsupervised GraphSAGE,GradientBoostingClassifier,0.49709,0.391707,0.49709,0.384743,0.465828,0.643772,"[[36, 46, 80], [45, 47, 153], [41, 67, 344]]"
3,Unsupervised GraphSAGE,SVC ovo,0.48312,0.455694,0.48312,0.381256,0.411339,0.636787,"[[3, 136, 100], [0, 211, 60], [3, 145, 201]]"


In [58]:
model_res.head()

Unnamed: 0,Feature,Classifier,Accuracy,Balanced Accuracy score,F1 micro score,F1 macro score,F1 weighted score,MAE,Confusion matrix
0,Unsupervised GraphSAGE,LogisticRegression CV = 5,0.536671,0.376078,0.536671,0.339325,0.452775,0.603027,"[[8, 43, 111], [14, 41, 190], [9, 31, 412]]"
1,Unsupervised GraphSAGE,LogisticRegression CV = 10,0.536671,0.374906,0.536671,0.337133,0.448573,0.615832,"[[9, 30, 123], [15, 37, 193], [8, 29, 415]]"
2,Unsupervised GraphSAGE,GradientBoostingClassifier,0.518044,0.404907,0.518044,0.398006,0.481649,0.607683,"[[35, 48, 79], [42, 49, 154], [29, 62, 361]]"
3,Unsupervised GraphSAGE,SVC ovo,0.476135,0.445895,0.476135,0.36956,0.401445,0.648428,"[[0, 133, 106], [2, 201, 68], [1, 140, 208]]"


In [36]:
import json

with open('graphsage.json', 'w') as f:
    json.dump(node2vec_model.wv, f)

In [37]:
print('DONE')

DONE
