In [1]:
import pandas as pd

In [2]:
%pwd

'/home/panayot/Documents/site_similarity/notebooks/node_features_graphs/corpus 2020'

In [3]:
node_features_file = "../../generate_node_features/corpus_2020_audience_overlap_level_0_and_1_node_features.csv"
edge_file = "../../generate_node_features/combined_data_corpus_2020_level_0_1_df_edges.csv"

In [4]:
node_features_df = pd.read_csv(node_features_file, index_col=0)

In [5]:
node_features_df.head()

Unnamed: 0,alexa_ranks,daily_pageviews_per_visitors,daily_time_on_sites,total_sites_linking_ins,bounce_rate
gradescope.com,11014.0,4.7,296.0,103.0,0.222
parentlink.net,151438.0,3.0,203.0,93.0,0.301
nationalpartnership.org,604522.0,1.3,156.0,811.0,0.765
sharondraper.com,1209734.0,2.0,109.0,209.0,0.615
trade.gov,55944.0,1.9,137.0,2392.0,0.654


In [6]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12303 entries, gradescope.com to growveg.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   9128 non-null   float64
 1   daily_pageviews_per_visitors  9129 non-null   float64
 2   daily_time_on_sites           6780 non-null   float64
 3   total_sites_linking_ins       11966 non-null  float64
 4   bounce_rate                   6300 non-null   float64
dtypes: float64(5)
memory usage: 576.7+ KB


In [7]:
node_features_df.alexa_ranks = node_features_df.alexa_ranks.fillna(0)
node_features_df.total_sites_linking_ins = node_features_df.total_sites_linking_ins.fillna(0)

In [8]:
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12303 entries, gradescope.com to growveg.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   12303 non-null  float64
 1   daily_pageviews_per_visitors  9129 non-null   float64
 2   daily_time_on_sites           6780 non-null   float64
 3   total_sites_linking_ins       12303 non-null  float64
 4   bounce_rate                   6300 non-null   float64
dtypes: float64(5)
memory usage: 576.7+ KB


# Normalizing features

In [9]:
node_features_df['normalized_alexa_rank'] = node_features_df['alexa_ranks'].apply(lambda x: 1/x if x else 0)

In [10]:
import math

node_features_df['normalized_total_sites_linked_in'] = node_features_df['total_sites_linking_ins'].apply(lambda x: math.log2(x) if x else 0)

---

In [11]:
edge_df = pd.read_csv(edge_file)

edge_df.head()

Unnamed: 0,source,target
0,crooked.com,votesaveamerica.com
1,crooked.com,art19.com
2,crooked.com,promocodeportal.com
3,crooked.com,mediamatters.org
4,crooked.com,actblue.com


In [12]:
edge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28381 entries, 0 to 28380
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  28381 non-null  object
 1   target  28381 non-null  object
dtypes: object(2)
memory usage: 443.6+ KB


In [13]:
import stellargraph as sg

In [14]:
G = sg.StellarGraph(node_features_df[['normalized_alexa_rank', 'normalized_total_sites_linked_in']], edge_df)
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 12303, Edges: 28381

 Node types:
  default: [12303]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [28381]
        Weights: all 1 (default)
        Features: none


# Unsupervised GraphSAGE

In [15]:
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

1. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.

In [16]:
nodes = list(G.nodes())
number_of_walks = 1
length = 5

2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.

In [17]:
unsupervised_samples = UnsupervisedSampler(
    G, nodes=nodes, length=length, number_of_walks=number_of_walks
)

3. Create a node pair generator:

In [18]:
batch_size = 50
epochs = 4
num_samples = [10, 5]

In [19]:
graphsage_link_generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_graphsage_link_gen = graphsage_link_generator.flow(unsupervised_samples)

In [20]:
layer_sizes = [50, 50]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=graphsage_link_generator, bias=True, dropout=0.0, normalize="l2"
)

In [21]:
# Build the model and expose input and output sockets of graphsage, for node pair inputs:
x_inp, x_out = graphsage.in_out_tensors()

In [22]:
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [23]:
graphsage_model = keras.Model(inputs=x_inp, outputs=prediction)

graphsage_model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [24]:
history = graphsage_model.fit(
    train_graphsage_link_gen,
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [25]:
embedding_model = keras.Model(inputs=x_inp[0::2], outputs=x_out[0])

In [26]:
from stellargraph.mapper import GraphSAGENodeGenerator

In [27]:
node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(node_features_df.index)
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)



In [28]:
embeddings_wv = dict(zip(node_features_df.index.tolist(), node_embeddings))

In [29]:
embeddings_wv['crooked.com']

array([ 0.05971478,  0.00864197, -0.08105478,  0.0357924 ,  0.04657223,
       -0.05845314,  0.04726389, -0.07202234,  0.06571876,  0.06593163,
       -0.08006115,  0.05414655, -0.01804004,  0.03212657,  0.07043181,
       -0.06138796, -0.02328918, -0.08573235,  0.00427705, -0.05260979,
       -0.08888519,  0.06268027, -0.12115914,  0.08072276, -0.00535161,
        0.03818655, -0.28611764,  0.00416019, -0.17649493,  0.126567  ,
       -0.1497932 ,  0.34577852,  0.07816377,  0.16836183,  0.12882394,
       -0.23789914, -0.36519507, -0.11175837, -0.03432125,  0.20505273,
        0.16518307,  0.05929676, -0.20232311,  0.20741707, -0.2757201 ,
        0.09132462, -0.25488302,  0.1479867 ,  0.12509027, -0.18111931],
      dtype=float32)

In [30]:
class ModelWrapper:
    def __init__(self, embeddings_wv):
        self.wv = embeddings_wv
        
    def __str__(self):
        return 'Unsupervised GraphSAGE'

In [31]:
import sys,os
sys.path.append("C:\\Users\\Paco\\Documents\\site_similarity")

In [32]:
from utils.notebook_utils import train_model

ModuleNotFoundError: No module named 'utils'

In [37]:
data_year = '2020'
node2vec_model = ModelWrapper(embeddings_wv)

C:\Users\Paco\Documents\site_similarity\notebooks\node_features_graphs


In [65]:
%run ../../utils/notebook_utils.py

In [38]:
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

In [41]:
result_report = []

clf = LogisticRegressionCV(Cs=10, cv=5, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
result_report.append([
    str(node2vec_model),
    'LogisticRegression CV = 5',
    *list(train_model(clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

clf2 = LogisticRegressionCV(Cs=10, cv=10, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
result_report.append([
    str(node2vec_model),
    'LogisticRegression CV = 10',
    *list(train_model(clf2, node2vec_model=node2vec_model, data_year=data_year).values())
]);

tree_clf = GradientBoostingClassifier(random_state=42)
result_report.append([
    str(node2vec_model),
    'GradientBoostingClassifier',
    *list(train_model(tree_clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

svm_clf = svm.SVC(decision_function_shape='ovo', probability=True, random_state=42)
result_report.append([
    str(node2vec_model),
    'SVC ovo',
    *list(train_model(svm_clf, node2vec_model=node2vec_model, data_year=data_year).values())
]);

model_res = pd.DataFrame(result_report,
                    columns=["Feature", "Classifier", "Accuracy", "Balanced Accuracy score",
                             "F1 micro score", "F1 macro score", "F1 weighted score", "MAE", "Confusion matrix"])

Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Start training...
Start training...


In [42]:
model_res.head()

Unnamed: 0,Feature,Classifier,Accuracy,Balanced Accuracy score,F1 micro score,F1 macro score,F1 weighted score,MAE,Confusion matrix
0,Unsupervised GraphSAGE,LogisticRegression CV = 5,0.602817,0.517979,0.602817,0.44296,0.519788,0.540845,"[[177, 2, 76], [108, 1, 159], [77, 1, 464]]"
1,Unsupervised GraphSAGE,LogisticRegression CV = 10,0.583099,0.480394,0.583099,0.432259,0.511468,0.562441,"[[135, 26, 94], [88, 8, 172], [61, 3, 478]]"
2,Unsupervised GraphSAGE,GradientBoostingClassifier,0.552113,0.473805,0.552113,0.45545,0.521618,0.571831,"[[133, 50, 72], [89, 32, 147], [60, 59, 423]]"
3,Unsupervised GraphSAGE,SVC ovo,0.612207,0.52973,0.612207,0.448777,0.526555,0.523944,"[[186, 0, 69], [115, 0, 153], [76, 0, 466]]"
