In [1]:
import pandas as pd

In [2]:
# Shell escape: on Windows a bare `cd` prints the current working directory,
# which is what the relative data paths used below are resolved against.
!cd

C:\Users\Paco\Documents\site_similarity\notebooks\node_features_graphs


In [3]:
# Both input CSVs live in the same folder and share a corpus prefix.
features_dir = "../generate_node_features"
corpus_prefix = "corpus_2018_audience_overlap_level_0_and_1"

node_features_file = f"{features_dir}/{corpus_prefix}_node_features.csv"
edge_file = f"{features_dir}/{corpus_prefix}_edges.csv"

In [4]:
# Load the per-site feature table; index_col=0 turns the first CSV column
# (the site URL) into the row index.
node_features_df = pd.read_csv(node_features_file, index_col=0)

In [5]:
# Preview the first rows; note that several sites have missing (NaN) features.
node_features_df.head()

Unnamed: 0_level_0,alexa_ranks,daily_pageviews_per_visitors,daily_time_on_sites,total_sites_linking_ins,bounce_rate
urls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
crotonmusiccenter.com,,,,,
gracefuneralhome.com,,,,4.0,
haaretz.com,11522.0,1.5,152.0,13974.0,0.65
osubeavers.com,422967.0,2.5,299.0,430.0,0.471
prophezine.com,8782212.0,1.0,,209.0,


In [6]:
# Check dtypes and non-null counts to see how sparse each feature column is.
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11865 entries, crotonmusiccenter.com to yorkregion.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   8658 non-null   float64
 1   daily_pageviews_per_visitors  8659 non-null   float64
 2   daily_time_on_sites           6459 non-null   float64
 3   total_sites_linking_ins       11470 non-null  float64
 4   bounce_rate                   6023 non-null   float64
dtypes: float64(5)
memory usage: 556.2+ KB


In [7]:
# Replace missing values with 0 in the two columns that are used as graph
# node features further down; the other columns are left untouched.
for sparse_column in ("alexa_ranks", "total_sites_linking_ins"):
    node_features_df[sparse_column] = node_features_df[sparse_column].fillna(0)

In [8]:
# Re-check non-null counts: alexa_ranks and total_sites_linking_ins should now be complete.
node_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11865 entries, crotonmusiccenter.com to yorkregion.com
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alexa_ranks                   11865 non-null  float64
 1   daily_pageviews_per_visitors  8659 non-null   float64
 2   daily_time_on_sites           6459 non-null   float64
 3   total_sites_linking_ins       11865 non-null  float64
 4   bounce_rate                   6023 non-null   float64
dtypes: float64(5)
memory usage: 556.2+ KB


# Normalizing features

In [10]:
def _inverse_rank(rank):
    """Return 1/rank, or 0 when rank is 0 so no division by zero occurs."""
    return 1 / rank if rank else 0


node_features_df['normalized_alexa_rank'] = node_features_df['alexa_ranks'].map(_inverse_rank)

In [11]:
import math

def _log2_or_zero(count):
    """Return log2(count), or 0 when count is 0 so log2 is never called on zero."""
    return math.log2(count) if count else 0


node_features_df['normalized_total_sites_linked_in'] = (
    node_features_df['total_sites_linking_ins'].map(_log2_or_zero)
)

---

In [12]:
 edge_df = pd.read_csv(edge_file)

edge_df.head()

Unnamed: 0,source,target
0,villagevoice.com,nylon.com
1,villagevoice.com,slantmagazine.com
2,villagevoice.com,gothamist.com
3,villagevoice.com,screendaily.com
4,villagevoice.com,amny.com


In [13]:
# Confirm the edge table has complete `source` and `target` columns.
edge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20399 entries, 0 to 20398
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  20399 non-null  object
 1   target  20399 non-null  object
dtypes: object(2)
memory usage: 318.9+ KB


In [14]:
import stellargraph as sg

In [15]:
# Only the two normalized columns are attached to the graph as node features.
graph_feature_columns = ['normalized_alexa_rank', 'normalized_total_sites_linked_in']
graph_node_features = node_features_df[graph_feature_columns]

G = sg.StellarGraph(graph_node_features, edge_df)
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 11865, Edges: 20399

 Node types:
  default: [11865]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [20399]
        Weights: all 1 (default)
        Features: none


# Unsupervised GraphSAGE

In [16]:
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UnsupervisedSampler

from tensorflow import keras

1. Specify the other optional parameter values: root nodes, the number of walks to take per node, and the length of each walk. A random seed can also be supplied; none is set here, so the sampled walks differ between runs.

In [17]:
# Random-walk settings: one walk of length 5 started from every node of G.
number_of_walks, length = 1, 5
nodes = list(G.nodes())

2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.

In [18]:
# Collect the walk settings first, then hand them to the sampler in one go.
sampler_options = dict(nodes=nodes, length=length, number_of_walks=number_of_walks)
unsupervised_samples = UnsupervisedSampler(G, **sampler_options)

3. Create a node pair generator:

In [19]:
# Training settings shared by the generators and the fit call below.
batch_size, epochs = 50, 4
# Passed to the GraphSAGE generators as `num_samples`; it has two entries,
# the same number as the GraphSAGE `layer_sizes`.
num_samples = [10, 5]

In [20]:
# Generator that turns sampled node pairs into GraphSAGE link-model batches.
graphsage_link_generator = GraphSAGELinkGenerator(
    G,
    batch_size=batch_size,
    num_samples=num_samples,
)
# Bind the generator to the random-walk sampler to obtain the training sequence.
train_graphsage_link_gen = graphsage_link_generator.flow(unsupervised_samples)

In [21]:
# Two GraphSAGE layers of width 50 each; the final layer width is the
# dimensionality of the node embeddings produced later.
layer_sizes = [50, 50]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=graphsage_link_generator,
    bias=True,
    dropout=0.0,
    normalize="l2",
)

In [22]:
# Build the GraphSAGE layers and unpack their input/output tensors, which
# are shaped for node-pair (link) inputs because a link generator was used.
graphsage_tensors = graphsage.in_out_tensors()
x_inp, x_out = graphsage_tensors

In [23]:
# Link-prediction head: combine the two node embeddings with the 'ip' method
# and squash the result through a sigmoid to a single output.
link_scorer = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="ip",
)
prediction = link_scorer(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [24]:
# Wrap the GraphSAGE inputs and the link-prediction output in a trainable Keras model.
graphsage_model = keras.Model(inputs=x_inp, outputs=prediction)

graphsage_model.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [25]:
# Train the link-prediction model on the sampled node pairs.
fit_options = dict(
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)
history = graphsage_model.fit(train_graphsage_link_gen, **fit_options)

  ...
    to  
  ['...']
Train for 1899 steps
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [26]:
# Re-use the trained layers as a single-node embedding model.
# NOTE(review): presumably the even-indexed inputs and the first output belong
# to the first node of each pair — confirm against the StellarGraph docs.
source_inputs = x_inp[0::2]
source_embedding = x_out[0]
embedding_model = keras.Model(inputs=source_inputs, outputs=source_embedding)

In [27]:
from stellargraph.mapper import GraphSAGENodeGenerator

In [28]:
# Feed every node (in the order of node_features_df.index) through the
# embedding model; the row order of the result follows that index order.
node_generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
node_gen = node_generator.flow(node_features_df.index)
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)



In [29]:
# Map each site URL to its embedding vector (same order as the prediction input).
embeddings_wv = {
    site_url: embedding
    for site_url, embedding in zip(node_features_df.index, node_embeddings)
}

In [30]:
# Spot-check a single site's embedding vector.
embeddings_wv['villagevoice.com']

array([-0.02094992, -0.1328248 ,  0.07507049, -0.00463302,  0.06558063,
       -0.07781564,  0.10833558,  0.04245037, -0.06713997, -0.15222527,
       -0.13741234, -0.09656978, -0.04023855, -0.07948432,  0.03206547,
       -0.05928008,  0.0360953 , -0.12924862,  0.12300256,  0.07467916,
       -0.11858932, -0.08533636,  0.10595208,  0.08225381,  0.13885903,
       -0.25729102, -0.00386654,  0.0494529 ,  0.112965  , -0.18183158,
       -0.00151612,  0.2918245 , -0.02819349,  0.23774901,  0.22809309,
       -0.25974405, -0.02869534, -0.23449852,  0.09044959, -0.11390558,
        0.29788902, -0.12123151,  0.00601559,  0.29668498, -0.11312789,
        0.06312427,  0.17530625, -0.227219  , -0.15350504,  0.13797605],
      dtype=float32)

In [31]:
class ModelWrapper:
    """Minimal stand-in for an embedding model.

    Exposes the mapping it is given on the ``wv`` attribute and reports a
    fixed label when converted to a string.
    """

    _LABEL = 'Unsupervised GraphSAGE'

    def __init__(self, embeddings_wv):
        """Keep a reference to ``embeddings_wv`` as ``self.wv``."""
        self.wv = embeddings_wv

    def __str__(self):
        """Return the fixed label identifying this embedding source."""
        return self._LABEL

In [32]:
import os
import sys

# Make the repository root importable so that `utils.notebook_utils` resolves.
# The notebook runs from <repo>/notebooks/node_features_graphs, so the root is
# two levels up; a relative path avoids hard-coding one user's home directory.
repo_root = os.path.abspath(os.path.join("..", ".."))
if repo_root not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(repo_root)

In [33]:
from utils.notebook_utils import train_model

In [37]:
# The variable is called `node2vec_model` because that is the keyword
# train_model is called with below; the vectors it wraps come from GraphSAGE.
node2vec_model = ModelWrapper(embeddings_wv)
# Corpus year forwarded to train_model.
data_year = '2018'

In [52]:
# Shell escape: print the working directory (Windows `cd`) to confirm where
# relative paths such as ../../utils resolve from.
! cd

C:\Users\Paco\Documents\site_similarity\notebooks\node_features_graphs


In [65]:
# Execute the shared utilities script and load the names it defines into the
# notebook namespace.
# NOTE(review): presumably this re-defines train_model after edits to the file — confirm.
%run ../../utils/notebook_utils.py

In [38]:
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

In [39]:
# Baseline: cross-validated logistic regression on the GraphSAGE embeddings.
clf = LogisticRegressionCV(
    Cs=10,
    cv=5,
    scoring="accuracy",
    multi_class="ovr",
    max_iter=300,
    random_state=42,
)
resss = train_model(clf, node2vec_model=node2vec_model, data_year=data_year)


Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [40]:
# Show the metrics dictionary returned by train_model for the baseline classifier.
resss

{'Accuracy': 0.6028169014084507,
 'Balanced Accuracy score': 0.5179791837426714,
 'F1 micro score': 0.6028169014084507,
 'F1 macro score': 0.44296030283519466,
 'F1 weighted score': 0.5197880195822198,
 'MAE': 0.5408450704225352,
 'Confusion matrix': [[177, 2, 76], [108, 1, 159], [77, 1, 464]]}

In [41]:
# Benchmark several downstream classifiers on the same embeddings and collect
# one result row per classifier.
clf = LogisticRegressionCV(Cs=10, cv=5, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
clf2 = LogisticRegressionCV(Cs=10, cv=10, scoring="accuracy", multi_class="ovr", max_iter=300, random_state=42)
tree_clf = GradientBoostingClassifier(random_state=42)
svm_clf = svm.SVC(decision_function_shape='ovo', probability=True, random_state=42)

# (label shown in the report, estimator) in the order they are evaluated.
classifiers = [
    ('LogisticRegression CV = 5', clf),
    ('LogisticRegression CV = 10', clf2),
    ('GradientBoostingClassifier', tree_clf),
    ('SVC ovo', svm_clf),
]

result_report = []
for classifier_label, classifier in classifiers:
    scores = train_model(classifier, node2vec_model=node2vec_model, data_year=data_year)
    # The feature label comes from the wrapper defined in this notebook; the
    # previous `str(model)` referred to a name that is never defined here and
    # raised NameError on a fresh kernel.
    result_report.append([str(node2vec_model), classifier_label, *scores.values()])

model_res = pd.DataFrame(
    result_report,
    columns=["Feature", "Classifier", "Accuracy", "Balanced Accuracy score",
             "F1 micro score", "F1 macro score", "F1 weighted score", "MAE", "Confusion matrix"],
)

Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Start training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Start training...
Start training...


In [42]:
# Compare the classifiers side by side (one row per classifier).
model_res.head()

Unnamed: 0,Feature,Classifier,Accuracy,Balanced Accuracy score,F1 micro score,F1 macro score,F1 weighted score,MAE,Confusion matrix
0,Unsupervised GraphSAGE,LogisticRegression CV = 5,0.602817,0.517979,0.602817,0.44296,0.519788,0.540845,"[[177, 2, 76], [108, 1, 159], [77, 1, 464]]"
1,Unsupervised GraphSAGE,LogisticRegression CV = 10,0.583099,0.480394,0.583099,0.432259,0.511468,0.562441,"[[135, 26, 94], [88, 8, 172], [61, 3, 478]]"
2,Unsupervised GraphSAGE,GradientBoostingClassifier,0.552113,0.473805,0.552113,0.45545,0.521618,0.571831,"[[133, 50, 72], [89, 32, 147], [60, 59, 423]]"
3,Unsupervised GraphSAGE,SVC ovo,0.612207,0.52973,0.612207,0.448777,0.526555,0.523944,"[[186, 0, 69], [115, 0, 153], [76, 0, 466]]"
