# Basic definitions. Centrality Measures. Feature engineering.

In this notebook we introduce some basics of the NetworkX library. A more in-depth tutorial can be found [here](https://networkx.github.io/documentation/stable/tutorial.html)

In [None]:
import os
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

### Networkx basics

In [None]:
# creating an empty graph
# nx.Graph is the undirected graph class; the graph is given the name "Friendship".
G = nx.Graph(name="Friendship")

In [None]:
# adding one node
# The node is identified by its label, here the string "John".
G.add_node("John")

In [None]:
# adding several nodes as a list
# Each list element becomes one node label.
G.add_nodes_from(["Liz", "Katie", "Sam", "Bob", "Tom", "Mary"])

In [None]:
# adding an edge
# Both endpoints ("Liz" and "John") were already added as nodes above.
G.add_edge("Liz","John")

In [None]:
# adding a list of edges:
# Each tuple is one (u, v) edge. Note that ("Liz", "John") repeats the edge added in the previous step.
G.add_edges_from([("Liz","John"),("Katie","Sam"), ("Bob","Katie"), ("Katie", "Tom"), ("Liz", "Tom")])

In [None]:
# Inspect the graph contents: node labels first, then the edges.
print(G.nodes())
print(G.edges())

In [None]:
# Print the graph object itself (the exact text depends on the installed networkx version).
print(G)

In [None]:
# Draw the graph with node labels. The node positions come from an explicitly
# seeded spring layout so the figure is identical on every run.
nx.draw(G, pos=nx.spring_layout(G, seed=42), with_labels=True)

### __Question: what will happen if we add edges with the nodes that do not exist in the graph?__

In [None]:
# Every edge below references at least one node that has not been added to G yet:
# two new string labels, two new integer labels, and an existing node ("Bob") paired with a new one.
G.add_edges_from([("A","B"), (1,2), ("Bob","Santa")])

In [None]:
# Answer

In [None]:
# removing an edge
# NOTE: relies on the (1, 2) edge added in the previous step still being present,
# so this cell is meant to be run once, in order.
print(G.edges())
G.remove_edge(1,2)
print(G.edges()) # edge is removed
print(G.nodes()) # but nodes are still there

In [None]:
# Remove the two integer nodes that were left behind after their edge was removed.
G.remove_nodes_from([1,2])

In [None]:
# Relabeling nodes
# Mapping of old label -> new label; only "A" and "B" are renamed.
correction = {"A":"Tim", "B":"Ben"}

# The relabeled graph is bound to a new name (H); later cells work on H.
H = nx.relabel_nodes(G, correction)
# Seeded spring layout so the drawing is reproducible across runs.
nx.draw(H, pos=nx.spring_layout(H, seed=42), with_labels=True)

In [None]:
# Changing the graph to directed
H_directed = H.to_directed()
print(H_directed)
# Seeded spring layout so the drawing is reproducible across runs.
nx.draw(H_directed, pos=nx.spring_layout(H_directed, seed=42), with_labels=True)

In [None]:
# connected components and isolates
# Count of connected components in the undirected graph H.
nx.number_connected_components(H)

In [None]:
# The call alone does not show the components themselves; see the next cell.
nx.connected_components(H) # displays an object

In [None]:
# Materialise the components so the cell output shows the actual node groups.
list(nx.connected_components(H)) # transform it to the list for the results

In [None]:
# Isolates are nodes without any edges; wrapped in list() so the result is displayed.
list(nx.isolates(H))

### Creating graph from different formats

In [None]:
# Build a graph from a 3x3 symmetric 0/1 adjacency matrix: every off-diagonal
# entry is 1, so each pair of distinct nodes is connected.
adjacency_matrix = np.array([
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 0],
])
G = nx.from_numpy_array(adjacency_matrix)
print("Nodes: ")
print(G.nodes())
print("Edges: ")
print(G.edges())
# Seeded spring layout so the drawing is reproducible across runs.
nx.draw(G, pos=nx.spring_layout(G, seed=42))

In [None]:
# Adjacency list: each key is a node and its value lists that node's neighbours.
# Nodes such as 'C' and 'F' only ever appear as neighbours, never as keys.
adjacency_list = dict([
    ('A', ['B', 'C', 'D', 'F', 'G']),
    ('B', ['A', 'G']),
    ('D', ['A', 'C']),
    ('G', ['C', 'F']),
])

In [None]:
# Build a directed graph from the adjacency list defined above.
G = nx.from_dict_of_lists(adjacency_list, create_using=nx.DiGraph())
print(G)
# Seeded spring layout so the drawing is reproducible across runs.
nx.draw(G, pos=nx.spring_layout(G, seed=42), with_labels=True)

### Loading from data

#### Data description

>Network of coappearances of characters in the Game of Thrones series, by George R. R. Martin, and in particular coappearances in the book "A Storm of Swords." Nodes are unique characters, and edges are weighted by the number of times the two characters' names appeared within 15 words of each other in the text.

Download data from http://www.macalester.edu/~abeverid/data/stormofswords.csv

In [None]:
# Directory (relative to the notebook) and full path of the Game of Thrones CSV.
data_dir = "../data/GoT"
data_location = os.path.join(data_dir, "stormofswords.csv")
data_location = os.path.expanduser(data_location)

In [None]:
# Load the downloaded CSV into a DataFrame.
dt = pd.read_csv(data_location)

In [None]:
# Preview the first rows to check the column names used below.
dt.head()

In [None]:
# (rows, columns) of the loaded table.
dt.shape

In [None]:
# Build an undirected graph with one edge per row, using the 'Source' and 'Target' columns as endpoints.
# NOTE(review): no edge attributes are requested here, so the co-appearance weights
# mentioned in the data description are not attached to the edges.
G = nx.from_pandas_edgelist(dt, source='Source', target='Target')
print(G)

In [None]:
# Count of connected components in the character network.
nx.number_connected_components(G)

### Plotting

In [None]:
# More sophisticated graph drawing

plt.figure(figsize=(10,8)) # change the size of the figure
# Force-directed layout. The fixed seed pins the random starting positions so
# the rendered figure is reproducible across runs.
pos = nx.fruchterman_reingold_layout(G, iterations=10, seed=42) # specify layout
# nodes
nx.draw_networkx_nodes(G, pos, node_size=600, node_color='#ec008c', edgecolors='grey', alpha=0.3)

# edges
nx.draw_networkx_edges(G, pos,
                       width=1, alpha=0.5, edge_color='grey')

# labels
nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

plt.axis('off')
plt.show()

### Centrality measures

In [None]:
# Degree centrality per node; the result is used as a {node: score} mapping below.
degree_centrality = nx.degree_centrality(G) 

In [None]:
# (node, score) pairs ordered from the highest to the lowest degree centrality.
degree_centrality_sorted = sorted(
    degree_centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [None]:
# Five nodes with the highest degree centrality.
degree_centrality_sorted[:5]

In [None]:
# One marker size per node, proportional to its degree centrality and listed
# in the graph's own node order so sizes line up with the nodes when drawing.
node_size = [degree_centrality[node] * 3000 for node in G.nodes()]

In [None]:
# Same drawing recipe as before, but with a Kamada-Kawai layout and node sizes
# scaled by degree centrality.
plt.figure(figsize=(10, 8))
pos = nx.kamada_kawai_layout(G)

# nodes
nx.draw_networkx_nodes(
    G,
    pos,
    node_size=node_size,
    node_color='#ec008c',
    edgecolors='grey',
    alpha=0.3,
)

# edges
nx.draw_networkx_edges(G, pos, width=1, alpha=0.5, edge_color='grey')

# labels
nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

plt.axis('off')
plt.show()

In [None]:
# Betweenness centrality per node, then the five highest-scoring nodes.
betweenness = nx.betweenness_centrality(G)
betweenness_sorted = sorted(
    betweenness.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)
betweenness_sorted[:5]

In [None]:
# Closeness centrality per node, then the five highest-scoring nodes.
closeness = nx.closeness_centrality(G)
closeness_sorted = sorted(
    closeness.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)
closeness_sorted[:5]

In [None]:
# Eigenvector centrality per node, then the five highest-scoring nodes.
eigenvector = nx.eigenvector_centrality(G)
eigenvector_sorted = sorted(
    eigenvector.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)
eigenvector_sorted[:5]

### Cora dataset

The dataset is the citation network Cora.

It can be downloaded by clicking [here](https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz)

The following is the description of the dataset from the publisher,

> The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words. The README file in the dataset provides more details. 

Download and unzip the cora.tgz file to a location on your computer. 

We assume that the dataset is stored in the directory

`../data/cora/`

where the files `cora.cites` and `cora.content` can be located.

In [None]:
# Directory holding cora.cites and cora.content.
# NOTE: this rebinds the data_dir name that pointed at the Game of Thrones data earlier.
data_dir = "../data/cora"

In [None]:
# load edgelist data
# The file has no header row, so the two columns are named explicitly; a constant
# "label" column marks every row as a "cites" relation.
edgelist = pd.read_table(
    os.path.join(data_dir, "cora.cites"),
    header=None,
    names=["source", "target"],
).assign(label="cites")

In [None]:
# Preview the first citation rows.
edgelist.head()

In [None]:
# Build the citation graph from the edge list; no column names are passed, so the
# "source"/"target" columns named when loading the file are relied upon here.
# NOTE(review): the "label" column is not requested as an edge attribute.
g_nx = nx.from_pandas_edgelist(edgelist)

In [None]:
# Print the citation graph object (text depends on the installed networkx version).
print(g_nx)

In [None]:
# Processing cora data attributes
# Columns of cora.content: a paper id, 1433 word-indicator features named
# w_0 ... w_1432, and finally the paper's subject.
feature_names = list(map("w_{}".format, range(1433)))
column_names = ['paper_id', *feature_names, "subject"]
node_data = pd.read_table(
    os.path.join(data_dir, "cora.content"),
    header=None,
    names=column_names,
)

In [None]:
# Preview the first rows of the per-paper attribute table.
node_data.head()

In [None]:
# Map each paper_id to its subject using only the two columns that are needed.
# Iterating over whole rows instead would materialise all 1,435 columns for
# every paper just to read the first and the last value.
values = dict(zip(node_data['paper_id'], node_data['subject']))
nx.set_node_attributes(g_nx, values, 'subject') # assign attributes to nodes

In [None]:
# Sanity check: each entry is a (node, attribute-dict) pair and should now carry 'subject'.
list(g_nx.nodes(data=True))[:5] # print first few lines of the nodes with their attributes

In [None]:
# Count of connected components in the citation graph.
nx.number_connected_components(g_nx)

In [None]:
# Select the largest connected component
# Pick the largest node set first and copy only that one component, rather than
# building a copied subgraph for every component just to compare their sizes.
# NOTE: g_nx is rebound here, so from this point on it refers to the largest component only.
largest_cc_nodes = max(nx.connected_components(g_nx), key=len)
g_nx = g_nx.subgraph(largest_cc_nodes).copy()
print("Largest subgraph statistics: {} nodes, {} edges".format(
    g_nx.number_of_nodes(), g_nx.number_of_edges()))

### Graph filtering

In [None]:
# Create a subgraph of papers that belong to a class: "Rule learning" and plot it.

In [None]:
# Keep the nodes whose 'subject' attribute is 'Rule_Learning' and take the
# subgraph induced by them.
selected_nodes = [
    node
    for node, attrs in g_nx.nodes(data=True)
    if attrs['subject'] == 'Rule_Learning'
]
subgraph_rl = g_nx.subgraph(selected_nodes)

In [None]:
# Print the filtered subgraph object.
print(subgraph_rl)

In [None]:
plt.figure(figsize=(10,8)) # change the size of the figure
# Force-directed layout. The fixed seed pins the random starting positions so
# the rendered figure is reproducible across runs.
pos = nx.fruchterman_reingold_layout(subgraph_rl, iterations=10, seed=42) # specify layout
# nodes
nx.draw_networkx_nodes(subgraph_rl, pos, node_size=800, node_color='lightblue', edgecolors='grey')

# edges
nx.draw_networkx_edges(subgraph_rl, pos,
                       width=1, alpha=0.5, edge_color='grey')

plt.axis('off')
plt.show()

### Exercise: now output the subgraph for any other paper subject. Optional: try to change the size of the nodes based on degree or any other metric

## Predict a paper's subject

We develop a Logistic regression classifier for predicting a paper's subject given the provided features.

In [None]:
# filter the node data that is in the largest connected component
node_data_gcc = node_data[node_data.paper_id.isin(list(g_nx.nodes))]
print(node_data_gcc.shape)

In [None]:
# Preview the filtered attribute table.
node_data_gcc.head()

In [None]:
# X holds the word-indicator features that come with the dataset
# (every column except the paper id and the subject)
X = node_data_gcc.drop(columns=['paper_id', 'subject']).values
# y holds the corresponding target values
y = node_data_gcc['subject'].values

### Data Splitting

We split the data into train and test sets. 

In [None]:
# 140 samples go to training and the rest to testing; the fixed random_state
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=140, test_size=None, random_state=42
)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        *(split.shape for split in (X_train, y_train, X_test, y_test))
    )
)

### Classifier Training

We train a Logistic Regression classifier on the training data. 

In [None]:
# Cross-validated logistic regression: 5 folds, 10 candidate regularisation strengths.
# `multi_class` is deliberately not passed: it is deprecated in recent scikit-learn
# releases, and the default already fits a multinomial model for multiclass targets.
clf = LogisticRegressionCV(Cs=10,
                           cv=5,
                           verbose=False,
                           max_iter=1000)
clf.fit(X_train, y_train)

In [None]:
# Classifier score on the training split and on the held-out split.
print("score on X_train {}".format(clf.score(X_train, y_train)))
print("score on X_test {}".format(clf.score(X_test, y_test)))

In [None]:
# Predicted subject for every test sample.
y_pred = clf.predict(X_test)

In [None]:
# Confusion table: rows are the actual subjects, columns the predicted ones.
# The Series names become the axis labels of the cross-tabulation.
y_actual = pd.Series(y_test, name='Actual')
# NOTE: y_pred is rebound from the raw predictions to a named Series here.
y_pred = pd.Series(y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actual, y_pred)

In [None]:
# Display the confusion table.
df_confusion

In [None]:
# Number of test samples per subject, to read the confusion table against class sizes.
# (Counter is imported with the other imports at the top of the notebook.)
Counter(y_test)

### __Question: What class is the easiest/hardest to predict?__

## Feature Engineering

Let us now develop a classification model that utilises graph-based features. We will train and evaluate a
Logistic regression model with centrality measures as the input node features.

In [None]:
# Calculate centrality measures
nx.set_node_attributes(g_nx, nx.degree_centrality(g_nx), 'degree_centrality')
nx.set_node_attributes(g_nx, nx.betweenness_centrality(g_nx), 'betweenness')
nx.set_node_attributes(g_nx, nx.closeness_centrality(g_nx), 'closeness')
nx.set_node_attributes(g_nx, nx.eigenvector_centrality(g_nx), 'eigenvector')

In [None]:
# Sanity check: the first few nodes should now carry the centrality attributes next to 'subject'.
list(g_nx.nodes(data=True))[:5]

In [None]:
# One row per node (indexed by node id), one column per node attribute.
graph_features = pd.DataFrame.from_dict(dict(g_nx.nodes(data=True)), orient='index')

In [None]:
# Preview the per-node feature table.
graph_features.head()

In [None]:
# X holds the features that are calculated from the graph
# (every node attribute except the subject)
X = graph_features.drop(columns=['subject']).values
# y holds the corresponding target values
y = graph_features['subject'].values

### Data Splitting

We split the data into train and test sets. 

In [None]:
# 140 samples go to training and the rest to testing; the fixed random_state
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=140, test_size=None, random_state=42
)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        *(split.shape for split in (X_train, y_train, X_test, y_test))
    )
)

### Classifier Training

We train a Logistic Regression classifier on the training data. 

In [None]:
# Cross-validated logistic regression: 5 folds, 10 candidate regularisation strengths.
# `multi_class` is deliberately not passed: it is deprecated in recent scikit-learn
# releases, and the default already fits a multinomial model for multiclass targets.
clf = LogisticRegressionCV(Cs=10,
                           cv=5,
                           verbose=False,
                           max_iter=1000)
clf.fit(X_train, y_train)

In [None]:
# Classifier score on the training split and on the held-out split.
print("score on X_train {}".format(clf.score(X_train, y_train)))
print("score on X_test {}".format(clf.score(X_test, y_test)))

In [None]:
# Predicted subject for every test sample.
y_pred = clf.predict(X_test)

In [None]:
# Confusion table: rows are the actual subjects, columns the predicted ones.
# The Series names become the axis labels of the cross-tabulation.
y_actual = pd.Series(y_test, name='Actual')
# NOTE: y_pred is rebound from the raw predictions to a named Series here.
y_pred = pd.Series(y_pred, name='Predicted')
df_confusion = pd.crosstab(y_actual, y_pred)

In [None]:
# Display the confusion table.
df_confusion

### Exercise: Combine the node attribute vectors with the centrality features and train a classifier. How does it perform compared to using only the node attribute vectors or only the centrality measures as features?