# Initial Setup

## Import the required libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import networkx as nx
import cudf
import cugraph

## Set the Device

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Load the Dataset

## Load the Training Data

In [3]:
trainEdgeList = []
with open('train.txt') as inputFile:
    numNodes = int(inputFile.readline())
    for line in inputFile.readlines():
        src, dst, weight = map(int, line.split())
        trainEdgeList.append([src, dst, weight])
print(len(trainEdgeList))

1113811


In [4]:
trainEdgeDF = pd.DataFrame(trainEdgeList, columns = ['from', 'to', 'weight'])
trainEdgeDF.head()

Unnamed: 0,from,to,weight
0,701683,871966,1
1,729650,974012,1
2,93571,927327,1
3,339405,563090,1
4,1003443,652581,1


## Load the computed Topological Features

In [7]:
features = pd.read_csv('features.csv')

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4


# Domain based Feature Extraction

## Extraction of Node Features from Incident Edges

In [9]:
group_from_df_to = pd.DataFrame(trainEdgeDF.groupby('from')['to'].nunique())
group_from_df_to.reset_index(inplace = True)
group_from_df_to.rename(columns = {'from':'id', 'to':'unique_retweets'}, inplace = True)
group_from_df_weight = pd.DataFrame(trainEdgeDF.groupby('from')['weight'].sum())
group_from_df_weight.reset_index(inplace = True)
group_from_df_weight.rename(columns = {'from':'id', 'weight':'total_retweets'}, inplace = True)
group_to_df_from = pd.DataFrame(trainEdgeDF.groupby('to')['from'].nunique())
group_to_df_from.reset_index(inplace = True)
group_to_df_from.rename(columns = {'to':'id', 'from':'unique_retweets'}, inplace = True)
group_to_df_weight = pd.DataFrame(trainEdgeDF.groupby('to')['weight'].sum())
group_to_df_weight.reset_index(inplace = True)
group_to_df_weight.rename(columns = {'to':'id', 'weight':'total_retweets'}, inplace = True)

In [10]:
group_from_df = group_from_df_to.merge(group_from_df_weight, on = 'id')
group_to_df = group_to_df_from.merge(group_to_df_weight, on = 'id')
group_df = pd.concat([group_from_df, group_to_df], axis = 0)
group_df

Unnamed: 0,id,unique_retweets,total_retweets
0,2,1,1
1,10,2,2
2,11,1,1
3,17,1,1
4,19,1,1
...,...,...,...
1018683,1563414,2,2
1018684,1563415,1,1
1018685,1563416,1,1
1018686,1563417,1,1


In [11]:
nodes = np.asarray([node for node in range(numNodes)])
features = pd.DataFrame(nodes, columns = ['id'])
features = features.merge(group_df, on = 'id', how = 'left')
features.head()

Unnamed: 0,id,unique_retweets,total_retweets
0,0,1.0,1.0
1,1,1.0,1.0
2,2,1.0,1.0
3,3,1.0,1.0
4,4,1.0,1.0


In [12]:
graph = cugraph.Graph()
graph.from_pandas_edgelist(pdf = trainEdgeDF, source = 'from', destination = 'to', weight = 'weight')
graph.add_nodes_from(nodes)

In [13]:
degree_centrality = cugraph.degree_centrality(graph, normalized = False).to_pandas()
degree_centrality.rename(columns = {'vertex' : 'id'}, inplace = True)
features = features.merge(degree_centrality, on = 'id', how = 'left')
features.head()

Unnamed: 0,id,unique_retweets,total_retweets,degree_centrality
0,0,1.0,1.0,2.0
1,1,1.0,1.0,2.0
2,2,1.0,1.0,2.0
3,3,1.0,1.0,2.0
4,4,1.0,1.0,2.0


In [14]:
labels, score = cugraph.louvain(graph, max_level = 500)
labels = labels.to_pandas()
labels.rename(columns = {'vertex' : 'id'}, inplace = True)
features = features.merge(labels, on = 'id', how = 'left')
features.head()

Unnamed: 0,id,unique_retweets,total_retweets,degree_centrality,partition
0,0,1.0,1.0,2.0,176587.0
1,1,1.0,1.0,2.0,75.0
2,2,1.0,1.0,2.0,166285.0
3,3,1.0,1.0,2.0,96304.0
4,4,1.0,1.0,2.0,173.0


In [15]:
features.fillna(0, inplace = True)
features.set_index(['id'], inplace = True)
features.head()

Unnamed: 0_level_0,unique_retweets,total_retweets,degree_centrality,partition
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.0,1.0,2.0,176587.0
1,1.0,1.0,2.0,75.0
2,1.0,1.0,2.0,166285.0
3,1.0,1.0,2.0,96304.0
4,1.0,1.0,2.0,173.0


In [17]:
features.to_csv('features.csv', index = False)