In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from common.encoding import *
from common.neural_embedding import *
from common.network_analysis import *
from common.encoding import *
from numpy.ma.core import shape

from utils.file_utils import *

### 1. Load data. ###

In [None]:
# Read the preprocessed global edge list from disk.
global_edge_table = pd.read_csv('data/processed_data/global_edge_list.csv')

# Turn every row into a plain tuple (no index column, no namedtuple wrapper).
# Downstream code labels the two fields as (followee, follower).
global_edge_list = [*global_edge_table.itertuples(index=False, name=None)]

# Report how many edges were loaded.
print(len(global_edge_list))

### 2. Train embedding neural network. ###

In [7]:
# Sweep over degree thresholds and embedding sizes: for every (k_in, k_out) pair the
# edge list is filtered and saved, then one ShallowNN per embedding dimension is
# trained and its per-user embeddings are written to disk.

# List of (k_in, k_out). The full sweep is kept here for reference.
# k_list = [(1, 1),
#           (2, 1),
#           (3, 1),
#           (5, 1),
#           (10, 1)]

k_list = [(3, 1),
          (10, 1)]

# embedding_vector_dim_list = [5, 10, 20, 30]
embedding_vector_dim_list = [5, 20]

# Training configurations.
epoch_num = 15
batch_size = 128
learning_rate = 0.001
random_seed = 42

file_directory = 'data/output_data/'

# Legacy mapping k -> loss history. Because several embedding dimensions are trained
# per k, each entry ends up holding only the LAST dimension trained for that k.
k_to_loss_history = {}

# Complete mapping (k_in, k_out, embedding_vector_dim) -> loss history, so that no
# run is overwritten and every configuration can be compared afterwards.
config_to_loss_history = {}

# NOTE(review): `device` is selected here but is never passed to ShallowNN or
# train_network in this cell, so it has no effect unless train_network picks a
# device on its own -- TODO confirm and wire it through if GPU/MPS training is wanted.
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

for k in k_list:
    k_in, k_out = k

    # Filter the global edge list with the current thresholds and persist the result.
    filtered_edge_list = remove_low_degree_nodes(global_edge_list, k_in, k_out)
    filtered_edge_list_df = pd.DataFrame(filtered_edge_list, columns=['followee', 'follower'])
    edge_file_name = f'global_edge_list_kin{k_in}_kout{k_out}.csv'
    save_to_csv(file_directory, edge_file_name, filtered_edge_list_df)

    # Obtain congress_map and followee_map; presumably dictionaries mapping usernames
    # to integer indices (build_dict is defined elsewhere -- verify there).
    congress_map, followee_map = build_dict(filtered_edge_list)

    # These values depend only on k, so compute them once per k rather than once per
    # embedding dimension.
    # network_input_dim  = congress_map_length
    # network_hidden_dim = embedding_vector_dim
    # network_output_dim = followee_map_length
    congress_map_length = len(congress_map)
    followee_map_length = len(followee_map)

    # Second field of every edge, i.e. the column labelled 'follower' above.
    # A congress list is needed to determine one hot vector length.
    congress_list = [item[1] for item in filtered_edge_list]

    for embedding_vector_dim in embedding_vector_dim_list:
        print(f'Input dimension is {congress_map_length}.')
        print(f'Hidden dimension is {embedding_vector_dim}.')
        print(f'Output dimension is {followee_map_length}.')

        print(f'Training NN with k_in >={k_in}, kout >={k_out} and embedding_vector_dim = {embedding_vector_dim} ...')

        # Re-seed torch before each run so every configuration starts from the same
        # RNG state regardless of which other configurations were trained first.
        # NOTE(review): train_network may draw from other RNGs (random / numpy) for
        # shuffling -- seed those as well if it does.
        torch.manual_seed(random_seed)

        # Build NN.
        shallow_nn = ShallowNN(congress_map_length, embedding_vector_dim, followee_map_length)

        # Training configurations.
        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.Adam(shallow_nn.parameters(), lr=learning_rate)

        # Train NN and get loss value of every epoch.
        loss_history = train_network(shallow_nn, epoch_num, batch_size, loss_function,
                                     optimizer, filtered_edge_list, congress_map, followee_map)
        k_to_loss_history[k] = loss_history
        config_to_loss_history[(k_in, k_out, embedding_vector_dim)] = loss_history

        # Get weight matrix and bias of the first layer. Both are detached so the
        # embeddings computed from them carry no autograd state into the saved file
        # (previously only the weight was detached, via `.data`).
        input_to_hidden_weights = shallow_nn.linear_stack[0].weight.detach()
        input_to_hidden_bias = shallow_nn.linear_stack[0].bias.detach()

        # Get congress-to-embedding dict and save.
        _, congress_to_vector_dict = one_hot_encode(congress_list, congress_map)
        user_to_embedding_dict = calculate_embeddings(congress_to_vector_dict, input_to_hidden_weights, input_to_hidden_bias)

        embedding_file_name = f'global_embedding_kin{k_in}_kout{k_out}_dim{embedding_vector_dim}.pth'
        save_to_pth(file_directory, embedding_file_name, user_to_embedding_dict)

Input dimension is 164.
Hidden dimension is 5.
Output dimension is 15661.
Training NN with k_in >=3, kout >=1 and embedding_vector_dim = 5 ...
Epoch [1/15], Loss: 9.6582
Epoch [2/15], Loss: 9.6387
Epoch [3/15], Loss: 9.5567
Epoch [4/15], Loss: 9.4229
Epoch [5/15], Loss: 9.2947
Epoch [6/15], Loss: 9.1944
Epoch [7/15], Loss: 9.1170
Epoch [8/15], Loss: 9.0561
Epoch [9/15], Loss: 9.0070
Epoch [10/15], Loss: 8.9661
Epoch [11/15], Loss: 8.9312
Epoch [12/15], Loss: 8.9009
Epoch [13/15], Loss: 8.8741
Epoch [14/15], Loss: 8.8501
Epoch [15/15], Loss: 8.8283
Input dimension is 164.
Hidden dimension is 20.
Output dimension is 15661.
Training NN with k_in >=3, kout >=1 and embedding_vector_dim = 20 ...
Epoch [1/15], Loss: 9.6564
Epoch [2/15], Loss: 9.5846
Epoch [3/15], Loss: 9.3504
Epoch [4/15], Loss: 9.1197
Epoch [5/15], Loss: 8.9705
Epoch [6/15], Loss: 8.8636
Epoch [7/15], Loss: 8.7803
Epoch [8/15], Loss: 8.7099
Epoch [9/15], Loss: 8.6465
Epoch [10/15], Loss: 8.5873
Epoch [11/15], Loss: 8.5312
Ep

### 3. Training results analysis. ###
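**TODO:** Plot the training results next: compare different k_in values at a fixed embedding-vector dimension, and compare different embedding-vector dimensions at a fixed k_in. Put each comparison in a single figure; do the fixed-k_in comparisons first, then the fixed-embedding-dimension comparisons.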

In [17]:
# NOTE(review): disabled legacy cell, kept for reference only. It reads the first layer
# as `shallow_nn.fc1`, whereas the training cell accesses `shallow_nn.linear_stack[0]`,
# so the attribute name must be updated before this is re-enabled. Every line below is
# commented out, so the output stored under this cell cannot come from the cell as written.
#
# input_to_hidden_weights = shallow_nn.fc1.weight.data
# print('The shape of input-to-hidden weight matrix is ' + str(input_to_hidden_weights.shape) + '.')
# 
# input_to_hidden_bias = shallow_nn.fc1.bias.data
# print('The shape of input-to-hidden bias is ' + str(input_to_hidden_bias.shape) + '.')
# 
# input_to_hidden_weights_df = pd.DataFrame(input_to_hidden_weights)
# input_to_hidden_bias_df = pd.DataFrame(input_to_hidden_bias)
# 
# save_to_csv('data/output_data/', 'input_to_hidden_weights.csv', input_to_hidden_weights_df)
# save_to_csv('data/output_data/', 'input_to_hidden_bias.csv', input_to_hidden_bias_df)

The shape of input-to-hidden weight matrix is torch.Size([20, 164]).
The shape of input-to-hidden bias is torch.Size([20]).


### 4. Obtain embedding vectors. ###

In [18]:
# NOTE(review): disabled legacy cell, kept for reference only. It depends on
# `input_to_hidden_weights_df` / `input_to_hidden_bias_df`, which are only created in the
# (also commented-out) cell of section 3, so it cannot run on a fresh kernel as written.
# The training cell already derives embeddings via `calculate_embeddings` and stores them
# with `save_to_pth`. Every line below is commented out, so the output stored under this
# cell cannot come from the cell as written.
#
# # Read congress member info
# congress_member_info = pd.read_csv('data/raw_data/all_congress_members.csv')
# congress_list = congress_member_info['twitter_name'].tolist()
# 
# weights = torch.tensor(input_to_hidden_weights_df.values).float()
# bias = torch.tensor(input_to_hidden_bias_df.values).float()
# 
# congress_vectors, congress_to_vector_dict = one_hot_encode(congress_list, congress_map)
# 
# congress_to_embedding_dict = {}
# 
# # Throw this to encoding.py, as it can be seen as an encoding manipulation.
# for congress_member in congress_list:
#     if congress_member not in congress_to_vector_dict:
#         continue
#     
#     vector = congress_to_vector_dict[congress_member]
#     vector = torch.tensor(vector).float()
#     vector = vector.unsqueeze(0)
#     embedding_vector = torch.matmul(vector, weights.t()) + bias.squeeze()
#     embedding_vector = embedding_vector.squeeze()
#     
#     congress_member_info.loc[congress_member_info['twitter_name'] == congress_member, 'embedding_vector'] = embedding_vector.tolist()
#     
# congress_one_hot_vectors = torch.stack(congress_vectors)
# embedding_vectors = torch.matmul(congress_one_hot_vectors, weights.t()) + bias.squeeze()
# 
# embedding_vectors_df = pd.DataFrame(embedding_vectors)
# save_to_csv('data/output_data/', 'embedding_vectors.csv', embedding_vectors_df)
# 
# print('The size of embedding vectors is ' + str(embedding_vectors_df.shape) + '.')

The size of embedding vectors is (164, 20).
