# Pre-processing Notebook
- Input: DBLP information (XML) for SCSE faculty staff
- Task: Select the correct network representation
- Output: A processed faculty network ready for analysis (following a dataflow methodology, this notebook should return a network)

In [21]:
import networkx as nx
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
import pickle

In [22]:
# Columns of the faculty spreadsheet that become node attributes
node_fields = ['Faculty', 'Position', 'Gender', 'Management', 'DBLP', 'Area']

# Read the spreadsheet and keep only those columns
faculty_df = pd.read_excel('Faculty.xlsx')[node_fields]

# One record per spreadsheet row, keyed by the row index:
# {row_index: {column: value}}
faculty_dict = faculty_df.to_dict('index')

# One (node, {attribute: value}) entry per attribute, emitted in column order
# for every row; the row index doubles as the node id
faculty_list = [
    (node_no, {field: record[field]})
    for node_no, record in faculty_dict.items()
    for field in node_fields
]

# New graph for the faculty network, populated with nodes only
# (entries that share a node id contribute their attribute to the same node)
faculty_graph = nx.MultiGraph()
faculty_graph.add_nodes_from(faculty_list)

In [None]:
# Inspect the (node, {attribute: value}) entries that were passed to add_nodes_from
faculty_list

In [None]:
# Persist the faculty graph as it stands here (nodes and attributes; the
# collaboration edges are added in a later cell) for easy replication
file_name = "empty_faculty_graph.pkl"

# The context manager closes the handle even if pickling raises
with open(file_name, "wb") as open_file:
    pickle.dump(faculty_graph, open_file)

In [None]:
#list(faculty_graph.nodes)

In [None]:
# Need to scrape the corresponding DBLP data, then augment the existing dataset with it
# Then decide how the augmented faculty details will be used to connect nodes and generate the network

In [None]:
# Use the previously extracted DBLP raw data to map collaborations between faculty staff

# Load the DBLP records from dblp_df.csv into a DataFrame.
# The following cells read its 'key', 'f_index' and 'Year' columns.
dblp_df = pd.read_csv('dblp_df.csv')

# Plan: group the records by article (categorical class counts per article)

# Plan: use those groups to map every article to its related faculty members

In [None]:
# Rows whose 'key' value occurs more than once in dblp_df (every occurrence is kept)
shared_key_mask = dblp_df['key'].duplicated(keep=False)
dblp_multi_df = dblp_df.loc[shared_key_mask]

# Display only: show the selection ordered by key (the sorted copy is not stored)
dblp_multi_df.sort_values(by='key')

In [None]:
# Distinct keys of the multi-row records, in order of first appearance
categorical_list = dblp_multi_df['key'].drop_duplicates().tolist()

# One [pairs, key, year] entry per key, where `pairs` holds every pair-wise
# combination of the f_index values recorded for that key (needed for networkx)
key_findex_list = []

# A single groupby pass replaces re-filtering the whole frame twice per key.
# sort=False keeps the groups in first-appearance order, i.e. the same order as
# categorical_list; rows inside a group keep their original order.
for key, records in dblp_multi_df.groupby('key', sort=False):
    # f_index values of every row that shares this key
    mappings = records['f_index'].tolist()
    # unique pair-wise combinations of those rows
    mappings_pair = list(combinations(mappings, 2))
    # the year is read from the first row of the group
    year = records['Year'].iloc[0]
    key_findex_list.append([mappings_pair, key, year])

In [None]:
# Keep only the pair list (slot 0) and the year (slot 2) of every entry.
# Each entry mixes a variable-length list, a key and a year, so the array is
# allocated with dtype=object and filled cell by cell: asking np.array to infer
# a shape from such ragged input raises ValueError on NumPy >= 1.24.
pairings = np.empty((len(key_findex_list), 2), dtype=object)
for row_no, (pair_list, _key, year) in enumerate(key_findex_list):
    pairings[row_no, 0] = pair_list
    pairings[row_no, 1] = year

In [None]:
# Keep an entry only when the two indices of its FIRST pair differ;
# the remaining pairs of the entry are not inspected at this step
distinct_paired_edges = list(
    filter(lambda entry: entry[0][0][0] != entry[0][0][1], pairings)
)

In [None]:
# Collapse repeated pairs inside every entry (going through a set, so the
# resulting pair order is arbitrary); the year is carried along unchanged
distinct_paired_edges = [
    [list(set(pair_list)), year]
    for pair_list, year in distinct_paired_edges
]

In [None]:
# Turn each [pairs, year] entry into a list of [pair, year] items,
# so that every pair carries the year of its entry
distinct_paired_edges = [
    [[pair, year] for pair in unique_pairs]
    for unique_pairs, year in distinct_paired_edges
]

In [None]:
def _canonical_pairs(pair_years):
    """Return the [pair, year] items with every pair as (smaller, larger).

    Self-pairs (both indices equal) are removed, and a pair that occurs in both
    orientations is kept once, in first-seen order.
    """
    seen = set()
    canonical = []
    for pair, year in pair_years:
        low, high = min(pair), max(pair)
        if low == high:
            continue  # a node paired with itself is not an edge
        if (low, high) not in seen:
            seen.add((low, high))
            canonical.append([(low, high), year])
    return canonical


# Removing the flipped ones, i.e. keep (0, 16) and not (16, 0).
# Pairs are reordered rather than filtered: merely dropping every pair whose
# first index is larger also discards edges that only ever occur as
# (larger, smaller), and can leave an entry with no pairs at all.
distinct_paired_edges = [_canonical_pairs(entry) for entry in distinct_paired_edges]


In [None]:
# Flatten to one [pair, pair, year] row per pair.
# Every pair of an entry is emitted (reading only entry[0] silently dropped the
# other collaborations of publications with more than two faculty authors), and
# an entry left without pairs contributes nothing instead of raising IndexError.
# The row layout is kept as-is because the next cell indexes into row[0].
distinct_paired_edges = [
    [pair, pair, year]
    for entry in distinct_paired_edges
    for pair, year in entry
]

In [None]:
# Each incoming row is [pair, pair, year]. The two endpoints are unpacked from
# the pair; the year sits in the LAST slot (index 1 is a second copy of the
# pair, so reading it would label every edge with its own endpoints).
edge_year_pairings = [
    [row[0][0], row[0][1], row[2]] for row in distinct_paired_edges
]

In [None]:
# Convert every edge record from a list into a tuple
edge_year_pairings = list(map(tuple, edge_year_pairings))

In [None]:
# Save edge_year_pairings into a pickle file for easy replication

file_name = "edge_year_pairings.pkl"

# The context manager closes the handle even if pickling raises
with open(file_name, "wb") as open_file:
    pickle.dump(edge_year_pairings, open_file)


In [None]:
# Populate the initial network: one edge per record in edge_year_pairings,
# with the record's third element stored in the edge's 'label' attribute

for source, target, edge_label in edge_year_pairings:
    faculty_graph.add_edge(source, target, label=edge_label)

In [None]:
# Save faculty_graph (nodes plus the edges added so far) into a pickle file
# for easy replication

file_name = "faculty_graph.pkl"

# The context manager closes the handle even if pickling raises
with open(file_name, "wb") as open_file:
    pickle.dump(faculty_graph, open_file)


In [None]:
def save_plot_faculty_graph():
    """Draw the notebook-level ``faculty_graph`` and save it as collab_graph.png.

    Opens a new 35x20 inch figure, draws the graph with node labels, and writes
    the figure to disk at 326 dpi. Takes no arguments and returns nothing.
    """
    # Appearance of nodes, labels and edges
    draw_options = dict(
        with_labels=True,
        font_size=10,
        node_color='red',
        font_color='white',
        edge_color='grey',
        node_size=250,
    )

    plt.figure(figsize=(35, 20))
    nx.draw(faculty_graph, **draw_options)
    plt.savefig("collab_graph.png", dpi=326)

In [None]:
# Inspect the edges currently held by faculty_graph
faculty_graph.edges()

In [None]:
# Drop every node that has no edges. The isolates are materialised into a list
# first so the graph is not modified while it is being iterated.
isolated_nodes = list(nx.isolates(faculty_graph))
faculty_graph.remove_nodes_from(isolated_nodes)

In [None]:
# Appearance of nodes, labels and edges for the connected-only plot
connected_draw_options = dict(
    with_labels=True,
    font_size=10,
    node_color='red',
    font_color='white',
    edge_color='grey',
    node_size=300,
)

# New 15x10 inch figure holding the graph (isolates were removed in the previous cell)
plt.figure(figsize=(15, 10))
nx.draw(faculty_graph, **connected_draw_options)

# Write the figure to disk
plt.savefig("collab_graph_connected_only.png", dpi=326)

# Sourav & Frens

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
import pickle
from itertools import combinations

In [2]:
# Read the faculty spreadsheet, keeping only the metadata columns
# that the following cells copy onto the graph nodes
faculty_df = pd.read_excel('Faculty.xlsx')[
    ['Faculty', 'Position', 'Gender', 'Management', 'DBLP', 'Area']
]

In [3]:
# Prepare the node table for a second graph: the existing SCSE faculty plus the
# extra authors ("1000 apostles") stored in 1000_faculty.pkl

# Retrieve 1000 apostles
# NOTE: unpickling can execute arbitrary code; only load files produced by this project
with open('1000_faculty.pkl', 'rb') as f:
    thousand_apostles_list = pickle.load(f)

# Augment the previously created faculty_df with all new names in ONE concat.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
# Only the name is known for these authors, so every other column is set to '-'.
# NOTE: re-running this cell appends the names again; re-run the cell that
# loads faculty_df first.
apostle_names = list(thousand_apostles_list)
if apostle_names:
    apostle_rows = pd.DataFrame(
        {
            column: (apostle_names if column == 'Faculty' else '-')
            for column in faculty_df.columns
        }
    )
    faculty_df = pd.concat([faculty_df, apostle_rows], ignore_index=True)

In [4]:
# One record per row of the augmented faculty table, keyed by the row index:
# {row_index: {column: value}}
faculty_dict = faculty_df.to_dict('index')

# Metadata columns copied onto each graph node
node_fields = ['Faculty', 'Position', 'Gender', 'Management', 'DBLP', 'Area']

# One (node, {attribute: value}) entry per attribute, emitted in column order
# for every row; the row index doubles as the node id
faculty_list = [
    (node_no, {field: record[field]})
    for node_no, record in faculty_dict.items()
    for field in node_fields
]

# New graph for the augmented faculty network, populated with nodes only
faculty_graph_1k = nx.MultiGraph()
faculty_graph_1k.add_nodes_from(faculty_list)

# Assign f_index (the row index, which is also the node id) to every member of
# the augmented faculty table. DataFrame.insert raises ValueError when the
# column already exists, so on a re-run the column is refreshed in place.
f_index_values = faculty_df.index.values.tolist()
if 'f_index' in faculty_df.columns:
    faculty_df['f_index'] = f_index_values
else:
    faculty_df.insert(0, 'f_index', f_index_values)

In [5]:
# Use previously extracted DBLP raw data to map collaborations between faculty staff and the 1k apostles

# Load the processed DBLP DataFrame from dblp_12k_processed_df.pkl.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('dblp_12k_processed_df.pkl', 'rb') as f:
    dblp_df = pickle.load(f)

# All 'Other Authors' values as a plain list (not referenced again in the visible cells)
unique_oa_list = dblp_df['Other Authors'].tolist()

In [6]:
# Per-record inputs: the faculty member a record belongs to and its co-author list
f_list = dblp_df['Faculty'].tolist()
oa_list = dblp_df['Other Authors'].tolist()

# Create the combined list: co-authors plus the faculty member, one list per record
combined_list = []
for faculty_name, other_authors in zip(f_list, oa_list):
    # Records whose co-author list already names the faculty member are skipped,
    # as before. NOTE(review): confirm that leaving them out is intended.
    if faculty_name not in other_authors:
        # Build a NEW list. Appending to other_authors in place would modify the
        # list object held inside dblp_df, so a second run of this cell would
        # find the faculty member in every record and produce an empty result.
        combined_list.append(list(other_authors) + [faculty_name])

# Filter out duplicate author lists, ignoring author order; the first occurrence wins
seen = set()
uc_list = []
for authors in combined_list:
    author_set = frozenset(authors)
    if author_set not in seen:
        seen.add(author_set)
        uc_list.append(authors)

In [7]:
# Lookup from faculty name to f_index for the augmented faculty table
# (when a name occurs more than once, the last row wins)
af_dict = {
    name: f_index
    for name, f_index in zip(faculty_df['Faculty'], faculty_df['f_index'])
}

In [8]:
# Translate every author list in uc_list into the f_index values of its known
# authors. Authors missing from af_dict are skipped with an explicit membership
# test, so unrelated errors are no longer silently swallowed by a bare except.
uc_edges_list = [
    [af_dict[author] for author in authors if author in af_dict]
    for authors in uc_list
]

In [9]:
# De-duplicate the index lists again, ignoring order inside each list;
# the first list seen for a given set of indices is the one that is kept
seen = set()
uuc_edges_list = []
for index_group in uc_edges_list:
    group_signature = frozenset(index_group)
    if group_signature in seen:
        continue
    seen.add(group_signature)
    uuc_edges_list.append(index_group)

In [10]:
# Generate the edges (faculty + 1k apostles) for networkx:
# every pair-wise combination inside each index group
faculty_1000_list = [list(combinations(group, 2)) for group in uuc_edges_list]

# Flatten the per-group pair lists into a single list of pairs
flat_edge_list = [pair for pairs in faculty_1000_list for pair in pairs]

# Retrieve only unique edge pairings.
# Pairs are stored as (smaller, larger) before de-duplication: (a, b) and (b, a)
# describe the same undirected edge, and a MultiGraph would otherwise hold both
# as parallel edges. A pair of identical indices is dropped so that it does not
# become a self-loop. Sorting makes the edge order reproducible.
unique_flat_edge_list = sorted(
    {(min(a, b), max(a, b)) for a, b in flat_edge_list if a != b}
)

In [11]:
# Add the pairs in unique_flat_edge_list to faculty_graph_1k as edges (no edge attributes are set)
faculty_graph_1k.add_edges_from(unique_flat_edge_list)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


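# Note: Even when extracting values from a DF into a list, any updates to the list will cascade changes back into the source DF the values came from. This happens because a column of lists stores references to Python list objects, so extracting it (e.g. with `.tolist()` or `iterrows()`) hands back those same list objects rather than copies; calling `.append()` on one of them therefore mutates the list held by the DF. Copy the list first (e.g. `list(x)` or `x + [item]`) to avoid this.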

In [None]:
# Set figure for graph (200x100 inches; saved at 50 dpi below)
plt.figure(figsize=(200, 100))

# Compute the spring layout once and hand it to nx.draw through `pos`.
# When the returned positions are discarded, nx.draw falls back to its own
# default layout and the k / iterations settings have no effect on the picture.
# (Isolated nodes are NOT removed from faculty_graph_1k in the visible cells.)
layout_positions = nx.spring_layout(faculty_graph_1k, k=0.25, iterations=20)
nx.draw(faculty_graph_1k, pos=layout_positions, with_labels=True, font_size=20,
        node_color='red', font_color='white', edge_color='grey', node_size=1500)

# Save the graph
plt.savefig("faculty_graph_1k.png", dpi=50)

In [20]:
# Persist the augmented faculty table (as CSV and as a pickle)
# and the augmented graph (as a pickle)

faculty_df.to_csv('faculty_df.csv', index=False)

pickle_targets = (
    ('faculty_1k_df.pkl', faculty_df),
    ('faculty_1k_graph.pkl', faculty_graph_1k),
)
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [17]:
# Inspect the augmented faculty table (original faculty rows followed by the '-' placeholder rows)
faculty_df

Unnamed: 0,f_index,Faculty,Position,Gender,Management,DBLP,Area
0,0,A S Madhukumar,Associate Professor,M,N,https://dblp.uni-trier.de/pers/m/Madhukumar:A=...,Computer Networks
1,1,Alexei Sourin,Associate Professor,M,N,https://dblp.org/pers/s/Sourin:Alexei.html,Computer Graphics
2,2,Anupam Chattopadhyay,Associate Professor,M,N,https://dblp.org/pers/c/Chattopadhyay:Anupam.html,Computer Architecture
3,3,Anwitaman Datta,Associate Professor,M,N,https://dblp.org/pers/d/Datta:Anwitaman.html,Distributed Systems
4,4,Arijit Khan,Assistant Professor,M,N,https://dblp.org/pers/k/Khan:Arijit.html,Data Management
...,...,...,...,...,...,...,...
1080,1080,Amitabha Das,-,-,-,-,-
1081,1081,Herty Liany,-,-,-,-,-
1082,1082,Whye Loon Tung,-,-,-,-,-
1083,1083,Virgil D. Gligor,-,-,-,-,-
