**Company** : <br>
Tech Firm

**Notebook Function** : <br>
    This notebook generates the local clustering measures
    
**Output File(s)** : <br>
    tech_network_embed.csv - The final output file containing the network embeddedness measures

**Author(s)** : <br>
Lara Yang, Sarayu Anshuman

Install packages and import libraries

In [None]:
pip install cdlib

In [None]:
pip install igraph leidenalg

In [None]:
import os
import sys
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
from mittens import Mittens
import csv
from operator import itemgetter
import ujson as json
import re
from gensim.matutils import cossim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from statistics import mean 
from sklearn.decomposition import PCA
from collections import Counter
from datetime import timedelta
import multiprocessing
import random
from ast import literal_eval
import networkx.algorithms.community as nx_comm
from cdlib import algorithms

Define the helper functions used by the rest of the notebook

In [None]:
#########################################################################
########### Helper Functions for Generating Mittens Embeddings ##########
#########################################################################
def _window_based_iterator(toks, window_size, weighting_function):
    """
    Yields weighted co-occurrence triples for every token of one document.
    Parameters
    ----------
    toks : list of str
        Tokens of a single document, in order
    window_size : int
        Number of tokens considered on each side of the focal word
    weighting_function : function from ints to floats
        Called with the distance (in tokens, always >= 1) between the focal word
        and the context word. The focal word paired with itself always gets weight 1.
    Yields
    ------
    tuple of (str, str, float)
        Focal word, context word, and the weight of that co-occurrence
    """
    n_toks = len(toks)
    for i, w in enumerate(toks):
        yield w, w, 1
        left = max(0, i - window_size)
        right = min(i + 1 + window_size, n_toks)
        # Weight by distance from the focal word, not by absolute position in
        # the document; otherwise the weight of a pair depends on where in the
        # email it occurs and (w1, w2) / (w2, w1) receive different weights.
        for x in range(left, i):
            yield w, toks[x], weighting_function(i - x)
        for x in range(i + 1, right):
            yield w, toks[x], weighting_function(x - i)

def glove2dict(glove_filename):
    """
    Loads a space-delimited GloVe-style vector file into memory.
    Parameters
    ----------
    glove_filename : str
        Path of the file holding one word followed by its vector components per line
    Returns
    -------
    data : dict
        Maps each word to a numpy array of its vector components.
        If a word appears on several lines, the last line wins.
    """
    word2vector = {}
    with open(glove_filename) as handle:
        rows = csv.reader(handle, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in rows:
            components = [float(value) for value in row[1:]]
            word2vector[row[0]] = np.array(components)
    return word2vector

# Inspired by original build_weighted_matrix in utils.py in the Mittens paper source codebase
def build_weighted_matrix(emails,
        mincount=300, vocab_size=None, window_size=10,
        weighting_function=lambda x: 1 / (x + 1)):
    """
    Builds a count matrix based on a co-occurrence window of
    `window_size` elements before and `window_size` elements after the
    focal word, where the counts are weighted by `weighting_function`.
    Parameters
    ----------
    emails : list of dicts
        Emails converted from JSON formats
    mincount : int
        Only words with at least this many tokens will be included.
    vocab_size : int or None
        If this is an int above 0, then, the top `vocab_size` words
        by frequency are included in the matrix, and `mincount`
        is ignored.
    window_size : int
        Size of the window before and after. (So the total window size
        is 2 times this value, with the focal word at the center.)
    weighting_function : function from ints to floats
        Passed through unchanged to `_window_based_iterator`, which decides
        the weight of every (focal word, context word) pair.
    Returns
    -------
    X : pd.DataFrame
        Co-occurrence matrix indexed (rows and columns) by the sorted vocabulary.
        It is symmetric whenever `_window_based_iterator` gives a pair the same
        weight in both directions. An empty DataFrame is returned when there
        are no emails to read.
    """
    corpus = read_corpus(emails, sentence_delim=False)
    # read_corpus always returns a list, so test for emptiness rather than None
    if not corpus:
        print("These emails are empty\t{}.\n".format(str(emails)))
        return pd.DataFrame()

    # Raw token frequencies, used only to choose the vocabulary
    wc = defaultdict(int)
    for toks in corpus:
        for tok in toks:
            wc[tok] += 1

    if vocab_size:
        srt = sorted(wc.items(), key=itemgetter(1), reverse=True)
        vocab_set = {w for w, c in srt[: vocab_size]}
    else:
        vocab_set = {w for w, c in wc.items() if c >= mincount}
    vocab = sorted(vocab_set)
    n_words = len(vocab)

    # Weighted counts, restricted to pairs where both words are in the vocabulary
    counts = defaultdict(float)
    for toks in corpus:
        window_iter = _window_based_iterator(toks, window_size, weighting_function)
        for w, w_c, val in window_iter:
            if w in vocab_set and w_c in vocab_set:
                counts[(w, w_c)] += val

    # Fill only the observed cells: iterating every vocabulary pair is O(V^2)
    # in pure Python and would also insert a zero entry into `counts` for
    # each pair that never co-occurred.
    word2index = {w: i for i, w in enumerate(vocab)}
    X = np.zeros((n_words, n_words))
    for (w, w_c), val in counts.items():
        X[word2index[w], word2index[w_c]] = val
    X = pd.DataFrame(X, columns=vocab, index=pd.Index(vocab))
    return X

def read_corpus(emails, sentence_delim=False):
    """
    Tokenizes email bodies on whitespace.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats; each must have a 'body' string
    sentence_delim : bool, optional
        If true, every sentence (delimited by newlines and the SENT_END marker)
        becomes its own token list so that co-occurrences never span sentences.
    Returns
    -------
    list of list of str
        If sentence_delim is false, one token list per email with newlines and
        SENT_END markers removed.
        If sentence_delim is true, one token list per non-empty sentence.
    """
    if sentence_delim:
        sentences = []
        for email in emails:
            for line in email['body'].split('\n'):
                for sent in line.split('SENT_END'):
                    if sent:
                        sentences.append(sent.strip().split())
        return sentences

    documents = []
    for email in emails:
        flattened = email['body'].replace('\n', ' ').replace("SENT_END", "")
        documents.append(flattened.strip().split())
    return documents

def output_embeddings(mittens_df, filename, compress=False):
    """
    Writes embeddings to disk as space-separated text without a header row.
    Parameters
    ----------
    mittens_df : pd.DataFrame
        Embeddings to write; the index is written as the first field of each line
    filename : str
        Destination path; '.gz' is appended when compress is true
    compress : bool, optional
        If true, the output is gzip-compressed
    """
    target = filename + '.gz' if compress else filename
    # 'infer' is pandas' own default, i.e. what the uncompressed call relies on
    compression = 'gzip' if compress else 'infer'
    mittens_df.to_csv(target, quoting=csv.QUOTE_NONE, header=False, sep=" ", compression=compression)

def isnull_wrapper(x):
    """
    Reports whether x is null, or contains any null element if x is array-like.
    pd.isnull returns a plain bool for scalars and a boolean array otherwise;
    the array case is collapsed with any().
    """
    flags = pd.isnull(x)
    return flags if isinstance(flags, bool) else flags.any()

def cossim_with_none(vec1, vec2, vec_format='sparse'):
    """
    Cosine similarity that returns None instead of erroring out on missing vectors.
    Parameters
    ----------
    vec1 : list of (int, float) in gensim sparse vector format, or numpy array
    vec2 : list of (int, float) in gensim sparse vector format, or numpy array
    vec_format : str, optional
        Either 'sparse' or 'dense'. If sparse, vec1 and vec2 are in gensim sparse vector format
        and gensim's cossim is used. If dense, vec1 and vec2 are numpy arrays and the cosine
        similarity is computed by hand.
    Returns
    -------
    float or None
        Cosine similarity between vec1 and vec2; None if either vector is null
        (per isnull_wrapper) or, in dense mode, has length zero
    Raises
    ------
    ValueError
        If vec_format is neither 'sparse' nor 'dense' (only checked when both vectors are non-null)
    """
    if isnull_wrapper(vec1) or isnull_wrapper(vec2):
        return None
    if vec_format == 'sparse':
        return cossim(vec1, vec2)
    if vec_format != 'dense':
        raise ValueError()
    if len(vec1) == 0 or len(vec2) == 0:
        return None
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2)/norm_product

#########################################################################
############# Helper Functions for Working with JSON Emails #############
#########################################################################

def slice_user_corpus(emails, train_mode):
    """
    Groups emails by the time period(s) selected through train_mode.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats to dictionaries
    train_mode : str
        One of 'annual', 'quarterly', 'halfyearly', 'thirdyearly', 'all'.
        'all' files every email under both its year and its quarter.
        Any other value yields an empty result.
    Returns
    -------
    timekey2emails : defaultdict of list
        Matches each time key to the emails that fall into it
    """
    # Email fields that supply the time key(s) for each mode
    mode2fields = {
        'annual': ('year',),
        'quarterly': ('quarter',),
        'halfyearly': ('halfyear',),
        'thirdyearly': ('thirdyear',),  # a third of a year, i.e. 4 months
        'all': ('year', 'quarter'),
    }
    fields = mode2fields.get(train_mode, ())
    timekey2emails = defaultdict(list)
    for email in emails:
        for field in fields:
            timekey2emails[email[field]].append(email)
    return timekey2emails

#########################################################################
############# Helper Functions for Working with Date Objects ############
#########################################################################
def str_to_datetime(date):
    """
    Parses an email date string into a datetime.
    A parenthesized upper-case timezone abbreviation such as "(UTC)" is stripped
    before parsing. Every known format is tried; if several match, the last
    matching format wins. Returns None if date is None or no format matches.
    """
    if date is None:
        return None
    formats = (
        '%a, %d %b %Y %H:%M:%S %z',
        '%d %b %Y %H:%M:%S %z',
        '%a %d, %b %Y %H:%M:%S %z',
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y %H:%M:%S (%Z)',
        '%Y-%m-%d %H:%M:%S%z',
    )
    cleaned = re.sub(r"(\([A-Z]{3,}\))", "", date).strip()
    parsed = None
    for fmt in formats:
        try:
            candidate = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        parsed = candidate
    return parsed

def to_quarter(date, format):
    """
    Returns the quarter of date as a str such as '2020Q2'.
    Parameters
    ----------
    date : str or datetime or None
    format : str
        'str' if date is a date string understood by str_to_datetime,
        'datetime' if date is already a datetime object.
        Any other value leaves year and month at 0, producing '0Q0'.
    Returns
    -------
    str or None
        None if date is None or a string date cannot be parsed.
    If both to_quarter and to_year are needed, convert the string to a datetime once
    and call both with format 'datetime' to avoid parsing the string twice.
    """
    if date is None:
        return None
    if format == 'str':
        parsed = str_to_datetime(date)
        if parsed is None:
            return None
        year, month = parsed.year, parsed.month
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    quarter = (int(month) - 1) // 3 + 1
    return '{}Q{}'.format(year, quarter)

def to_year(date, date_type):
    """
    Returns the year of date as a str using date formats used in tech firm.
    Parameters
    ----------
    date : str or datetime or None
    date_type : str
        'str' if date is a date string, 'datetime' if it is a datetime object
    Returns
    -------
    str or None
        None if date is None, the string cannot be parsed, or date_type is unknown.
    If both to_quarter and to_year are needed, convert the string to a datetime once
    and call both with 'datetime' to avoid parsing the string twice.
    """
    if date is None:
        return None
    if date_type == 'datetime':
        return str(date.year)
    if date_type != 'str':
        return None

    # Same formats that str_to_datetime accepts, so that a string for which
    # to_quarter / to_halfyear produce a time key also produces a year here.
    formats = (
        '%a, %d %b %Y %H:%M:%S %z',
        '%d %b %Y %H:%M:%S %z',
        '%a %d, %b %Y %H:%M:%S %z',
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y %H:%M:%S (%Z)',
        '%Y-%m-%d %H:%M:%S%z',
    )
    # Drop a trailing parenthesized timezone abbreviation such as "(UTC)"
    cleaned = re.sub(r"\([A-Z]{3,}\)", "", date).strip()
    dt = None
    for fmt in formats:
        try:
            dt = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    if dt is None:
        return None
    return str(dt.year)

def to_halfyear(date, format):
    """
    Returns the half-year of date as a str such as '2020HY1' (January-June) or '2020HY2'.
    Parameters
    ----------
    date : str or datetime or None
    format : str
        'str' if date is a date string understood by str_to_datetime,
        'datetime' if date is already a datetime object.
        Any other value leaves year and month at 0, producing '0HY0'.
    Returns
    -------
    str or None
        None if date is None or a string date cannot be parsed.
    """
    if date is None:
        return None
    if format == 'str':
        parsed = str_to_datetime(date)
        if parsed is None:
            return None
        year, month = parsed.year, parsed.month
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    halfyear = (int(month) - 1) // 6 + 1
    return '{}HY{}'.format(year, halfyear)

def to_thirdyear(date, format):
    """
    Returns the third of the year (4-month period) of date as a str such as '2020TH2'.
    Parameters
    ----------
    date : str or datetime or None
    format : str
        'str' if date is a string whose characters 0-3 are the year and 5-6 the
        month (e.g. 'YYYY-MM-...'), 'datetime' if date is a datetime object.
        Any other value leaves year and month at 0, producing '0TH0'.
    Returns
    -------
    str or None
        None if date is None.
    """
    if date is None:
        return None
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    thirdyear = (int(month) - 1) // 4 + 1
    return '{}TH{}'.format(year, thirdyear)

def word_similarity(model, w1, w2):
    """
    Compares one word or word list to another word or word list.
    Both arguments are normalized to lists and words missing from the model's
    vocabulary are dropped before delegating to model.n_similarity. According
    to the original authors, n_similarity averages the vectors on each side
    and returns the cosine similarity of the two means, which for two single
    words equals similarity(w1, w2).
    Parameters
    ----------
    model : KeyedVectors
        The model that contains all the words and vectors
    w1 : str or list
        The first word or word list to be compared
    w2 : str or list
        The second word or word list to be compared
    Returns
    -------
    float or None
        Cosine similarity between w1 and w2; None if either side has no word
        in the model's vocabulary
    """
    def in_vocabulary(words):
        candidates = words if isinstance(words, list) else [words]
        return [w for w in candidates if w in model.key_to_index]

    left = in_vocabulary(w1)
    right = in_vocabulary(w2)
    if not left or not right:
        return None
    return model.n_similarity(left, right)

#########################################################################
############## Helper Functions for Working with Dataframes #############
#########################################################################
def dict_to_df(index2rows, cols, index_name):
    """
    Converts a dictionary of rows into a dataframe sorted by its index.
    Parameters
    ----------
    index2rows : dict
        Dictionary mapping index values (tuples for a multi-level index) to rows
    cols : list
        List of column names of type str
    index_name : list
        List of index level names; a single name produces a flat index,
        several names produce a MultiIndex
    Returns
    -------
    df : pd.DataFrame or None
        Constructed dataframe, or None if index2rows is None or empty
    """
    if index2rows is None or len(index2rows) == 0:
        return None
    frame = pd.DataFrame.from_dict(index2rows, orient='index', columns=cols)
    if len(index_name) == 1:
        frame.index.name = index_name[0]
    else:
        # Re-wrap the frame on a MultiIndex carrying the requested level names
        multi_index = pd.MultiIndex.from_tuples(frame.index, names=index_name)
        frame = pd.DataFrame(frame, multi_index)
    return frame.sort_index(axis=0)

def get_recipients(msg):
    """
    Returns the set of recipients ('to', 'cc' and 'bcc' combined) of the current message.
    The sender is removed from the set if present among the recipients.
    Tech firm's from fields are all lists based on visual inspection,
    but the type of the field is checked just in case; for a list, its first
    element is taken as the sender.
    """
    origin = msg['from']
    sender = origin[0] if type(origin) is list else origin
    addressed = msg.get('to', []) + msg.get('cc', []) + msg.get('bcc', [])
    return set(addressed) - {sender}

def is_internal_msg(msg):
    """
    Returns True if every recipient of msg starts with a digit, False otherwise.
    A message without recipients counts as internal.
    NOTE(review): presumably internal users are identified by numeric IDs;
    only the start of each recipient string is checked - confirm this is intended.
    """
    return all(re.match(r'\d+', r) for r in get_recipients(msg))
    
def extract_variables_from_file(file):
    """
    Extract relevant information from name of embedding file, with format: {}(_{})?.txt
    The last four characters (the extension) are dropped and the rest is split on '_'.
    Returns (usr, time_key); time_key is None unless the name has exactly two chunks.
    """
    stem = file[:-4]
    chunks = stem.split('_')
    if len(chunks) == 2:
        return (chunks[0], chunks[1])
    return (chunks[0], None)

def month2timekey(month, time_key):
    """
    Converts month numbers to actual year or quarter
    Parameters
    ----------
    month : int
        Month counter; the year is 2006 + month // 12, plus one when month % 12 > 2
    time_key : str
        'year' or 'quarter'; any other value returns an empty string
    Returns
    -------
    str
        Year such as '2007', or year plus quarter such as '2007Q1'.
        Remainders 0-2 map to Q4, 3-5 to Q1, 6-8 to Q2 and 9-11 to Q3.
    """
    if time_key not in ('year', 'quarter'):
        return ''
    remainder = month % 12
    year = 2006 + month // 12
    if remainder > 2:
        year += 1
    if time_key == 'year':
        return str(year)

    quarter = 'Q3'
    for upper_bound, label in ((2, 'Q4'), (5, 'Q1'), (8, 'Q2')):
        if remainder <= upper_bound:
            quarter = label
            break
    return str(year) + quarter

def extract_hr_df(hr_file, time_key=None):
    """
    Extract and preprocess tech HR data
    Parameters
    ----------
    hr_file : str
        Path of the HR csv file; its first column is read as the index
    time_key : str or None, optional
        If falsy, one row per employee (the last observation) is returned.
        Otherwise a time-key column named `time_key` is derived from 'month' via
        month2timekey (so 'year' or 'quarter'), and one row per employee and
        time key is returned.
    Returns
    -------
    hr : pd.DataFrame
        Indexed by 'anon_id', or by ('anon_id', time_key) when time_key is given
    """
    attribute_cols = ['hire_month', 'sales', 'marketing', 'tech', 'vol_exit_event', 'invol_exit_event',
                      'manager', 'female', 'fav_rating', 'bonus', 'bonus_eligible', 'age', 'age2',
                      'cumbonus', 'tenure_months', 'tenure_days']

    hr = pd.read_csv(hr_file, index_col=0)
    # spell is a counter of the number of observations, not tenure: the row
    # with spell == 1 carries the hire month, which is forward-filled to the
    # rows that follow. Series.ffill replaces the deprecated
    # fillna(method='ffill'), and the column arithmetic replaces row-wise
    # apply calls that misbehave on an empty frame.
    hr['hire_month'] = hr['month'].where(hr['spell'] == 1).ffill()
    hr['tenure_months'] = hr['month'] - hr['hire_month'] + 1
    hr['tenure_days'] = hr['tenure_months'] * 365 / 12
    hr['employeeid'] = hr['employeeid'].astype(str)
    hr['bonus_eligible'] = hr['bonus_eligible'].astype(int)

    if not time_key:
        key_cols = ['employeeid']
    else:
        key_cols = ['employeeid', time_key]
        hr[time_key] = hr['month'].apply(lambda month: month2timekey(month, time_key))
        # Collapse the monthly observations of each employee-period: bonuses
        # add up, event and status flags take their maximum over the period.
        hr['bonus'] = hr.groupby(key_cols)['bonus'].transform('sum')
        for col in ('vol_exit_event', 'invol_exit_event', 'manager', 'fav_rating', 'bonus_eligible'):
            hr[col] = hr.groupby(key_cols)[col].transform('max')

    hr = hr.drop_duplicates(subset=key_cols, keep='last')
    hr = hr[key_cols + attribute_cols].rename(columns={"employeeid": "anon_id"})
    index_cols = ['anon_id'] + key_cols[1:]
    return hr.set_index(index_cols)

Set the current directory

In [None]:
# Remember the directory the notebook was started from.
import os
current_dir = os.getcwd()
# Bare expression: displays the value as the cell output when run in a notebook
current_dir

Set the input and output directories

In [None]:
# Run configuration. The code that consumes most of these settings is not part
# of this section, so the notes below are hedged where the usage is not visible.
num_cores = 9  # presumably the number of worker processes - TODO confirm against the code that uses it
build_threshold_network = False  # NOTE(review): looks like a switch for a thresholded network variant - confirm
weighted_mode = 'unweighted'  # NOTE(review): presumably selects weighted vs. unweighted measures - confirm
quarter_colname = 'quarter'
home_dir = current_dir  # relies on the previous cell having set current_dir
# Directory listed by get_quarterly_edges; each file in it is loaded as a JSON list of emails
corpus_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/tech/training/email_data_v2"
output_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/downsampled_network_files/tech"
test_dir = os.path.join(home_dir, "idtf_output_data_test")
hr_file = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/tech/cossim/tech_ph2_data_for_analysis.csv"

# Echo the configured locations so a wrong path is noticed before the long computation
print(corpus_dir)
print(output_dir)
print(hr_file)

Run functions to generate the local clustering measure, along with other variations

In [None]:
def get_quarterly_edges(test_mode):
    """
    Reads all network connections from emails. Emails processed in this function are internal emails only, as mittens1_generate_corpus.py
    have filtered out all emails addressed to an external party.
    Parameter
    ---------
    test_mode : bool
        If true, restrict the number of emails read to those of a random 10% of user files
    Returns
    -------
    quarterly_edges : dict
        A dictionary mapping quarters to edges, where quarters are represented by strings and each edge is represented as a 2-tuple (frm, to)
    """
    usr_files = os.listdir(corpus_dir)
    if test_mode:
        # Test on ten percent of all users. Sample WITHOUT replacement: drawing
        # with replacement can pick the same file twice, which would count
        # every one of that user's edges twice.
        usr_files = random.sample(usr_files, len(usr_files) // 10)
    quarterly_edges = defaultdict(list)

    for filename in usr_files:
        with open(os.path.join(corpus_dir, filename), encoding='utf-8') as f:
            emails = json.load(f)
        for e in emails:
            if not is_internal_msg(e):
                continue
            quarter = e['quarter']
            if quarter is None:
                continue
            frm = e['from']
            if isinstance(frm, list):
                frm = frm[0]
            curr_edges = []
            for r in get_recipients(e):
                # get_recipients already drops the sender; this guards the invariant
                assert frm != r, "Creating self-loop!"
                curr_edges.append((frm, r))
            quarterly_edges[quarter].extend(curr_edges)
    return quarterly_edges

def generate_network_embeddedness(G_directed, weighted, edges2weights):
    """
    Generate embeddedness of one's network as E-I style indices.
    Every index is (external - internal) / (external + internal), where a tie is
    internal if it points into ego's local network (the nodes ego has an edge to).
    A value of 1 means no within-network ties, -1 means all ties are within-network.
    Parameter
    ---------
    G_directed : NetworkX graph
        Only iteration over nodes and G_directed.neighbors(node) are used
    weighted : bool
        Whether to engage in weighted computations
    edges2weights : dict of {tuple : int}
        Maps directed edges to weights; it is looked up for ego-alter and
        alter-alter's-alter edges even when weighted is False
    Returns
    -------
    list of three defaultdict
        Embeddedness per node at ego, alter, and alter's alter levels.
        Nodes without usable ties get np.nan; unknown nodes default to None.
    """
    def tie_weight(src, dst):
        return edges2weights[src, dst] if weighted else 1

    def has_no_alters(node):
        return len(list(G_directed.neighbors(node))) == 0

    def ei_of(node, ego_network):
        # E-I index of node's outgoing ties relative to ego's local network
        inside, outside = 0, 0
        for target in G_directed.neighbors(node):
            step = tie_weight(node, target)
            if target in ego_network:
                inside += step
            else:
                outside += step
        return (outside - inside) / (outside + inside)

    def combine(ei_by_node, weight_by_node):
        # Plain mean when unweighted; otherwise each index is scaled by the
        # node's share of the total tie weight in weight_by_node
        if not weighted:
            return sum(ei_by_node.values()) / len(ei_by_node)
        total_weight = sum(weight_by_node.values())
        combined = 0
        for node, ei in ei_by_node.items():
            combined += ((weight_by_node[node] / total_weight) * ei)
        return combined

    ego_level = defaultdict(lambda : None)
    alter_level = defaultdict(lambda : None)
    alter2_level = defaultdict(lambda : None)

    # ego level measure: pool all ties of all alters
    for ego in G_directed:
        # As G is a directed network, ego's network only includes those who ego has sent an email to
        ego_network = set(G_directed.neighbors(ego))
        inside, outside = 0, 0
        for alter in G_directed.neighbors(ego):
            for target in G_directed.neighbors(alter):
                step = tie_weight(alter, target)
                if target in ego_network:
                    inside += step
                else:
                    outside += step
        total = outside + inside
        ego_level[ego] = np.nan if total == 0 else (outside - inside) / total

    # alter level measure: one index per alter, then combined
    for ego in G_directed:
        ego_network = set(G_directed.neighbors(ego))
        alter_ei, alter_weight = {}, {}
        for alter in G_directed.neighbors(ego):
            alter_weight[alter] = edges2weights[ego, alter]
            # skip any alter who doesn't have an alter
            if has_no_alters(alter):
                continue
            alter_ei[alter] = ei_of(alter, ego_network)
        alter_level[ego] = combine(alter_ei, alter_weight) if alter_ei else np.nan

    # alter alter level measure: one index per alter's alter, combined per alter, then per ego
    for ego in G_directed:
        ego_network = set(G_directed.neighbors(ego))
        alter_ei, alter_weight = {}, {}
        for alter in G_directed.neighbors(ego):
            alter_weight[alter] = edges2weights[ego, alter]
            if has_no_alters(alter):
                continue
            second_ei, second_weight = {}, {}
            for second in G_directed.neighbors(alter):
                second_weight[second] = edges2weights[alter, second]
                if has_no_alters(second):
                    continue
                second_ei[second] = ei_of(second, ego_network)
            if second_ei:
                alter_ei[alter] = combine(second_ei, second_weight)
        alter2_level[ego] = combine(alter_ei, alter_weight) if alter_ei else np.nan

    return [ego_level, alter_level, alter2_level]

def generate_community_ei(G_directed, node2community, weighted, edges2weights):
    """
    Calculate EI index based on community structure.
    Every index is (external - internal) / (external + internal), where a tie is
    internal if it points into ego's local network (the nodes ego has an edge to)
    or to a node in ego's community. The "filter" variants only take alters
    that belong to ego's own community into account.
    Parameter
    ---------
    G_directed : NetworkX graph
        Only iteration over nodes and G_directed.neighbors(node) are used
    node2community : dict of {str : int}
        A dictionary mapping user IDs to integers that represent distinct communities
    weighted : bool
        Whether to engage in weighted computations
    edges2weights : dict of {tuple : int}
        Maps directed edges to weights; it is looked up for ego-alter and
        alter-alter's-alter edges even when weighted is False
    Returns
    -------
    list of six defaultdict
        [ego, ego filtered, alter, alter filtered, alter's alter, alter's alter filtered].
        Nodes without usable ties get np.nan; unknown nodes default to None.
    """
    def tie_weight(src, dst):
        return edges2weights[src, dst] if weighted else 1

    def has_no_alters(node):
        return len(list(G_directed.neighbors(node))) == 0

    def ei_of(node, ego_network, ego_community):
        # E-I index of node's outgoing ties relative to ego's network and community
        inside, outside = 0, 0
        for target in G_directed.neighbors(node):
            step = tie_weight(node, target)
            if target in ego_network or node2community[target] == ego_community:
                inside += step
            else:
                outside += step
        return (outside - inside) / (outside + inside)

    def combine(ei_by_node, weight_by_node):
        # Plain mean over ei_by_node when unweighted; otherwise each index is
        # scaled by the node's share of the total weight in weight_by_node.
        # For the filtered variants weight_by_node still covers ALL alters, so
        # shares are relative to total traffic; weighting by traffic to
        # same-community alters only would need separate proportions.
        if not weighted:
            return sum(ei_by_node.values()) / len(ei_by_node)
        total_weight = sum(weight_by_node.values())
        combined = 0
        for node, ei in ei_by_node.items():
            combined += ((weight_by_node[node] / total_weight) * ei)
        return combined

    def combine_or_nan(ei_by_node, weight_by_node):
        return combine(ei_by_node, weight_by_node) if ei_by_node else np.nan

    node2embed_ego, node2embed_ego_filter, node2embed_alter, node2embed_alter_filter, node2embed_alter2, node2embed_alter2_filter = [defaultdict(lambda : None) for _ in range(6)]

    # ego level measure: pool all ties of all alters
    for u in G_directed:
        local_community = node2community[u]
        # As G is a directed network, u's network only includes those who u has sent an email to
        local_network = set(G_directed.neighbors(u))
        within_cluster, within_cluster_filter, without_cluster, without_cluster_filter = 0, 0, 0, 0
        for v in G_directed.neighbors(u):
            same_community = node2community[v] == local_community
            for w in G_directed.neighbors(v):
                add = tie_weight(v, w)
                if w in local_network or node2community[w] == local_community:
                    within_cluster += add
                    if same_community:
                        within_cluster_filter += add
                else:
                    without_cluster += add
                    if same_community:
                        without_cluster_filter += add

        if (without_cluster + within_cluster) == 0:
            node2embed_ego[u] = np.nan
        else:
            node2embed_ego[u] = (without_cluster - within_cluster) / (without_cluster + within_cluster)

        if (without_cluster_filter + within_cluster_filter) == 0:
            node2embed_ego_filter[u] = np.nan
        else:
            node2embed_ego_filter[u] = (without_cluster_filter - within_cluster_filter) / (without_cluster_filter + within_cluster_filter)

    # alter level measure: one index per alter, then combined
    for u in G_directed:
        local_community = node2community[u]
        local_network = set(G_directed.neighbors(u))
        v2ei, v2ei_filtered, uv2weight = {}, {}, {}
        for v in G_directed.neighbors(u):
            uv2weight[v] = edges2weights[u, v]
            same_community = node2community[v] == local_community
            # skip any alter who doesn't have an alter
            if has_no_alters(v):
                continue
            v_ei = ei_of(v, local_network, local_community)
            v2ei[v] = v_ei
            if same_community:
                v2ei_filtered[v] = v_ei
        node2embed_alter[u] = combine_or_nan(v2ei, uv2weight)
        node2embed_alter_filter[u] = combine_or_nan(v2ei_filtered, uv2weight)

    # alter alter level measure: one index per alter's alter, combined per alter, then per ego
    for u in G_directed:
        local_network = set(G_directed.neighbors(u))
        local_community = node2community[u]
        v2ei, v2ei_filtered, uv2weight = {}, {}, {}
        for v in G_directed.neighbors(u):
            uv2weight[v] = edges2weights[u, v]
            # skip any alter who doesn't have an alter
            if has_no_alters(v):
                continue
            w2ei, vw2weight = {}, {}
            for w in G_directed.neighbors(v):
                vw2weight[w] = edges2weights[v, w]
                if has_no_alters(w):
                    continue
                w2ei[w] = ei_of(w, local_network, local_community)
            if w2ei:
                v_ei = combine(w2ei, vw2weight)
                v2ei[v] = v_ei
                if node2community[v] == local_community:
                    v2ei_filtered[v] = v_ei
        node2embed_alter2[u] = combine_or_nan(v2ei, uv2weight)
        # The filtered value gets its own accumulator and its own dictionary.
        # The earlier version wrote NaN into node2embed_alter2 (wiping the
        # unfiltered value), added the weighted filtered sum on top of the
        # unfiltered result, and divided the unweighted sum by len(v2ei).
        node2embed_alter2_filter[u] = combine_or_nan(v2ei_filtered, uv2weight)

    return [node2embed_ego, node2embed_ego_filter, node2embed_alter, node2embed_alter_filter, node2embed_alter2, node2embed_alter2_filter]

def generate_community_embeddedness(G_directed, community_algorithm, weighted, edges2weights):
    """
    Detect communities in the network and compute community E-I measures from them
    Parameters
    ----------
    G_directed : NetworkX DiGraph
        Network on which communities are detected
    community_algorithm : str
        Community detection algorithm to use; one of 'cnm', 'leiden' or 'surprise'
    weighted : bool
        Passed through unchanged to generate_community_ei
    edges2weights : dict
        Passed through unchanged to generate_community_ei
    Returns
    -------
    measures : list or defaultdict
        The list returned by generate_community_ei with the number of detected communities
        appended as its last element. If the algorithm is not supported, or 'leiden'/'surprise'
        raises AmbiguousSolution, a defaultdict whose missing keys map to np.nan is returned instead
    """
    sys.stderr.write("Computing communities using {}'s algorithm at {}.\n".format(community_algorithm, datetime.now()))

    # Reject unknown algorithm names before touching the graph
    if community_algorithm not in ('cnm', 'leiden', 'surprise'):
        print("Community detection algorithm {} not supported".format(community_algorithm))
        return defaultdict(lambda : np.nan)

    if community_algorithm == 'cnm':
        # Not wrapped in a try: any error from this algorithm propagates to the caller
        communities = algorithms.greedy_modularity(G_directed, weight='weight').communities
    else:
        try:
            if community_algorithm == 'leiden':
                clustering = algorithms.leiden(G_directed, weights='weight')
            else:
                clustering = algorithms.surprise_communities(G_directed, weights='weight')
            communities = clustering.communities
        except nx.exception.AmbiguousSolution:
            print('No community found using {} due to AmbiguousSolution error.'.format(community_algorithm))
            return defaultdict(lambda : np.nan)

    # Label every node with the position of its community in the detected list;
    # a node appearing in several communities keeps the last position seen
    node2community = {}
    for community_id, members in enumerate(communities):
        for node in members:
            node2community[node] = community_id

    measures = generate_community_ei(G_directed, node2community, weighted, edges2weights)
    measures.append(len(communities))
    return measures

def compute_threshold(edges2weights):
    """
    Computes, for every node that is the source of an edge, the 20th percentile of its edge weights
    Parameters
    ----------
    edges2weights : dict of {tuple : number}
        Maps each directed edge to its weight; edges are grouped by their first element
    Returns
    -------
    node2threshold : defaultdict
        Maps each source node to the 20th percentile (np.percentile) of the weights of its edges.
        Nodes that are not the source of any edge map to None
    """
    # Collect the weights of all edges that start at the same node
    outgoing_weights = defaultdict(list)
    for edge, w in edges2weights.items():
        source = edge[0]
        outgoing_weights[source].append(w)

    node2threshold = defaultdict(lambda : None)
    node2threshold.update(
        (source, np.percentile(ws, 20)) for source, ws in outgoing_weights.items()
    )
    return node2threshold

def generate_network_measures(timekey, edges, test_mode):
    """
    Builds a weighted directed network for one time period and computes node-level measures
    Parameters
    ----------
    timekey : str
        A string that represents the time period for which network measures are being computed
    edges : list
        A list of directed edges represented by 2-tuples; repeated edges are counted and
        the count becomes the edge weight
    test_mode : bool
        Accepted for call compatibility; this function does not read it
    Returns
    -------
    measures : dict of {tuple : list}
        Maps (node, timekey) to [weighted degree, unweighted degree, weighted clustering,
        unweighted clustering]. Empty if fewer than 10 edges are supplied
    """
    n_edges = len(edges)
    if n_edges < 10:
        sys.stderr.write('Returning empty network at %s with %d edges at %s.\n' % (timekey, n_edges, datetime.now()))
        return {}

    G_directed = nx.DiGraph()
    sys.stderr.write('Generating weighted network measures for %s with %d edges at %s.\n' % (timekey, n_edges, datetime.now()))

    # Collapse duplicate edges: the number of occurrences is the edge weight
    edges2weights = Counter(edges)
    G_directed.add_weighted_edges_from(
        [(pair[0], pair[1], count) for pair, count in edges2weights.items()]
    )

    weighted_degree = G_directed.degree(weight='weight')
    unweighted_degree = G_directed.degree(weight=None)
    weighted_clustering = nx.clustering(G_directed, weight='weight')
    unweighted_clustering = nx.clustering(G_directed, weight=None)

    return {
        (node, timekey): [
            weighted_degree[node],
            unweighted_degree[node],
            weighted_clustering[node],
            unweighted_clustering[node],
        ]
        for node in G_directed
    }

def time_edges_to_df(time_edges, test_mode=False):
    """
    Calculates network measures using edge lists
    Parameters
    ----------
    time_edges : dict
        A dictionary that maps quarters (quarters only) to a list of edges belonging to that time period
    test_mode : bool, optional
        If true, only generate one network, for a randomly chosen quarter that has more than 5 edges.
        If no quarter qualifies, no network is generated and the measures are empty
    Returns
    -------
    df : pd.DataFrame
        The dataframe built by dict_to_df from the network measures, with index names
        'anon_id' and 'quarter', rounded to 5 decimal places
    """
    network_measures = {}
    if test_mode:
        # NOTE(review): this filter keeps quarters with 6-9 edges, but generate_network_measures
        # returns an empty result below 10 edges, so a test run can still come back empty
        candidates = {quarter: edges for quarter, edges in time_edges.items() if len(edges) > 5}
        if candidates:
            test_timekey = random.choice(list(candidates))
            sys.stderr.write("Testing timekey %s out of %d time periods.\n" % (test_timekey, len(candidates)))
            network_measures = generate_network_measures(test_timekey, candidates[test_timekey], test_mode)
        else:
            # random.choice would raise an opaque IndexError on an empty list; report and
            # continue with empty measures, as the non-test path does when nothing is computed
            sys.stderr.write("No time period has more than 5 edges; no network to test.\n")
    else:
        # The context manager terminates the workers even if scheduling or joining raises
        with multiprocessing.Pool(processes=num_cores) as pool:
            results = [
                pool.apply_async(generate_network_measures, args=(timekey, edges, test_mode))
                for timekey, edges in time_edges.items()
            ]
            pool.close()
            pool.join()
            for r in results:
                # get() re-raises any exception raised inside the worker
                network_measures.update(r.get())

    cols = ['weighted_degree', 'unweighted_degree', 'weighted_clustering', 'unweighted_clustering']
    df = dict_to_df(network_measures, cols, index_name=['anon_id', 'quarter'])
    return df.round(5)

def extract_quarterly_network_measures(corpus_dir, test_mode=False):
    """
    Main workhorse function for computing network measures and writing them to file.
    Quarterly edges are read from 'edges_corrected.txt' in output_dir when that file exists;
    otherwise they are computed with get_quarterly_edges and written to that file first.
    Parameters
    ----------
    corpus_dir : str
        Location of cleaned emails; not read by this function
    test_mode : bool, optional
        If true, the module-level output_dir is rebound to test_dir before any file is
        read or written; the flag is also forwarded to the edge and measure computations
    """
    global output_dir
    if test_mode:
        output_dir = test_dir

    edges_file = os.path.join(output_dir, 'edges_corrected.txt')

    if not os.path.exists(edges_file):
        sys.stderr.write("Computing edges at %s.\n" % str(datetime.now()))
        quarterly_edges = get_quarterly_edges(test_mode)
        sys.stderr.write("Writing edges to edge file at %s.\n" % str(datetime.now()))
        # One "(quarter, source, target)" tuple literal per line, so the file can be
        # parsed back with literal_eval on a later run
        with open(edges_file, 'w') as f:
            f.writelines(
                str((quarter, e[0], e[1])) + '\n'
                for quarter, edges in quarterly_edges.items()
                for e in edges
            )
    else:
        sys.stderr.write("Reading edges from edge file at %s.\n" % str(datetime.now()))
        quarterly_edges = defaultdict(list)
        with open(edges_file, 'r') as f:
            for line in f:
                record = literal_eval(line)
                quarterly_edges[record[0]].append((record[1], record[2]))

    network_df_quarterly = time_edges_to_df(quarterly_edges, test_mode)
    network_df_quarterly.to_csv(os.path.join(output_dir, "tech_network_embed.csv"))

In [None]:
starttime = datetime.now()
# Test mode is on only when the first command-line argument is 'test' (case-insensitive)
test_mode = len(sys.argv) > 1 and sys.argv[1].lower() == 'test'

sys.stderr.write('Reading HR and survey data at %s.\n' % datetime.now())
hr_df = extract_hr_df(hr_file)
# Rows with a missing 'female' value are dropped; any value other than 1 is labelled 'male'
usr2gender = {
    usr: ('female' if is_female == 1 else 'male')
    for usr, is_female in hr_df['female'].dropna().to_dict().items()
}
hr_df_quarterly = extract_hr_df(hr_file, 'quarter')
# np.select takes the first matching condition, so sales wins over tech, which wins over marketing
hr_df_quarterly['department'] = np.select(
    [
        (hr_df_quarterly['sales'] == 1),
        (hr_df_quarterly['tech'] == 1),
        (hr_df_quarterly['marketing'] == 1),
    ],
    ['sales', 'tech', 'marketing'],
    default='other',
)
usr_quarter2department = hr_df_quarterly['department'].dropna().to_dict()

sys.stderr.write('Generating network measures at %s.\n' % datetime.now())
extract_quarterly_network_measures(corpus_dir, test_mode)

sys.stderr.write(
    'Finished generating network measures at %s with a duration of %s.\n'
    % (datetime.now(), str(datetime.now()-starttime))
)