**Company** : <br>
Design Firm

**Notebook Function** : <br>
    This notebook generates the global reach measures

**Output File(s)** : <br>
    design_diversity_bridging_all_internal.csv - The final output file containing the network measures

**Author(s)** : <br>
Lara Yang, Sarayu Anshuman

Load libraries (nothing is installed in this cell; all packages must already be available in the environment)

In [None]:
import os
import sys
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
from mittens import Mittens
import csv
from operator import itemgetter
import json
import ujson
import re
from gensim.matutils import cossim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from statistics import mean 
import multiprocessing
year_colname, quarter_colname, yearmonth_colname = 'year', 'quarter', 'yearmonth'
num_cores = 12
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import networkx as nx
import multiprocessing
import random
from ast import literal_eval
import networkx.algorithms.community as nx_comm
from cdlib import algorithms

Run helper functions

In [None]:
#########################################################################
########### Helper Functions for Generating Mittens Embeddings ##########
#########################################################################

def _window_based_iterator(toks, window_size, weighting_function):
    for i, w in enumerate(toks):
        yield w, w, 1
        left = max([0, i-window_size])
        for x in range(left, i):
            yield w, toks[x],weighting_function(x)
        right = min([i+1+window_size, len(toks)])
        for x in range(i+1, right):
            yield w, toks[x], weighting_function(x)
    return

def glove2dict(glove_filename):
    """
    Load a GloVe-format text file into a dictionary.

    Each non-empty line is expected to hold a token followed by its vector
    components, separated by single spaces.
    Parameters
    ----------
    glove_filename : str
        Path to the embeddings file
    Returns
    -------
    dict
        Maps each token to a numpy array of floats
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform default;
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(glove_filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        # Blank lines come back as empty rows; skip them instead of failing on row[0].
        return {row[0]: np.array([float(v) for v in row[1:]]) for row in reader if row}

# Inspired by original build_weighted_matrix in utils.py in the Mittens paper source codebase
def build_weighted_matrix(files,
        mincount=300, vocab_size=None, window_size=10,
        weighting_function=lambda x: 1 / (x + 1), load_to_mem=False, parallel=False, verbose=False,
        internal_only=False):
    """
    Builds a count matrix based on a co-occurrence window of
    `window_size` elements before and `window_size` elements after the
    focal word, where the counts are weighted by `weighting_function`.
    Parameters
    ----------
    files : list of strings
        Filenames where documents in a corpus are located
    mincount : int
        Only words with at least this many tokens will be included.
    vocab_size : int or None
        If this is an int above 0, then, the top `vocab_size` words
        by frequency are included in the matrix, and `mincount`
        is ignored.
    window_size : int
        Size of the window before and after. (So the total window size
        is 2 times this value, with the focal word at the center.)
    weighting_function : function from ints to floats
        Passed through to `_window_based_iterator` to weight each co-occurrence.
    load_to_mem : bool, optional
        Whether to load entire corpus to memory
    parallel : bool, optional
        Whether to process emails in parallel. Only makes sense if loading to memory.
    verbose : bool, optional
        Whether to write progress messages to stderr
    internal_only : bool, optional
        Whether to only include internal emails - emails whose recipients only include company employees
    Returns
    -------
    pd.DataFrame
        Square matrix whose rows and columns are the sorted vocabulary.
        NOTE(review): the matrix is symmetric only if `_window_based_iterator`
        weights a pair identically in both directions - confirm.
    """
    wc = defaultdict(int)
    # `corpus` is a list of token lists when load_to_mem is true and a generator of message strings otherwise
    corpus = None
    if load_to_mem:
        if parallel:
            pool = multiprocessing.Pool(processes=num_cores)
            try:
                pending = [pool.apply_async(read_corpus_parallel, args=(f, internal_only, )) for f in files]
                # Fetch every result exactly once; None/empty results (unreadable, filtered or blank messages) are dropped
                corpus = [toks for toks in (job.get() for job in pending) if toks]
            finally:
                # Release the worker processes even when a worker raised
                pool.close()
                pool.join()
        else:
            # previously used split(' ') in 2yp
            corpus = [text.split() for text in read_corpus(files, internal_only)]
        for toks in corpus:
            for tok in toks:
                wc[tok] += 1
        sys.stderr.write('\n')
    else:
        corpus = read_corpus(files, internal_only)
        for text in corpus:
            for tok in text.split():
                wc[tok] += 1
    if verbose: sys.stderr.write('Finished counting %d unique words in corpus at %s.\n' % (len(wc), datetime.now()))
    if vocab_size:
        srt = sorted(wc.items(), key=itemgetter(1), reverse=True)
        vocab_set = {w for w, c in srt[: vocab_size]}
    else:
        vocab_set = {w for w, c in wc.items() if c >= mincount}
    vocab = sorted(vocab_set)
    n_words = len(vocab)
    if verbose: sys.stderr.write('Finished generating vocab of %d words at %s.\n' % (n_words, datetime.now()))
    # Weighted counts:
    # The generator was exhausted by the word count above and needs to be re-initiated
    if not load_to_mem: corpus = read_corpus(files, internal_only)
    counts = defaultdict(float)
    for toks in corpus:
        if not load_to_mem: toks = toks.split()
        window_iter = _window_based_iterator(toks, window_size, weighting_function)
        for w, w_c, val in window_iter:
            if w in vocab_set and w_c in vocab_set:
                counts[(w, w_c)] += val
    if verbose: sys.stderr.write('Finished counting co-occurrences across %d word pairs at %s.\n' % (len(counts), datetime.now()))
    # Fill the dense matrix from the observed pairs only. Indexing the defaultdict for every
    # (w1, w2) combination would insert a zero entry for each of the n_words**2 pairs.
    word2idx = {w: i for i, w in enumerate(vocab)}
    X = np.zeros((n_words, n_words))
    for (w1, w2), val in counts.items():
        X[word2idx[w1], word2idx[w2]] = val
    if verbose: sys.stderr.write('Finished converting co-occurrences to sorted matrix of shape %s at %s.\n' % (str(X.shape), datetime.now()))
    X = pd.DataFrame(X, columns=vocab, index=pd.Index(vocab))
    return X

# this method is currently unused as it was used for proof-of-concept during macro lunch,
# where only employees with the top 200 vocab counts are included
def select_usr_corpus(inpath, num_usrs_to_process):
    """
    Pick the users with the largest token counts.
    Parameters
    ----------
    inpath : str
        Path to a CSV file with at least the columns `usr` and `num_tokens`
    num_usrs_to_process : int
        Number of users to keep
    Returns
    -------
    list
        The `num_usrs_to_process` users with the highest `num_tokens`, in ascending order of
        token count. Rows containing missing values are ignored.
    """
    table = pd.read_csv(inpath).set_index(["usr"]).dropna()
    token_counts = table.to_dict()["num_tokens"]
    # Stable ascending sort of the users by their token count
    ranked = sorted(token_counts, key=token_counts.get)
    return ranked[-num_usrs_to_process:]

# The function currently ignores sentence delimiters and considers co-occurrences across sentence boundaries to be
# co-occurrences as well. If desired, we can vary yield behavior based on the sentence_delim boolean to yield every
# sentence instead of every message as it currently does. However, if we were to yield every sentence, we need to make
# sure to not simply output the body but to yield every new line in the body individually.
def read_corpus(files, internal_only, sentence_delim=False):
    """
    Lazily yield the hashed body of every readable message in `files`.
    Parameters
    ----------
    files : list of str
        Paths to JSON message files
    internal_only : bool
        If true, messages for which is_internal_msg returns False are skipped
    sentence_delim : bool, optional
        Currently unused
    Yields
    ------
    str
        The message's 'hashed-body' with newlines replaced by spaces and surrounding
        whitespace stripped. Empty bodies and files that fail to parse are skipped.
    """
    for path in files:
        with open(path, errors='ignore', encoding='utf-8') as handle:
            try:
                msg = ujson.load(handle)
                if not internal_only or is_internal_msg(msg):
                    # Newlines mark sentence splits; flattening them ignores sentence structure
                    text = msg['hashed-body'].replace('\n', ' ').strip()
                    if text:
                        yield text
            except (ValueError, json.decoder.JSONDecodeError):
                continue

def read_corpus_parallel(f, internal_only, sentence_delim=False):
    """
    Read a single JSON message file and return its tokenized hashed body.
    Parameters
    ----------
    f : str
        Path to a JSON message file
    internal_only : bool
        If true, a message for which is_internal_msg returns False yields None
    sentence_delim : bool, optional
        Currently unused
    Returns
    -------
    list of str or None
        Whitespace-split tokens of the body, or None when the message is filtered out,
        has an empty body, or fails to parse.
    """
    with open(f, errors='ignore', encoding='utf-8') as handle:
        try:
            msg = ujson.load(handle)
            if not internal_only or is_internal_msg(msg):
                # Newlines mark sentence splits; flattening them ignores sentence structure
                text = msg['hashed-body'].replace('\n', ' ').strip()
                if text:
                    return text.split()
        except (ValueError, json.decoder.JSONDecodeError):
            pass
    return None

def output_embeddings(mittens_df, filename, compress=False):
    """
    Write an embeddings DataFrame to a space-separated text file without a header row,
    replacing any existing file of the same name.
    Parameters
    ----------
    mittens_df : pd.DataFrame
        Embeddings to write; the index is written as the first field of every line
    filename : str
        Output path. When compressing, '.gz' is appended to it.
    compress : bool, optional
        Whether to gzip the output
    """
    target = filename + '.gz' if compress else filename
    extra = {'compression': 'gzip'} if compress else {}
    mittens_df.to_csv(target, quoting=csv.QUOTE_NONE, header=False, sep=" ", **extra)

#########################################################################
############# Helper Functions for Working with JSON Emails #############
#########################################################################
# Same as get_recipient function in Analysis/acculturaltion/lingdistance/jensen_shannon.py
def get_recipients(msg):
    """
    Collect every recipient of a message except its sender.
    Parameters
    ----------
    msg : dict
        Parsed message with a 'from' field (string or list) and optional
        'to', 'cc' and 'bcc' fields
    Returns
    -------
    set
        All addresses in 'to', 'cc' and 'bcc', minus the sender
    """
    sender = msg['from']
    if isinstance(sender, list):
        # An empty sender list means there is no sender to exclude
        sender = sender[0] if sender else None
    recipients = set()
    for field in ('to', 'cc', 'bcc'):
        # Missing or null recipient fields count as empty instead of raising
        value = msg.get(field) or []
        if isinstance(value, str):
            # A lone address must not be split into characters
            value = [value]
        recipients.update(value)
    recipients.discard(sender)
    return recipients

def is_internal_msg(msg):
    """
    Tell whether `msg` was sent to internal employees only.

    Internal employees have anon IDs starting with 'U'. Anyone without a design-firm domain
    has an anon ID starting with 'E'. A message is internal when none of its recipients
    has an ID starting with 'E---'.
    """
    return not any(re.match('E---', r) for r in get_recipients(msg))

def slice_user_corpus(files, train_mode):
    """
    Group message files by the time period of their 'date' field.
    Parameters
    ----------
    files : list of str
        Paths to JSON message files
    train_mode : str
        One of 'annual', 'quarterly', 'monthly', 'halfyear' or 'all'. 'all' files every
        message under its year, quarter and half-year keys (monthly keys are left out
        because they lead to too many output files). Any other value groups nothing.
    Returns
    -------
    defaultdict
        Maps each time key to the list of files dated within it. Files that fail to
        parse are skipped.
    """
    timekey2files = defaultdict(list)
    for path in files:
        with open(path, errors='ignore') as handle:
            try:
                msg = ujson.load(handle)
                # Choose the key builders for this mode; they are applied in order
                if train_mode == 'all':
                    keyers = (to_year, to_quarter, to_halfyear)
                elif train_mode == 'annual':
                    keyers = (to_year,)
                elif train_mode == 'quarterly':
                    keyers = (to_quarter,)
                elif train_mode == 'monthly':
                    keyers = (to_yearmonth,)
                elif train_mode == 'halfyear':
                    keyers = (to_halfyear,)
                else:
                    keyers = ()
                for keyer in keyers:
                    timekey2files[keyer(msg['date'], format='str')].append(path)
            except (ValueError, json.decoder.JSONDecodeError):
                continue
    return timekey2files

def read_message(files):
    """
    Lazily yield the parsed JSON content of every readable file in `files`.

    Files that fail to parse are counted and skipped. Once all files have been
    visited, the number of failures is reported on stderr.
    """
    failed = 0
    for path in files:
        with open(path, errors='ignore') as handle:
            try:
                yield ujson.load(handle)
            except (ValueError, json.decoder.JSONDecodeError):
                failed += 1
    sys.stderr.write("{} files produced errors out of a total of {} files.\n".format(failed, len(files)))

#########################################################################
############# Helper Functions for Working with Date Objects ############
#########################################################################

def to_yearmonth(date, format):
    """
    Return the 'YYYY-MM' key of `date`.

    `format` is 'str' when `date` is a string starting with 'YYYY-MM', or 'datetime'
    when it is a datetime-like object. Any other `format` returns None.
    """
    if format == 'datetime':
        return date.strftime('%Y-%m')
    return date[0:7] if format == 'str' else None

def to_quarter(date, format):
    """
    Return the quarter key of `date`, e.g. '2020Q2'.

    `format` is 'str' when `date` is a string starting with 'YYYY-MM', or 'datetime'
    when it is a datetime-like object. Any other `format` treats year and month as 0.
    """
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    return '{}Q{}'.format(year, (int(month) - 1) // 3 + 1)

def to_halfyear(date, format):
    """
    Return the half-year key of `date`, e.g. '2020HY1'.

    `format` is 'str' when `date` is a string starting with 'YYYY-MM', or 'datetime'
    when it is a datetime-like object. Any other `format` treats year and month as 0.
    """
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    return '{}HY{}'.format(year, (int(month) - 1) // 6 + 1)

def to_year(date, format):
    """
    Return the four-character year of `date` as a string.

    `format` is 'str' when `date` is a string starting with 'YYYY', or 'datetime'
    when it is a datetime-like object. Any other `format` returns None.
    """
    if format == 'datetime':
        return str(date.year)
    return date[0:4] if format == 'str' else None

def datetime_to_timekey(date, time_key):
    """
    Convert a datetime-like `date` to the string key for the given granularity.

    `time_key` is 'year', 'quarter' or 'yearmonth'. Any other value returns None.
    """
    if time_key == 'yearmonth':
        converter = to_yearmonth
    elif time_key == 'quarter':
        converter = to_quarter
    elif time_key == 'year':
        converter = to_year
    else:
        return None
    return converter(date, format='datetime')

def is_month_before_equal(datetime1, datetime2):
    """
    Return 1 if the (year, month) of `datetime1` is earlier than or equal to that of
    `datetime2`, otherwise 0. Days and times are ignored.
    """
    first = (datetime1.year, datetime1.month)
    second = (datetime2.year, datetime2.month)
    return 1 if first <= second else 0

def num_months_between_dates(datetime1, datetime2):
    """Return the absolute number of calendar months separating the two dates (days are ignored)."""
    year_gap = datetime1.year - datetime2.year
    month_gap = datetime1.month - datetime2.month
    return abs(year_gap * 12 + month_gap)

def num_quarters_between_dates(datetime1, datetime2):
    """Return the absolute number of calendar months between the two dates, floor-divided by 3."""
    year_gap = datetime1.year - datetime2.year
    month_gap = datetime1.month - datetime2.month
    months_apart = abs(year_gap * 12 + month_gap)
    return months_apart // 3

def num_years_between_dates(datetime1, datetime2):
    """Return the absolute difference between the calendar years of the two dates."""
    year_gap = datetime1.year - datetime2.year
    return abs(year_gap)

def time_between_dates(datetime1, datetime2, time_key):
    """
    Return the number of periods separating two dates.

    `time_key` is 'monthly', 'quarterly' or 'annual'. Any other value returns None.
    """
    if time_key == 'annual':
        counter = num_years_between_dates
    elif time_key == 'quarterly':
        counter = num_quarters_between_dates
    elif time_key == 'monthly':
        counter = num_months_between_dates
    else:
        return None
    return counter(datetime1, datetime2)

#########################################################################
############## Helper Functions for Working with Dataframes #############
#########################################################################
def dict_to_df(index2rows, cols, index_name):
    """
    Build a DataFrame, sorted by index, from a dictionary of rows.
    Parameters
    ----------
    index2rows : dict
        Dictionary mapping index values to the rows to be converted. Keys are tuples
        when more than one index name is given.
    cols : list
        List of column names of type str
    index_name : list
        List of index names. One name gives a plain named index; several give a MultiIndex.
    Returns
    -------
    df : pd.DataFrame
        Constructed dataframe
    """
    frame = pd.DataFrame.from_dict(index2rows, orient='index', columns=cols)
    if len(index_name) == 1:
        frame.index.name = index_name[0]
    else:
        multi = pd.MultiIndex.from_tuples(frame.index, names=index_name)
        frame = pd.DataFrame(frame, multi)
    return frame.sort_index(axis=0)

def generate_test_df():
    """
    A function to generate a DataFrame for testing purposes.

    The frame has ten rows indexed by 'gender' ('F' or 'M') and two columns, 'i' and 'we',
    that simulate word vectors for those words. Each cell is meant to hold a size-10
    numpy vector of uniform random numbers rounded to 5 decimals.
    The output is not reproducible unless the caller seeds numpy's global RNG.
    Returns
    -------
    df : pd.DataFrame
        The simulated test dataframe
    """
    df = pd.DataFrame()
    df['gender'] = ['F', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'F']
    # One fresh random vector is drawn per row; the row itself (x) is ignored
    df['i'] = df.apply(lambda x: np.random.rand(10).round(5), axis=1)
    df['we'] = df.apply(lambda x: np.random.rand(10).round(5), axis=1)
    df.set_index('gender', inplace=True)
    return df

#########################################################################
########### Helper Functions for Working with Embedding Output ##########
#########################################################################

def remove_empty_embeddings(embeddings_dir):
    """
    Delete every zero-byte entry in embeddings_dir, such as the embedding files produced
    when the vocab size was 0.
    Parameters
    ----------
    embeddings_dir : str
        Full path to directory where embedding files are located
    """
    for name in os.listdir(embeddings_dir):
        candidate = os.path.join(embeddings_dir, name)
        if os.path.getsize(candidate) != 0:
            continue
        os.remove(candidate)

def word_similarity(model, w1, w2):
    """
    Compare one word or word list against another through the model's n_similarity.

    When both arguments are single words, n_similarity returns their cosine similarity,
    the same as calling similarity(w1, w2). When either argument is a set of words,
    n_similarity takes the mean vector of the set and computes the cosine similarity
    between that mean and the other vector. This is reflected in its source code and
    has been verified manually.
    Parameters
    ----------
    model : KeyedVectors
        The model that contains all the words and vectors
    w1 : str or list
        The first word or word list to be compared
    w2 : str or list
        The second word or word list to be compared
    Returns
    -------
    float or None
        Cosine similarity between w1 and w2, or None when either side has no word
        in the model's vocabulary
    """
    first = w1 if isinstance(w1, list) else [w1]
    second = w2 if isinstance(w2, list) else [w2]
    # Out-of-vocabulary words are dropped before comparing
    known_first = [w for w in first if w in model.vocab]
    known_second = [w for w in second if w in model.vocab]
    if not known_first or not known_second:
        return None
    return model.n_similarity(known_first, known_second)

def extract_company_embedding(company_embeddings_filename, tmp_dir, words):
    """
    Look up the company-embedding vectors of the given words.

    The GloVe-format embeddings file is first converted to word2vec format in a
    temporary file and then loaded through gensim.
    Parameters
    ----------
    company_embeddings_filename : str
        File path of the company embeddings
    tmp_dir : str
        Path to the directory for gensim to output its tmp files in order to load embeddings into word2vec format
    words : list
        A list of strings to retrieve vectors for
    Returns
    -------
    vecs : list
        Vectors for the words found in the embeddings, in the order given. Words that
        are missing are reported on stdout and skipped.
    """
    converted_path = get_tmpfile(os.path.join(tmp_dir, "mittens_embeddings_all_word2vec.txt"))
    glove2word2vec(company_embeddings_filename, converted_path)
    model = KeyedVectors.load_word2vec_format(converted_path)
    vecs = []
    for w in words:
        if w not in model.vocab:
            print('%s not in company embeddings' % w)
            continue
        vecs.append(model.wv[w])
    return vecs

def isnull_wrapper(x):
    """
    Return whether `x` is null, or contains any null when pd.isnull returns an
    element-wise result instead of a plain bool.
    """
    flags = pd.isnull(x)
    return flags if isinstance(flags, bool) else flags.any()

def cossim_with_none(vec1, vec2, vec_format='sparse'):
    """
    Cosine similarity that returns None instead of erroring when a vector is null.
    Parameters
    ----------
    vec1 : list of (int, float) in gensim sparse vector format, or numpy array
    vec2 : list of (int, float) in gensim sparse vector format, or numpy array
    vec_format : str, optional
        Either sparse or dense. If sparse, vec1 and vec2 are in gensim sparse vector format and
        gensim's cossim is used. If dense, they are numpy arrays and the cosine similarity is
        computed by hand.
    Returns
    -------
    float or None
        Cosine similarity between vec1 and vec2. None when either vector is null (or contains
        a null), or when a dense vector is empty.
    Raises
    ------
    ValueError
        If vec_format is neither 'sparse' nor 'dense'
    """
    if isnull_wrapper(vec1) or isnull_wrapper(vec2):
        return None
    if vec_format == 'sparse':
        return cossim(vec1, vec2)
    if vec_format != 'dense':
        raise ValueError()
    if len(vec1) == 0 or len(vec2) == 0:
        return None
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1) * np.linalg.norm(vec2))

def calculate_pairwise_cossim(col1, col2=None, reference=False, reference_group=None, anon_ids=None):
    """
    Average the cosine similarity of every row vector in col1 with the row vectors at
    all other positions in col2. Without col2, every vector in col1 is compared with
    every other vector in col1. The two columns should have equal length.
    Parameters
    ----------
    col1 : pd.Series
        A column where each row is a sparse word vector (BoW format that gensim code is written for).
    col2 : pd.Series, optional
        A column where each row is a sparse word vector (BoW format that gensim code is written for).
    reference : bool, optional
        Indicator variable for whether filtering for reference groups is needed
    reference_group : pd.Series, optional
        If filtering for reference groups, the reference group members (a set) for every employee in col1
    anon_ids : pd.Series, optional
        If filtering for reference groups, the anon_ids for every employee in col1
    Returns
    -------
    results : list
        The ith element is the mean cosine similarity between the ith vector in col1 and every
        eligible vector j != i in the other column, or None when the ith vector is empty or
        nothing is eligible for comparison.
    """
    vectors1 = col1.tolist()
    vectors2 = col1.tolist() if col2 is None else col2.tolist()
    groups = reference_group.tolist() if reference else None
    ids = anon_ids.tolist() if anon_ids is not None else None
    results = []
    for i, focal in enumerate(vectors1):
        if not focal:
            results.append(None)
            continue
        sims = []
        for j, other in enumerate(vectors2):
            if i == j or not other:
                continue
            # A reference group that is not a set (e.g. np.nan) matches nobody
            if reference and not (type(groups[i]) == set and ids[j] in groups[i]):
                continue
            sims.append(cossim(focal, other))
        results.append(mean(sims) if len(sims) > 0 else None)
    return results

def vector_mean(col):
    """
    Average the row vectors of a column element-wise, ignoring missing rows.
    Parameters
    ----------
    col : pd.Series
        The column to be averaged
    Returns
    -------
    np.array
        A vector that is the numerical average of all non-null vectors in col
    """
    present = col[col.notna()]
    stacked = np.array(present.tolist())
    return stacked.mean(axis=0)

def fix_department(hr_df):
    """
    Correct the known typos in the department field and strip trailing spaces. Mostly for
    generating a correct department roster used by mittens3d_explore_alternative_embeddings.py
    Parameters
    ----------
    hr_df : pd.DataFrame
        A dataframe of HR data with a string column 'department'
    Returns
    -------
    hr_df : pd.DataFrame
        The same dataframe, with the department column corrected in place
    """
    corrections = (
        ("Enviroments", "Environments"),
        ("Leanrnign", "Learning"),
        (" +$", ""),
    )
    department = hr_df['department']
    for pattern, replacement in corrections:
        department = department.str.replace(pattern, replacement, regex=True)
    hr_df['department'] = department
    return hr_df

def convert_to_datetime(date_str):
    """
    Parse a 'month-day-year' date, trying a two-digit year first and falling back to a
    four-digit year when that raises ValueError.
    """
    try:
        parsed = pd.to_datetime(date_str, format='%m-%d-%y')
    except ValueError:
        parsed = pd.to_datetime(date_str, format='%m-%d-%Y')
    return parsed

def extract_hr_df(hr_filepath, time_key=None):
    """
    Load the HR snapshot file and derive per-employee or per-period HR variables.
    Parameters
    ----------
    hr_filepath : str
        Path to the HR CSV file. Spaces in its column names are replaced by underscores.
    time_key : str, optional
        If omitted, one row per employee is returned. Otherwise it is the granularity passed to
        datetime_to_timekey ('year', 'quarter' or 'yearmonth') and one row per employee and
        period is returned.
    Returns
    -------
    hr : pd.DataFrame
        Without time_key: indexed by 'usr' with demographics, tenure, exit flags, supervisor flag
        and salary summaries. With time_key: indexed by ('anon_id', time_key) with demographics,
        salary, promotion, exit and supervisor flags.
    """
    hr = pd.read_csv(hr_filepath)
    # Normalize column names: spaces become underscores
    cols = hr.columns
    cols = cols.map(lambda x: x.replace(' ', '_') if isinstance(x, str) else x)
    hr.columns = cols
    #hr['snapshot_date'] = pd.to_datetime(hr['snapshot_date'])
    hr['snapshot_date'] = hr['snapshot_date'].apply(convert_to_datetime)
    hr = hr.sort_values(by=['anon_id', 'snapshot_date'])
    # drop all rehires: every row of any employee who has a rehire_date on at least one row
    hr_rehires = set(hr.loc[hr['rehire_date'].notna(), 'anon_id'])
    hr['drop'] = hr.apply(lambda row : 1 if row.anon_id in hr_rehires else 0, axis=1)
    hr = hr.loc[hr['drop'] == 0]
    # NOTE(review): 'coerce' is passed positionally; presumably it binds to the `errors`
    # argument of pd.to_datetime - confirm against the installed pandas version
    hr['hire_date'] = pd.to_datetime(hr['hire_date'], 'coerce')
    hr['termination_date'] = pd.to_datetime(hr['termination_date'])
    hr['termination_reason'] = hr['termination_reason'].str.lower()
    hr.dropna(subset=['hire_date', 'snapshot_date'], axis=0, inplace=True)
    # Youngest recorded age across an employee's rows
    hr['entry_age'] = hr.groupby('anon_id')['age'].transform('min')
    # Exit flags are row-level: set only on rows that carry a termination reason
    hr['exit'] = hr.apply(lambda row : 1 if pd.notna(row['termination_reason']) else 0, axis=1)
    hr['exit_vol'] = hr.apply(lambda row : 1 if (row['termination_reason'] == 'voluntary') else 0, axis=1)
    hr['exit_invol'] = hr.apply(lambda row : 1 if (row['termination_reason'] == 'involuntary') else 0, axis=1)
    hr = fix_department(hr)
    if not time_key:
        # Cross-sectional output: one row per employee
        supervisors = set(hr['supervisor_id'])
        hr['is_supervisor'] = hr.apply(lambda row : 1 if row.anon_id in supervisors else 0, axis=1)
        # Salary on snapshots whose calendar month equals the hire month (the year is not compared)
        hr['initial_salary'] = hr.apply(lambda row: row.annual_salary if (row.snapshot_date.month == row.hire_date.month) else None, axis=1)
        # NOTE(review): the forward fill is not grouped by anon_id, so a gap can be filled
        # from the previous employee's rows - confirm this is intended
        hr['initial_salary'].fillna(method = 'ffill', inplace=True)
        hr['salary_delta'] = (hr['annual_salary'] - hr['initial_salary']) / hr['initial_salary']
        hr['salary_avg'] = hr.groupby('anon_id')['annual_salary'].transform('mean')
        # Tenure in days: up to termination when a reason is recorded, otherwise up to the snapshot
        hr['tenure'] = hr.apply(lambda row:
            (row.termination_date - row.hire_date).days if pd.notna(row.termination_reason) else (row.snapshot_date - row.hire_date).days, axis=1)
        # Rows are sorted by snapshot date, so keep='last' retains each employee's latest snapshot
        hr.drop_duplicates(subset='anon_id', keep='last', inplace=True)
        hr = (hr[['anon_id', 'gender', 'ethnicity', 'department', 'location', 'job_title',
            'tenure', 'exit', 'exit_vol', 'exit_invol', 'is_supervisor', 'salary_delta', 'salary_avg', 'entry_age']])
        hr.set_index('anon_id', inplace=True)
        hr.index.name = 'usr'
    else:
        # Panel output: one row per employee and period
        hr[time_key] = hr.apply(lambda row : datetime_to_timekey(row['snapshot_date'], time_key), axis=1)
        # Previous snapshot's salary and title for the same employee
        hr['past_salary'] = hr.groupby('anon_id')['annual_salary'].shift(1)
        hr['past_job_title'] = hr.groupby('anon_id')['job_title'].shift(1)
        # Promotion = salary increase together with a job title change since the previous snapshot
        hr['promotion'] = hr.apply(lambda row : 1 if (row['annual_salary'] > row['past_salary'] and (row['past_job_title'] != row['job_title'])) else 0, axis=1)
        # Supervisor status is evaluated per snapshot date
        monthly_supervisors = {k: set(sups['supervisor_id'].tolist()) for k, sups in hr[['snapshot_date', 'supervisor_id']].groupby("snapshot_date")}
        hr['is_supervisor'] = hr.apply(lambda row : 1 if row['anon_id'] in monthly_supervisors[row['snapshot_date']] else 0, axis=1)
        if time_key == quarter_colname or time_key == year_colname:
            # Collapse snapshots within a period: a flag is 1 if it was 1 on any snapshot in the period
            hr['promotion'] = hr.groupby(['anon_id', time_key])['promotion'].transform('max')
            hr['exit'] = hr.groupby(['anon_id', time_key])['exit'].transform('max')
            hr['exit_vol'] = hr.groupby(['anon_id', time_key])['exit_vol'].transform('max')
            hr['exit_invol'] = hr.groupby(['anon_id', time_key])['exit_invol'].transform('max')
            hr['is_supervisor'] = hr.groupby(['anon_id', time_key])['is_supervisor'].transform('max')
            # The remaining columns come from the last snapshot of the period
            hr.drop_duplicates(subset=['anon_id', time_key], keep='last', inplace=True)
        hr = (hr[['anon_id', time_key, 'snapshot_date', 'gender', 'ethnicity', 'department', 'location', 'job_title', 'annual_salary', 'hire_date',
                'is_supervisor', 'age', 'entry_age', 'promotion', 'exit', 'exit_vol', 'exit_invol']])
        hr.set_index(['anon_id', time_key], inplace=True)
    return hr

Print current directory

In [None]:
import os
# Record the working directory of the running process
current_dir = os.getcwd()
# Bare expression as the last line of the cell (presumably so the notebook displays the path)
current_dir

Set the file paths

In [None]:
# Run configuration. NOTE(review): where the flags below are consumed is not visible
# in this part of the notebook - confirm their meaning against the cells that use them.
internal_only = True
first_batches = False
build_threshold_network = False
# Overrides the num_cores = 12 set in the import cell
num_cores = 14
weighted_mode = 'both'
# Column names for the time granularities (same values as in the import cell)
year_colname, quarter_colname, yearmonth_colname = 'year', 'quarter', 'yearmonth'
# Hard-coded absolute paths; they must be changed to run in another environment
home_dir = "/zfs/projects/faculty/amirgo-transfer/design/"
corpus_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/design/fine-tuning/hashed_email"
output_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/network_files/design"

# Echo the input and output locations
print(corpus_dir)
print(output_dir)

Run functions to generate the network measures

In [None]:
def find_all_neighbors(G_directed, u):
    """
    Return the set of nodes that are a successor or a predecessor of u in G_directed.
    """
    outgoing = set(G_directed.successors(u))
    incoming = set(G_directed.predecessors(u))
    return outgoing | incoming

def generate_department_bridging(G_directed, usr2department, edges2weights, weighted):
    """
    Generate the department-based EI index, (external - internal) / (external + internal),
    of every node that has a known department. Alters without a known department are ignored.
    Parameters
    ----------
    G_directed : NetworkX graph
    usr2department : dict
        Maps users to departments
    edges2weights : dict
        Maps directed edges (u, v) to weights
    weighted : bool
        Whether ties are counted by edge weight (True) or as 1 each (False)
    Returns
    -------
    tuple
        A 2-tuple of dictionaries mapping node (i.e., user id) to department EI index. The first
        uses G_directed.neighbors(u) as alters. The second uses the union of successors and
        predecessors and, when weighted, sums the weights of both directions. The value is
        np.nan for a node with no countable ties.
    """
    def _ei(within, without):
        # EI index of the two tallies; undefined (nan) when there are no ties at all
        total = without + within
        return np.nan if total == 0 else (without - within) / total

    node2embed, node2embed_all_neighbors = defaultdict(lambda : None), defaultdict(lambda : None)
    for u in G_directed:
        if u not in usr2department:
            continue

        # Alters reached through outgoing ties only
        inside, outside = 0, 0
        for v in G_directed.neighbors(u):
            if v not in usr2department:
                continue
            add = edges2weights[(u, v)] if weighted else 1
            if usr2department[u] == usr2department[v]:
                inside += add
            else:
                outside += add
        node2embed[u] = _ei(inside, outside)

        # Alters reached through ties in either direction
        inside, outside = 0, 0
        for v in find_all_neighbors(G_directed, u):
            if v not in usr2department:
                continue
            add = (edges2weights.get((u, v), 0) + edges2weights.get((v, u), 0)) if weighted else 1
            if usr2department[u] == usr2department[v]:
                inside += add
            else:
                outside += add
        node2embed_all_neighbors[u] = _ei(inside, outside)

    return (node2embed, node2embed_all_neighbors)

def generate_ego_bridging(G_directed, edges2weights, communities, weighted):
    """
    Generate the community EI index, (external - internal) / (external + internal), of every
    node based on its immediate alters. Two results are returned: one using successors only
    and one using both successors and predecessors.
    The unweighted versions of both measures have already been computed separately in
    corpcorp_ego_bridging.py and corpcorp_union_bridging.py. They are re-computed here so that
    there is a correctly computed weighted version of these measures.
    Parameters
    ----------
    G_directed : NetworkX graph
    edges2weights : dict
        Maps directed edges (u, v) to weights
    communities : iterable of iterables
        Each element holds the nodes of one community
    weighted : bool
        Whether ties are counted by edge weight (True) or as 1 each (False)
    Returns
    -------
    tuple
        A 2-tuple of dictionaries mapping node to community EI index (np.nan for a node with
        no countable ties)
    """
    def _ei(tally):
        # tally[True] holds within-community ties, tally[False] holds ties to other communities
        total = tally[False] + tally[True]
        return np.nan if total == 0 else (tally[False] - tally[True]) / total

    node2embed, node2embed_all_neighbors = defaultdict(lambda : None), defaultdict(lambda : None)
    node2community = {}
    for i, members in enumerate(communities):
        for node in members:
            node2community[node] = i

    for u in G_directed:
        local_community = node2community[u]

        # Successors only
        tally = {True: 0, False: 0}
        for v in G_directed.neighbors(u):
            add = edges2weights[(u, v)] if weighted else 1
            tally[node2community[v] == local_community] += add
        node2embed[u] = _ei(tally)

        # Successors and predecessors; weights of both directions are summed
        tally = {True: 0, False: 0}
        for v in find_all_neighbors(G_directed, u):
            add = (edges2weights.get((u, v), 0) + edges2weights.get((v, u), 0)) if weighted else 1
            tally[node2community[v] == local_community] += add
        node2embed_all_neighbors[u] = _ei(tally)

    return (node2embed, node2embed_all_neighbors)

def generate_global_bridging(G_directed, edges2weights, communities, weighted):
    """
    Generate the degree of global bridging of every node in G_directed. This is the
    E-I index computed over alters' alters: second-step contacts outside ego's
    community minus those inside it, divided by their sum. Ego itself is never
    counted as a second-step contact.

    When weighted, each second-step tie (v, w) contributes its weight scaled by the
    share of ego's total tie weight that goes to the first-step alter v.

    Parameters
    ----------
    G_directed : NetworkX DiGraph
    edges2weights : dict of {tuple : number}
        Maps a directed edge (u, v) to its weight
    communities : iterable of iterables
        Each element holds the nodes of one community
    weighted : bool
        Whether to engage in weighted computations
    Returns
    -------
    tuple of (dict, dict)
        Maps node to global bridging, first following only G_directed.neighbors
        (successors) and second following find_all_neighbors at both steps.
        A node without any counted second-step ties gets np.nan.
    """
    node2embed, node2embed_all_neighbors = defaultdict(lambda : None), defaultdict(lambda : None)
    node2community = {node : i for i, c in enumerate(communities) for node in c}

    def directed_weight(a, b):
        return edges2weights[(a, b)]

    def symmetric_weight(a, b):
        # Sum both directions; a non-existent direction counts as zero weight.
        return edges2weights.get((a, b), 0) + edges2weights.get((b, a), 0)

    def two_step_ei(u, alters, second_hop, tie_weight):
        # Tallies start from zero on every call. The original weighted branch reused
        # the successor-only tallies for the all-neighbors pass, which skewed that score.
        local_community = node2community[u]
        within_cluster, without_cluster = 0, 0
        if weighted:
            uv2weight = {v : tie_weight(u, v) for v in alters}
            total_weight = sum(uv2weight.values())
        for v in alters:
            for w in second_hop(v):
                if u == w:
                    continue
                add = (uv2weight[v] / total_weight) * tie_weight(v, w) if weighted else 1
                if node2community[w] == local_community:
                    within_cluster += add
                else:
                    without_cluster += add
        if (without_cluster + within_cluster) == 0:
            return np.nan
        return (without_cluster - within_cluster) / (without_cluster + within_cluster)

    for u in G_directed:
        # neighbors() may hand back a one-shot iterator, and the weighted path walks the alters twice.
        node2embed[u] = two_step_ei(
            u, list(G_directed.neighbors(u)), G_directed.neighbors, directed_weight)
        node2embed_all_neighbors[u] = two_step_ei(
            u, find_all_neighbors(G_directed, u),
            lambda v: find_all_neighbors(G_directed, v), symmetric_weight)
    return (node2embed, node2embed_all_neighbors)

def generate_community_diversity(G_directed, edges2weights, communities, weighted):
    """
    Score how concentrated each node's alters are across communities, as the sum of
    squared community shares among the alters (so 1.0 means every alter sits in a
    single community and smaller values mean a more even spread).

    If weighted, each alter's community label is included as many times as the tie
    weight. When using bi-directional neighbors, weights across both edges are
    summed up, with a non-existent edge treated as having zero weight.
    A node with no alters gets 0 (the sum over no communities).

    Parameters
    ----------
    G_directed : NetworkX DiGraph
    edges2weights : dict of {tuple : int}
    communities : iterable of iterables
        Each element holds the nodes of one community
    weighted : bool
    Returns
    ------
    tuple of (dict, dict)
        Scores based on G_directed.neighbors and on find_all_neighbors, keyed by node
    """
    def squared_share_sum(labels):
        size = len(labels)
        return sum([(count / size) ** 2 for count in Counter(labels).values()])

    membership = {member : label for label, members in enumerate(communities) for member in members}
    node2herf = defaultdict(lambda : None)
    node2herf_all_neighbors = defaultdict(lambda : None)

    for ego in G_directed:
        if weighted:
            # Repeat each alter's label once per unit of tie weight.
            out_labels = []
            for alter in G_directed.neighbors(ego):
                for _ in range(edges2weights[(ego, alter)]):
                    out_labels.append(membership[alter])
            node2herf[ego] = squared_share_sum(out_labels)

            any_labels = []
            for alter in find_all_neighbors(G_directed, ego):
                combined = edges2weights.get((ego, alter), 0) + edges2weights.get((alter, ego), 0)
                for _ in range(combined):
                    any_labels.append(membership[alter])
            node2herf_all_neighbors[ego] = squared_share_sum(any_labels)
        else:
            out_labels = [membership[alter] for alter in G_directed.neighbors(ego)]
            node2herf[ego] = squared_share_sum(out_labels)

            any_labels = [membership[alter] for alter in find_all_neighbors(G_directed, ego)]
            node2herf_all_neighbors[ego] = squared_share_sum(any_labels)
    return (node2herf, node2herf_all_neighbors)

def generate_network_measures(timekey, edges, test_mode):
    """
    Generating network measures for a given time period using edges.

    Builds a weighted directed graph in which an edge's weight is the number of
    times it occurs in `edges`, detects Leiden communities (once using weights,
    once without), and computes ego bridging, global bridging, community
    diversity and department bridging for every node.

    Parameters
    ----------
    timekey : str
        A string that represents the time period for which network measures are being computed
    edges : list
        A list of directed edges represented by 2-tuples
    test_mode : bool
        Accepted for interface compatibility; not used in the body
    Returns
    -------
    dict
        Maps (node, timekey) to a list of 18 measures. Empty if there are fewer
        than 10 edges or if computing the measures failed.
    """
    import traceback

    if len(edges) < 10:
        sys.stderr.write('Returning empty network at %s with %d edges at %s.\n' % (timekey, len(edges), datetime.now()))
        return dict()

    G_directed = nx.DiGraph()
    sys.stderr.write('Generating weighted network measures for %s with %d edges at %s.\n' % (timekey, len(edges), datetime.now()))
    edges2weights = Counter(edges)
    weighted_edges = [(edge[0], edge[1], weight) for edge, weight in edges2weights.items()]
    G_directed.add_weighted_edges_from(weighted_edges)
    usr_quarter2network_measures = defaultdict(list)

    sys.stderr.write('Computing community-based connectivity at %s.\n' % datetime.now())
    # pre-computing community membership so that the two methods share communities
    try:
        weighted_communities = algorithms.leiden(G_directed, weights='weight').communities
        unweighted_communities = algorithms.leiden(G_directed).communities

        node2ego_bridging_weighted, node2ego_bridging_all_neighbors_weighted = generate_ego_bridging(G_directed, edges2weights, weighted_communities, True)
        node2ego_bridging_unweighted, node2ego_bridging_all_neighbors_unweighted = generate_ego_bridging(G_directed, edges2weights, unweighted_communities, False)

        node2comm_bridging_weighted, node2comm_bridging_all_neighbors_weighted = generate_global_bridging(G_directed, edges2weights, weighted_communities, True)
        node2comm_bridging_unweighted, node2comm_bridging_all_neighbors_unweighted = generate_global_bridging(G_directed, edges2weights, unweighted_communities, False)

        node2comm_diversity_weighted, node2comm_diversity_all_neighbors_weighted = generate_community_diversity(G_directed, edges2weights, weighted_communities, True)
        node2comm_diversity_unweighted, node2comm_diversity_all_neighbors_unweighted = generate_community_diversity(G_directed, edges2weights, unweighted_communities, False)

        sys.stderr.write('Computing department-based bridging at %s.\n' % datetime.now())
        node2depart_bridging_weighted, node2depart_bridging_all_neighbors_weighted = generate_department_bridging(G_directed, quarter2usr2department[timekey], edges2weights, True)
        node2depart_bridging_unweighted, node2depart_bridging_all_neighbors_unweighted = generate_department_bridging(G_directed, quarter2usr2department[timekey], edges2weights, False)

        # Column order here must match the `cols` list used when the rows are turned into a dataframe.
        for n in G_directed:
            row = ([node2ego_bridging_unweighted[n], node2ego_bridging_all_neighbors_unweighted[n], node2comm_bridging_unweighted[n], node2comm_bridging_all_neighbors_unweighted[n], node2comm_diversity_unweighted[n], node2comm_diversity_all_neighbors_unweighted[n], len(unweighted_communities),
            node2ego_bridging_weighted[n], node2ego_bridging_all_neighbors_weighted[n], node2comm_bridging_weighted[n], node2comm_bridging_all_neighbors_weighted[n], node2comm_diversity_weighted[n], node2comm_diversity_all_neighbors_weighted[n], len(weighted_communities),
            node2depart_bridging_unweighted[n], node2depart_bridging_all_neighbors_unweighted[n], node2depart_bridging_weighted[n], node2depart_bridging_all_neighbors_weighted[n]])
            usr_quarter2network_measures[(n, timekey)] = row

    except Exception:
        # Best effort: one failing time period must not abort the others, but the
        # failure has to be attributable. KeyboardInterrupt/SystemExit are no longer swallowed.
        sys.stderr.write('Failed to compute network measures for %s at %s.\n%s\n' % (timekey, datetime.now(), traceback.format_exc()))
    return dict(usr_quarter2network_measures)

def time_edges_to_df(time_edges, test_mode=False):
    """
    Turn per-period edge lists into a dataframe of network measures.

    Parameters
    ----------
    time_edges : dict
        A dictionary that maps quarters (quarters only) to a list of edges belonging to that time period
    test_mode : bool, optional
        If true, only one randomly chosen period (among those with more than 5
        edges) is processed, in the current process. Otherwise every period is
        dispatched to a multiprocessing pool of num_cores workers.
    Returns
    -------
    df : pd.DataFrame
        A dataframe of network measures indexed by 'anon_id' and 'quarter',
        rounded to 5 decimals
    """
    # Order must match the rows produced by generate_network_measures.
    measure_columns = [
        'ego_bridging_unweighted',
        'ego_bridging_all_neighbors_unweighted',
        'global_bridging_unweighted',
        'global_bridging_all_neighbors_unweighted',
        'comm_diversity_unweighted',
        'comm_diversity_all_neighbors_unweighted',
        'n_comm_unweighted',
        'ego_bridging_weighted',
        'ego_bridging_all_neighbors_weighted',
        'global_bridging_weighted',
        'global_bridging_all_neighbors_weighted',
        'comm_diversity_weighted',
        'comm_diversity_all_neighbors_weighted',
        'n_comm_weighted',
        'department_bridging_unweighted',
        'department_bridging_all_neighbors_unweighted',
        'department_bridging_weighted',
        'department_bridging_all_neighbors_weighted',
    ]

    if not test_mode:
        workers = multiprocessing.Pool(processes = num_cores)
        pending = []
        for timekey, edges in time_edges.items():
            pending.append(workers.apply_async(generate_network_measures, args=(timekey, edges, test_mode, )))
        # Wait for every submitted period before collecting results.
        workers.close()
        workers.join()
        network_measures = defaultdict(list)
        for job in pending:
            network_measures.update(job.get())
    else:
        time_edges = {quarter:edges for quarter, edges in time_edges.items() if len(edges) > 5}
        test_timekey = random.choice(list(time_edges))
        sys.stderr.write("Testing timekey %s out of %d time periods.\n" % (test_timekey, len(time_edges)))
        network_measures = generate_network_measures(test_timekey, time_edges[test_timekey], test_mode)

    measures_df = dict_to_df(network_measures, measure_columns, index_name=['anon_id', 'quarter'])
    return measures_df.round(5)

def extract_quarterly_network_measures(corpus_dir, first_batches, test_mode=False):
    """
    Main workhorse function for computing network measures and writing them to file.

    Reads the pre-computed edge file from output_dir (one Python tuple literal per
    line, whose first element is the quarter and whose next two elements form the
    edge), computes the quarterly network measures and writes them as CSV to
    output_dir. File names depend on first_batches and on the module-level
    internal_only flag.

    Parameters
    ----------
    corpus_dir : str
        Location of the email corpus. Not used in the body; kept for interface compatibility.
    first_batches : bool
        If first_batches, read 'edges_b12...' and write 'design_diversity_bridging_b12...';
        otherwise use the '..._all...' files
    test_mode : bool, optional
        If testing, redirect the module-level output_dir to the
        "idtf_output_data_test" folder under home_dir and compute a single network
    Returns
    -------
    pd.DataFrame
        The dataframe that was written to file (previously nothing was returned,
        although the caller binds the result).
    """
    global output_dir
    sys.stderr.write("Computing quarterly edges at %s.\n" % str(datetime.now()))
    if test_mode:
        output_dir = os.path.join(home_dir, "idtf_output_data_test")

    edges_file = 'edges_b12' if first_batches else 'edges_all'
    if internal_only: edges_file += '_internal_corrected'
    edges_file += '.txt'
    edges_file = os.path.join(output_dir, edges_file)

    quarterly_edges = defaultdict(list)
    with open(edges_file, 'r') as f:
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) is not a literal; skip it.
            if not line.strip():
                continue
            tup = literal_eval(line)
            quarterly_edges[tup[0]].append((tup[1], tup[2]))

    network_df_quarterly = time_edges_to_df(quarterly_edges, test_mode)
    network_quarterly_filename = os.path.join(output_dir, "design_diversity_bridging_b12" if first_batches else "design_diversity_bridging_all")
    if internal_only: network_quarterly_filename += '_internal'
    network_quarterly_filename += '.csv'
    network_df_quarterly.to_csv(network_quarterly_filename)
    sys.stderr.write("Finished outputting weighted network measures at %s.\n" % str(datetime.now()))
    return network_df_quarterly

In [None]:
# Build the module-level lookup quarter -> user -> department, which
# generate_network_measures reads (as quarter2usr2department[timekey]) when
# computing department-based bridging.
hr_filepath = os.path.join(home_dir, "datasets", "anonymized_hr_data.csv")
hr_df_quarterly = extract_hr_df(hr_filepath, 'quarter')
# Rows without a department are dropped. The resulting keys are unpacked below as
# (user, quarter) pairs -- presumably the dataframe's two-level index; confirm against extract_hr_df.
usr_quarter2department = hr_df_quarterly['department'].dropna().to_dict()
quarter2usr2department = defaultdict(dict)
for k, department in usr_quarter2department.items():
    u, quarter = k
    # Regroup by quarter so each time period can fetch its own user -> department map.
    quarter2usr2department[quarter][u] = department

In [None]:
# Entry point: run the quarterly network-measure extraction and report its duration.
starttime = datetime.now()
# Test mode is requested by passing "test" (any casing) as the first command-line
# argument; with no argument at all it stays off.
test_mode = len(sys.argv) > 1 and sys.argv[1].lower() == 'test'

sys.stderr.write('Generating Network Measures at %s.\n' % datetime.now())
network_df_quarterly = extract_quarterly_network_measures(corpus_dir, first_batches, test_mode)
sys.stderr.write('Finished Generating Network Measures at %s with a duration of %s.\n' % (datetime.now(), str(datetime.now()-starttime)))