**Company** : <br>
Staffing Firm

**Notebook Function** : <br>
    This notebook generates the local clustering measures
    
**Output File(s)** : <br>
    staffing_network_embedded.csv - The final output file containing the network measure

**Author(s)** : <br>
Lara Yang, Sarayu Anshuman

Import libraries

In [None]:
import os
import sys
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
from mittens import Mittens
import csv
from operator import itemgetter
import ujson as json
import re
from gensim.matutils import cossim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from statistics import mean 
from sklearn.decomposition import PCA
from datetime import datetime
import glob
import random
import logging
import pickle
from operator import itemgetter
from collections import defaultdict, Counter
import networkx as nx
from statistics import mean
from tqdm import tqdm
from ast import literal_eval
import networkx.algorithms.community as nx_comm
from cdlib import algorithms

Run helper functions

In [None]:
#########################################################################
########### Helper Functions for Generating Mittens Embeddings ##########
#########################################################################

#thsi function returns a word, another word, and the value given in the cooccurance matrix based on the weighting function of the occurance of teh second word in the first word's context window
def _window_based_iterator(toks, window_size, weighting_function):
    for i, w in enumerate(toks):
        yield w, w, 1
        left = max([0, i-window_size])
        for x in range(left, i):
            yield w, toks[x],weighting_function(x)
        right = min([i+1+window_size, len(toks)])
        for x in range(i+1, right):
            yield w, toks[x], weighting_function(x)
    return

def glove2dict(glove_filename):
    """
    Load a GloVe-format text file into a word -> vector dictionary.
    Parameters
    ----------
    glove_filename : str
        Path of the file holding the vectors; each line is a word followed by
        its vector values, separated by single spaces
    Returns
    -------
    data : dict
        Maps every word to a numpy array of floats
    """
    data = {}
    with open(glove_filename) as handle:
        # QUOTE_NONE: quote characters in the tokens are ordinary characters
        rows = csv.reader(handle, delimiter=' ', quoting=csv.QUOTE_NONE)
        for fields in rows:
            word = fields[0]
            values = [float(v) for v in fields[1:]]
            data[word] = np.array(values)
    return data

# Inspired by original build_weighted_matrix in utils.py in the Mittens paper source codebase
def build_weighted_matrix(emails,
        mincount=300, vocab_size=None, window_size=10,
        weighting_function=lambda x: 1 / (x + 1),
        email_type='all'):
    """
    Builds a count matrix based on a co-occurrence window of
    `window_size` elements before and `window_size` elements after the
    focal word, where each co-occurrence is weighted by `weighting_function`.
    Parameters
    ----------
    emails : list of dicts
        Emails converted from JSON formats
    mincount : int
        Only words with at least this many tokens in the corpus are included.
    vocab_size : int or None
        If this is an int above 0, then the top `vocab_size` words
        by frequency are included in the matrix, and `mincount`
        is ignored.
    window_size : int
        Size of the window before and after. (So the total window size
        is 2 times this value, with the focal word at the center.)
    weighting_function : function from ints to floats
        Weight of one co-occurrence. It is called by `_window_based_iterator`
        with that helper's per-pair argument; the default is 1 / (x + 1).
    email_type : str, optional
        Specifies which types of emails to include when building embeddings
    Returns
    -------
    X : pd.DataFrame
        Co-occurrence matrix indexed (rows and columns) by the sorted vocabulary.
        An empty dataframe is returned when no email survives the filter.
    """
    # corpus is a list of emails, each email being a list of tokens
    corpus = read_corpus(emails, email_type=email_type, sentence_delim=False)
    # read_corpus returns a (possibly empty) list, never None, so test for emptiness
    if not corpus:
        print("These emails are empty\t{}.\nEmpty corpus returned for email type {}".format(str(emails), email_type))
        return pd.DataFrame()

    # Token frequencies over the whole corpus
    wc = defaultdict(int)
    for toks in corpus:
        for tok in toks:
            wc[tok] += 1

    # Vocabulary: either the `vocab_size` most frequent words or every word seen
    # at least `mincount` times
    if vocab_size:
        srt = sorted(wc.items(), key=itemgetter(1), reverse=True)
        vocab_set = {w for w, _ in srt[:vocab_size]}
    else:
        vocab_set = {w for w, c in wc.items() if c >= mincount}
    vocab = sorted(vocab_set)
    word2idx = {w: i for i, w in enumerate(vocab)}

    # Accumulate weighted counts straight into the matrix. Only pairs that were
    # actually observed are touched; the previous V x V loop over a defaultdict
    # inserted a key for every possible word pair.
    X = np.zeros((len(vocab), len(vocab)))
    for toks in corpus:
        for w, w_c, val in _window_based_iterator(toks, window_size, weighting_function):
            i = word2idx.get(w)
            j = word2idx.get(w_c)
            # both the focal and the context word must be in the vocabulary
            if i is not None and j is not None:
                X[i, j] += val
    return pd.DataFrame(X, columns=vocab, index=pd.Index(vocab))

def read_corpus(emails, email_type, sentence_delim=False):
    """
    Convert emails into a tokenized corpus.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats. The body is read from the
        'hb' key; the 'email_type' key is read only when filtering.
    email_type : str
        Specifies which types of emails to include when building embeddings
        'internal' filters for internal emails ('int'), 'external' filters for
        external and mixed emails ('ext', 'mixed'), and anything else does not filter
    sentence_delim : bool, optional
        If true, co-occurrences across sentence boundaries are ignored.
    Returns
    -------
    list of list of str
        Corpus converted from emails.
        If sentence_delim is false, returns a list of emails, which are represented as lists of tokens
        If sentence_delim is true, returns a list of sentences, which are represented as lists of tokens
    """
    if email_type == 'internal':
        wanted_types = ('int',)
    elif email_type == 'external':
        wanted_types = ('ext', 'mixed')
    else:
        # agnostic to email type
        wanted_types = None

    # The filter looks at each email's own 'email_type' field. The sentence
    # branch used to compare the `email_type` argument itself to 'int', which
    # returned nothing for 'internal' and dropped 'ext' emails for 'external'.
    selected = [email for email in emails
                if wanted_types is None or email['email_type'] in wanted_types]

    # split with no argument splits on whitespaces and newlines
    if not sentence_delim:
        return [email['hb'].replace('\n', ' ').replace("SENT_END", "").strip().split()
                for email in selected]
    # Sentences end at a newline or at a SENT_END marker
    return [sent.strip().split()
            for email in selected
            for line in email['hb'].split('\n')
            for sent in line.split('SENT_END')
            if len(sent) > 0]

def output_embeddings(mittens_df, filename, compress=False):
    """
    Write an embeddings dataframe as space-separated text without a header row.
    Parameters
    ----------
    mittens_df : pd.DataFrame
        Dataframe to write; its index is written as the first field of each line
    filename : str
        Output path. When compress is true, '.gz' is appended to it
    compress : bool, optional
        If true, the file is written gzip-compressed
    """
    csv_options = {'quoting': csv.QUOTE_NONE, 'header': False, 'sep': " "}
    target = filename
    if compress:
        target = filename + '.gz'
        csv_options['compression'] = 'gzip'
    mittens_df.to_csv(target, **csv_options)

#########################################################################
############# Helper Functions for Working with JSON Emails #############
#########################################################################
# Slightly different from get_recipients for spacespace emails
def get_recipients(msg):
    """
    Return the set of recipients of the current message, excluding the sender.
    The 'To', 'Cc' and 'Bcc' fields are concatenated (a missing field counts as
    an empty list). 'From' may be a plain value or a list, in which case its
    first element is taken as the sender.
    Per the original author's note, all fields hold email addresses rather than
    user IDs.
    """
    from_field = msg['From']
    if type(from_field) == list:
        sender = from_field[0]
    else:
        sender = from_field
    to_list = msg.get('To', [])
    cc_list = msg.get('Cc', [])
    bcc_list = msg.get('Bcc', [])
    everyone = set(to_list + cc_list + bcc_list)
    # drop the sender in case they addressed themselves
    return everyone - {sender}

def slice_user_corpus(emails, train_mode):
    """
    Group emails into time periods according to train_mode.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats to dictionaries. The
        timestamp string is read from the 'ActivityCreatedAt' key.
    train_mode : str
        One of 'annual', 'quarterly', 'halfyear', 'all'.
        'all' files every email under its year, its quarter and its half year.
        Any other value produces an empty result.
    Returns
    -------
    timekey2emails : dict
        Maps each time key (year, quarter or half year string) to its emails
    """
    if train_mode == 'annual':
        keyers = (to_year,)
    elif train_mode == 'quarterly':
        keyers = (to_quarter,)
    elif train_mode == 'halfyear':
        keyers = (to_halfyear,)
    elif train_mode == 'all':
        keyers = (to_year, to_quarter, to_halfyear)
    else:
        keyers = ()

    timekey2emails = defaultdict(list)
    for email in emails:
        for keyer in keyers:
            timekey = keyer(email['ActivityCreatedAt'], format='str')
            timekey2emails[timekey].append(email)
    return timekey2emails

#########################################################################
############# Helper Functions for Working with Date Objects ############
#########################################################################

def to_quarter(date, format):
    """
    Return the quarter of date as a string such as '2020Q2'.
    format 'str' reads the year from characters 0-3 and the month from
    characters 5-6 of date; format 'datetime' reads date.year and date.month.
    Any other format falls back to year 0 and month 0, which gives '0Q0'.
    """
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
    quarter = (int(month) - 1) // 3 + 1
    return '{}Q{}'.format(year, quarter)

def to_halfyear(date, format):
    """
    Return the half year of date as a string such as '2020HY1'.
    format 'str' reads the year from characters 0-3 and the month from
    characters 5-6 of date; format 'datetime' reads date.year and date.month.
    Any other format falls back to year 0 and month 0, which gives '0HY0'.
    """
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    # months 1-6 -> 1, months 7-12 -> 2
    halfyear = (int(month) - 1) // 6 + 1
    return '{}HY{}'.format(year, halfyear)

def to_year(date, format):
    """
    Return the year of date as a string.
    format 'str' takes the first four characters of date; format 'datetime'
    converts date.year to a string. Any other format returns None.
    """
    if format == 'datetime':
        return str(date.year)
    if format == 'str':
        return date[0:4]
    return None

def datetime_to_timekey(date, time_key):
    """
    Convert a datetime-like object to a year or quarter time key.
    time_key 'year' delegates to to_year and 'quarter' to to_quarter, both with
    format='datetime'. Any other time_key returns None.
    """
    if time_key == 'quarter':
        return to_quarter(date, format='datetime')
    if time_key == 'year':
        return to_year(date, format='datetime')
    return None

def is_month_before_equal(datetime1, datetime2):
    """
    Return 1 if the (year, month) of datetime1 is earlier than or equal to the
    (year, month) of datetime2, otherwise 0. Days are ignored.
    """
    first = (datetime1.year, datetime1.month)
    second = (datetime2.year, datetime2.month)
    return 1 if first <= second else 0

def num_months_between_dates(datetime1, datetime2):
    """Return the absolute number of calendar months between the two dates (days are ignored)."""
    year_gap = datetime1.year - datetime2.year
    month_gap = year_gap * 12 + datetime1.month - datetime2.month
    return abs(month_gap)

def num_quarters_between_dates(datetime1, datetime2):
    """Return the absolute month difference between the two dates, floor-divided by 3."""
    year_gap = datetime1.year - datetime2.year
    month_gap = abs(year_gap * 12 + datetime1.month - datetime2.month)
    return month_gap // 3

def num_years_between_dates(datetime1, datetime2):
    """Return the absolute difference between the calendar years of the two dates."""
    year_gap = datetime1.year - datetime2.year
    return abs(year_gap)

def time_between_dates(datetime1, datetime2, time_key):
    """
    Return the distance between two dates in the unit named by time_key:
    'monthly', 'quarterly' or 'annual'. Any other time_key returns None.
    """
    if time_key == 'annual':
        return num_years_between_dates(datetime1, datetime2)
    if time_key == 'quarterly':
        return num_quarters_between_dates(datetime1, datetime2)
    if time_key == 'monthly':
        return num_months_between_dates(datetime1, datetime2)
    return None

#########################################################################
############## Helper Functions for Working with Dataframes #############
#########################################################################

# Convert a dictionary of rows into a dataframe with a named (multi-)index
def dict_to_df(index2rows, cols, index_name):
    """
    Parameters
    ----------
    index2rows : dict
        Dictionary mapping index values to the rows to be converted. With more
        than one index name the keys are expected to be tuples.
    cols : list
        List of column names of type str
    index_name : list
        List of index names. One name gives a plain named index, otherwise a
        MultiIndex with these level names is built.
    Returns
    -------
    df : pd.DataFrame
        Constructed dataframe sorted by index, or None if index2rows is None or empty
    """
    if index2rows is None or len(index2rows) == 0:
        return None
    frame = pd.DataFrame.from_dict(index2rows, orient='index', columns=cols)
    if len(index_name) == 1:
        # a single index, e.g. measures over the entire time span of user emails
        frame.index.name = index_name[0]
    else:
        # several index levels, e.g. counts per user and per year or quarter
        levels = pd.MultiIndex.from_tuples(frame.index, names=index_name)
        frame = pd.DataFrame(frame, levels)
    frame.sort_index(axis=0, inplace=True)
    return frame

#########################################################################
########### Helper Functions for Working with Embedding Output ##########
#########################################################################

def remove_empty_embeddings(embeddings_dir):
    """
    Delete every zero-byte entry directly inside embeddings_dir.
    Per the original author's note, such files were produced when the vocab size was 0.
    Parameters
    ----------
    embeddings_dir : str
        Full path to directory where embedding files are located
    """
    entry_paths = (os.path.join(embeddings_dir, name) for name in os.listdir(embeddings_dir))
    for path in entry_paths:
        if os.path.getsize(path) != 0:
            continue
        os.remove(path)

# Not called here: per the original author's note, this function is needed for finetuning and not training
def extract_company_embedding(company_embeddings_filename, tmp_dir, words):
    """
    Look up the company-embedding vectors of a list of words.
    Parameters
    ----------
    company_embeddings_filename : str
        File path of the company embeddings (GloVe text format)
    tmp_dir : str
        Path to the directory for gensim to output its tmp files in order to load embeddings into word2vec format
    words : list
        A list of strings for which to retrieve vectors
    Returns
    -------
    vecs : list
        One entry per word, in order: the word's vector of type numpy.ndarray, or
        np.nan when the word is not in the embeddings (a message is printed)
    """
    tmp_mittens = os.path.join(tmp_dir, "mittens_embeddings_all_word2vec.txt")
    word2vec_mittens_file = get_tmpfile(tmp_mittens)
    # convert the GloVe file to word2vec text format so that gensim can load it
    glove2word2vec(company_embeddings_filename, word2vec_mittens_file)
    model = KeyedVectors.load_word2vec_format(word2vec_mittens_file)
    # gensim >= 4 exposes the vocabulary as `key_to_index` and removed `vocab`
    # (and the `wv` alias on KeyedVectors); older releases only have `vocab`.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab
    vecs = []
    for w in words:
        if w in vocab:
            # plain indexing works on KeyedVectors in both gensim generations
            vecs.append(model[w])
        else:
            vecs.append(np.nan)
            print('%s not in company embeddings' % w)
    return vecs

def word_similarity(model, w1, w2):
    """
    Auxiliary function that compares one word to another word or to multiple words.
    Both arguments are wrapped into lists if needed, words missing from the
    model's vocabulary are dropped, and the remaining lists are passed to
    model.n_similarity. Per the original author's note, n_similarity computes
    the cosine similarity between the mean vectors of the two word lists, which
    for two single words equals similarity(w1, w2).
    Parameters
    ----------
    model : KeyedVectors
        The model that contains all the words and vectors
    w1 : str or list
        The first word or word list to be compared
    w2 : str or list
        The second word or word list to be compared
    Returns
    -------
    float or None
        Cosine similarity between w1 and w2, or None if either side has no
        word left in the vocabulary
    """
    # gensim >= 4 exposes the vocabulary as `key_to_index` and removed `vocab`;
    # older releases only have `vocab`. Support both.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab
    # treat single words as one-element lists
    if not isinstance(w1, list):
        w1 = [w1]
    if not isinstance(w2, list):
        w2 = [w2]
    # keep only in-vocabulary words (new lists; the inputs are not modified)
    w1 = [w for w in w1 if w in vocab]
    w2 = [w for w in w2 if w in vocab]
    if len(w1) == 0 or len(w2) == 0:
        return None
    return model.n_similarity(w1, w2)

def cossim_with_none(vec1, vec2, vec_format='sparse'):
    """
    Cosine similarity that returns None instead of erroring out on null vectors.
    Parameters
    ----------
    vec1 : list of (int, float) in gensim sparse vector format, or a dense vector
    vec2 : list of (int, float) in gensim sparse vector format, or a dense vector
    vec_format : str, optional
        Either 'sparse' or 'dense'. If sparse, the cossim function from gensim is used.
        If dense, the cosine similarity is computed by hand with numpy.
    Returns
    -------
    float or None
        Cosine similarity between vec1 and vec2. None if either vector is null
        (per isnull_wrapper) or, for dense vectors, has length 0.
    Raises
    ------
    ValueError
        If neither vector is null and vec_format is not 'sparse' or 'dense'
    """
    if isnull_wrapper(vec1) or isnull_wrapper(vec2):
        return None
    if vec_format == 'sparse':
        return cossim(vec1, vec2)
    if vec_format != 'dense':
        raise ValueError()
    if len(vec1) == 0 or len(vec2) == 0:
        return None
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator

def calculate_pairwise_cossim(col1, col2=None, reference=False, reference_group=None, anon_ids=None, vec_format='sparse'):
    """
    For every row vector in col1, average its cosine similarity with every row
    vector of col2 at a different position. Without col2, col1 is compared with itself.
    The two columns should have equal length.
    Parameters
    ----------
    col1 : pd.Series
        A column where each row is a word vector
    col2 : pd.Series, optional
        A column where each row is a word vector
    reference : bool, optional
        Indicator variable for whether filtering for reference groups is needed
    reference_group : pd.Series, optional
        If filtering for reference groups, the reference group members of every employee in col1.
        Only entries of type set are used; anything else (e.g. np.nan) matches nobody.
    anon_ids : pd.Series, optional
        If filtering for reference groups, the anon_ids of the rows being compared against
    vec_format : str, optional
        'sparse' uses gensim's cossim; 'dense' computes the similarity with numpy
    Returns
    -------
    results : list
        The ith element is the mean cosine similarity between the ith vector of col1 and
        every eligible vector j != i, or None if the ith vector is null or no
        eligible vector exists.
    Raises
    ------
    ValueError
        If an eligible pair is found and vec_format is not 'sparse' or 'dense'
    """
    vectors1 = col1.tolist()
    vectors2 = col1.tolist() if col2 is None else col2.tolist()
    reference_group = reference_group.tolist() if reference else None
    anon_ids = None if anon_ids is None else anon_ids.tolist()

    results = []
    for i, focal in enumerate(vectors1):
        if isnull_wrapper(focal):
            results.append(None)
            continue
        sims = []
        for j, other in enumerate(vectors2):
            # never compare a position with itself, and skip null vectors
            if i == j or isnull_wrapper(other):
                continue
            # when filtering, j must belong to i's reference group (which must be a set)
            if reference and not (type(reference_group[i]) == set and anon_ids[j] in reference_group[i]):
                continue
            if vec_format == 'sparse':
                sims.append(cossim(focal, other))
            elif vec_format == 'dense':
                sims.append(np.dot(focal, other) / (np.linalg.norm(focal) * np.linalg.norm(other)))
            else:
                raise ValueError()
        results.append(mean(sims) if len(sims) > 0 else None)
    return results

def isnull_wrapper(x):
    """
    Null test that works for scalars and for vectors.
    pd.isnull gives a plain bool for a scalar and an array of flags for a
    list-like; in the latter case x counts as null if any element is null.
    """
    flags = pd.isnull(x)
    return flags if isinstance(flags, bool) else flags.any()

def vector_mean(col):
    """
    Element-wise mean of the row vectors in a column, ignoring missing rows.
    Parameters
    ----------
    col : pd.Series
        The column to be averaged; rows flagged by notna() as missing are skipped
    Returns
    -------
    np.array
        A vector that is the numerical average of all non-missing vectors in col
    """
    present = col[col.notna()]
    stacked = np.array(present.tolist())
    return stacked.mean(axis=0)

def extract_hr_survey_df(survey_hr_file, user_qualtrics_file, users_file, perf_likert_file, perf_percentage_file):
    """
    Loads various datasets to prepare survey and hr data for merging with embeddings df.
    Parameters
    ----------
    survey_hr_file : str
        CSV with one row per employee, keyed by column 'uid'
    user_qualtrics_file : str
        CSV keyed by column 'UID'
    users_file : str
        JSON-lines file; every line holds a 'UserId' and a list of 'Emails'
    perf_likert_file : str
        CSV with columns 'UID', '2019 Perf_Type(Rating)', '2020 Perf_Type(Rating)'
    perf_percentage_file : str
        CSV with columns 'UID', '2019_Perf_Type(Percentage)', '2020_Perf_Type(Percentage)'
    Returns
    -------
    survey_hr_df : pd.DataFrame
        Dataframe indexed by user_id. Only rows whose 'Email' appears in
        users_file are kept. The 'Email' column must come from exactly one of
        the survey or qualtrics files.
    """
    # Performance ratings, reduced to the columns of interest and renamed
    perf_likert_df = pd.read_csv(perf_likert_file)[
        ['UID', '2019 Perf_Type(Rating)', '2020 Perf_Type(Rating)']].copy()
    perf_likert_df.columns = ["UID", "perf_rating_2019", "perf_rating_2020"]
    perf_percentage_df = pd.read_csv(perf_percentage_file)[
        ['UID', '2019_Perf_Type(Percentage)', '2020_Perf_Type(Percentage)']].copy()
    perf_percentage_df.columns = ["UID", "perf_percentage_2019", "perf_percentage_2020"]

    user_qualtrics_df = pd.read_csv(user_qualtrics_file)
    user_qualtrics_df = (user_qualtrics_df
                         .merge(perf_likert_df, on='UID', how='left')
                         .merge(perf_percentage_df, on='UID', how='left'))

    survey_hr_df = pd.read_csv(survey_hr_file)
    survey_hr_df = survey_hr_df.merge(user_qualtrics_df, left_on='uid', right_on='UID', how='left')

    # Map every known email address to its user id
    email2uid = {}
    with open(users_file, encoding='utf-8') as f:
        for line in f:
            user = json.loads(line)
            for e in user['Emails']:
                email2uid[e] = user['UserId']

    # Employees whose emails are not included in the crawled email data are dropped.
    # Take an explicit copy so the column assignment below writes to an
    # independent frame rather than to a filtered view (SettingWithCopy).
    known = survey_hr_df['Email'].isin(list(email2uid))
    survey_hr_df = survey_hr_df[known].copy()
    survey_hr_df['user_id'] = survey_hr_df['Email'].map(email2uid)
    survey_hr_df.set_index('user_id', inplace=True)
    return survey_hr_df

def extract_variables_from_file(file):
    """
    Extract the user and (optional) time key from the name of an embedding file,
    with format: {}(_{})?_(internal|external).txt
    The last four characters (the extension) are dropped and the rest is split
    on '_'. The first chunk is the user; the second chunk is the time key only
    when there are exactly three chunks, otherwise the time key is None.
    """
    stem = file[:-4]
    chunks = stem.split('_')
    time_key = chunks[1] if len(chunks) == 3 else None
    return (chunks[0], time_key)

def project(word, dimension):
    """
    Return the scalar projection of word on dimension, i.e. their dot product
    divided by the norm of dimension. Both arguments are assumed to be vectors.
    """
    dot_product = np.dot(word, dimension)
    dimension_length = np.linalg.norm(dimension)
    return dot_product / dimension_length

def drop(u, v):
    """
    Return the component of u that is orthogonal to v, also known as the
    vector rejection of u from v.
    """
    along_v = v * u.dot(v) / v.dot(v)
    return u - along_v

def remove_frequency(company_embeddings, embedding_dim):
    """
    Demean the company embeddings and remove their first principal component,
    which is hypothesized to represent word frequency.
    Parameters
    ----------
    company_embeddings : dict
        A dictionary mapping words to their embeddings
    embedding_dim : int
        The dimension of word vectors. Used as the number of components that go into PCA.
    Returns
    -------
    new_company_embeddings : dict
        A dictionary mapping words to their embeddings, where embeddings are demeaned and
        the first PCA component is projected out
    """
    vectors = np.array(list(company_embeddings.values()))
    miu = vectors.mean(axis=0)
    pca = PCA(n_components=embedding_dim).fit(vectors - miu)
    frequency_dim = pca.components_[0]
    new_company_embeddings = {}
    for word, vec in company_embeddings.items():
        new_company_embeddings[word] = drop(vec - miu, frequency_dim)
    return new_company_embeddings

def doPCA(words_start, words_end):
    """
    Performs PCA on differences between pairs of words and returns the first component
    Based on function doPCA in Bolukbasi et al. (2016) source code at https://github.com/tolga-b/debiaswe/blob/master/debiaswe/we.py
    Parameter
    ---------
    words_start : list
        List of vectors of the words at one end of the dimension of interest
    words_end: list
        List of vectors of the words at the other end of the dimension,
        paired by position with words_start
    Returns
    -------
    ndarray
        First component of PCA of differences between pairs of words
    """
    matrix = []
    for start_vec, end_vec in zip(words_start[:len(words_end)], words_end[:len(words_start)]):
        # each pair contributes its two offsets from the pair's midpoint
        center = (start_vec + end_vec) / 2
        matrix.append(end_vec - center)
        matrix.append(start_vec - center)
    matrix = np.array(matrix)
    # PCA cannot have more components than samples OR features. The previous
    # value (2 * number of pairs) only respected the sample bound and failed as
    # soon as there were more rows than embedding dimensions.
    num_components = min(matrix.shape)
    pca = PCA(n_components=num_components)
    pca.fit(matrix)
    return pca.components_[0]

def build_dimension(words_start, words_end):
    """
    This method builds a dimension defined by words at separate end of a dimension.
    Multiple methods exist in previous literature when building such a dimension.
    1) Kozlowski et al. (2019) averages across differences between different word pairs, noted to be interchangeable with averaging words on each side of the dimension and
    then taking the difference between averages. They are empirically verified to be identical.
    2) Bolukbasi et al. (2016) defines gender direction using a simple difference between man and woman in the corresponding tutorial. In the same tutorial,
    racial direction is defined as difference between two clusters of words that are each sum of the embeddings of its corresponding dimensions
    normalized by the L2 norm. Wang et al. (2020) note that normalization is unnecessary. If unnormalized, this method should be equivalent to #3.
    3) Bolukbasi et al. (2016) defines gender direction also by taking the differences across multiple pairs, doing PCA on these differences, and
    taking the first component as the gender direction.
    Parameter
    ---------
    words_start : list
        List of vectors of the words at the positive end of the dimension, where positive implies more likely to affect identification positively
    words_end: list
        List of vectors of the words at the other end of dimension
    Returns
    -------
    (mean_dim, pca_dimension) : 2-tuple of numpy vector
        Two vectors that represent the dimension of interest calculated using method #1 and #3.
    Raises
    ------
    ValueError
        If no pair is free of NaN values
    """
    assert len(words_start) == len(words_end)
    # Keep only pairs where both vectors are usable (a missing word is
    # represented by NaN). The same filtered pairs feed BOTH methods; before,
    # NaN pairs were excluded from the mean but still passed on to the PCA.
    valid_pairs = [(np.array(start_vec), np.array(end_vec))
                   for start_vec, end_vec in zip(words_start, words_end)
                   if not np.isnan(start_vec).any() and not np.isnan(end_vec).any()]
    if len(valid_pairs) == 0:
        raise ValueError("no word pair without NaN values to build a dimension from")
    valid_start = [start_vec for start_vec, _ in valid_pairs]
    valid_end = [end_vec for _, end_vec in valid_pairs]

    # method #1: mean of the pairwise differences
    differences = [start_vec - end_vec for start_vec, end_vec in valid_pairs]
    mean_dim = np.array(differences).mean(axis=0)

    # method #3: first principal component of the pairwise offsets
    pca_dim = doPCA(valid_start, valid_end)
    if project(valid_start[0], pca_dim) < 0:
        # convention used in the current script is that words_start should represent the positive dimension
        pca_dim = pca_dim * -1
    return (mean_dim, pca_dim)

#ending here-------------------

Run additional helper functions used specifically to generate the network measures

In [None]:
def node_term_count_distribution(nodes, usr_quarter2liwc, quarter):
    """
    Sum LIWC counts across all nodes for one quarter and convert the totals
    into a distribution.
    Parameters
    ----------
    nodes : list
        List of node names
    usr_quarter2liwc : dict
        Maps (user_id, quarter) tuples to a Counter of all LIWC counts of said user
    quarter : str
        A string that represents the focal quarter
    Returns
    -------
    dict
        The summed counts passed through counts2dist
    """
    totals = {}
    for node in nodes:
        node_counts = usr_quarter2liwc[node, quarter]
        for cat, count in node_counts.items():
            totals[cat] = totals.get(cat, 0) + count
    return counts2dist(totals)

def get_term_count_distribution(all_liwc):
    """
    Sum LIWC counts across all dictionaries in all_liwc and convert the totals
    into a probability distribution.
    Parameter
    ---------
    all_liwc : list of dict of {str : int}
        A list of LIWC categories counts
    Returns
    -------
    dist : dict of {str : float}
        A probability distribution of LIWC categories
    """
    totals = {}
    for liwc in all_liwc:
        for cat, count in liwc.items():
            totals[cat] = totals.get(cat, 0) + count
    return counts2dist(totals)

def counts2dist(countdict):
    """
    Turns a dictionary of counts to a dictionary of probabilities by dividing
    every count by the sum of all counts.
    Parameter
    ---------
    countdict : dict of {str : int}
        A dictionary of counts
    Returns
    -------
    dict of {str : float}
        Same keys, each mapped to its share of the total
    """
    total = float(sum(countdict.values()))
    dist = {}
    for key, val in countdict.items():
        dist[key] = val / total
    return dist

def jensen_shannon(f, g):
    """
    Provides the jensen_shannon distance between two probability distributions f and g. See https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence.
    Parameters
    ----------
    f : dict of {str : float}
        A liwc distribution mapping keys to their probabilities
    g : dict of {str : float}
        A liwc distribution mapping keys to their probabilities
    Returns
    -------
    float or None
        JS distance (square root of the base-2 JS divergence), or None if either
        distribution is None or empty
    """
    if f is None or g is None or len(f) == 0 or len(g) == 0:
        return None
    # Align both distributions on the union of their keys; a missing key has probability 0
    vocab = sorted(set(f.keys()) | set(g.keys()))
    p = np.array([f.get(w, 0.0) for w in vocab], dtype=float)
    q = np.array([g.get(w, 0.0) for w in vocab], dtype=float)
    pq = (p + q) / 2.0
    # Categories with zero mass in BOTH distributions contribute nothing to the
    # divergence, but would produce 0/0 = NaN in the KL terms; drop them.
    support = pq > 0
    p, q, pq = p[support], q[support], pq[support]
    a = 0.5 * kl(p, pq)
    b = 0.5 * kl(q, pq)
    # Rounding can leave the sum a hair below zero for near-identical inputs;
    # clamp so that the square root does not turn it into NaN.
    return np.sqrt(max(a + b, 0.0))

def kl(p, q):
    """
    Provides the Kullback-Leibler divergence (base 2) between two probability distributions p and q
    Parameters
    ----------
    p : np.array
        An array that represents a probability distribution
    q : np.array
        An array that represents a probability distribution, aligned with p
    Returns
    -------
    float
        KL divergence. Only entries where both p and q are positive contribute:
        entries with p == 0 add nothing by convention, and entries with q == 0
        are ignored, matching the previous behaviour of zeroing infinite logs.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # Restricting to the common support avoids 0/0, which used to turn the
    # whole sum into NaN when a category had zero mass in both distributions.
    support = (p > 0) & (q > 0)
    return np.sum(p[support] * np.log2(p[support] / q[support]))

def safelog2(x):
    """
    Provides the base-2 logarithm of x, with infinite results replaced by 0.
    Parameters
    ----------
    x : float or np.array
        Value(s) to take the logarithm of
    Returns
    -------
    float or np.array
        log2(x) with every infinite entry (e.g. log2(0) = -inf) set to 0.0.
        A scalar input gives a scalar, an array input gives a new array.
    """
    # log2(0) is expected here, so silence the divide-by-zero warning
    with np.errstate(divide='ignore'):
        logs = np.log2(x)
    # np.where handles scalars as well as arrays; the previous in-place item
    # assignment failed on the NumPy scalar returned for a float input.
    cleaned = np.where(np.isinf(logs), 0.0, logs)
    # indexing with () unwraps a 0-d result to a scalar and leaves arrays as they are
    return cleaned[()]

def js2cf(dist):
    """
    Turns a JS distance into a cultural fit measure (negative natural log of the distance).
    Parameters
    ----------
    dist : float
        JS distance
    Returns
    -------
    float
        Cultural fit measure, or None when no distance is available
    """
    # A distance of exactly zero is nudged to a tiny positive value so that the
    # logarithm stays finite.
    value = 0.0000000001 if dist == 0 else dist
    if not value:
        return None
    return -np.log(value)

Print current directory

In [None]:
# Record the directory this notebook is running from.
import os
current_dir = os.getcwd()
# Bare expression as the last line of the cell, presumably so the notebook
# displays the path as the cell output.
current_dir

Set file paths

In [None]:
# Run-time switches: number of worker processes and whether to build the
# thresholded network.
num_cores = 16
build_threshold_network = False

# Input locations
home_dir = "/zfs/projects/faculty/amirgo-identification/"
email_dir = os.path.join(home_dir, "email_data/")
users_file = os.path.join(email_dir, 'Users.json')
activity_file = os.path.join(email_dir, 'Activities.json')
user_qualtrics_file = os.path.join(home_dir, "survey_hr_data", "UsersQualtrics.csv")
email_file = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/staffing/training/MessagesHashed.jsonl"

# Output location
output_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/network_files/staffing"

# Filled in by get_quarterly_edges: maps each email address to its UserId
email2user_id = {}

print(email_file)
print(output_dir)

# Company email domains mapped to their hashed form
domain_hash = {
    'collabera.com':                     '509c8f6b1127bceefd418c023533d653',
    'collaberainc.mail.onmicrosoft.com': 'ec5b67548b6ec06f1234d198efec741e',
    'collaberainc.onmicrosoft.com':      '86160680578ee9258f097a67a5f25af9',
    'collaberasp.com':                   '6bf3934d19f1acf5b9295b63e0e7f66e',
    'g-c-i.com':                         '3444d1f7d5e46443080f2d069e41a10c'}
# Hashed domains only, used for membership tests on recipient domains
collabera_hashes = set(domain_hash.values())

# Maximum number of input lines read per file in test mode
lines_to_test = 2000

Run functions to generate network measures

In [None]:
def get_quarterly_edges(test_mode):
    """
    Uses activity_file and users_file to return a list of edges at the quarter level.
    Nodes are named by User Ids and not email addresses, which is why users_file is necessary.
    This file does not constrain measures to target users only as all communications
    to employees with or without survey data should be included in computation

    Only activities whose recipients all belong to a domain in collabera_hashes
    contribute edges. Each such activity yields one edge from the sender to every
    distinct recipient user id other than the sender.
    Parameters
    ----------
    test_mode : bool
        If true, stop reading each input file once the line index exceeds lines_to_test
    Returns
    -------
    edges : dict
        A dictionary mapping quarters to lists of 2-tuples that represent directed edges
    """
    global email2user_id

    def is_internal(address):
        # An address without an '@' has no domain and is treated as external
        # instead of raising IndexError.
        parts = address.split('@')
        return len(parts) > 1 and parts[1] in collabera_hashes

    # Populate the module-level email -> user id lookup.
    with open(users_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if test_mode and i > lines_to_test:
                break
            user = json.loads(line)
            for e in user['Emails']:
                email2user_id[e] = user['UserId']

    edges = defaultdict(list)
    with open(activity_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if test_mode and i > lines_to_test:
                break
            activity = json.loads(line)
            sender_userid = activity["UserId"]
            recipients = get_recipients(activity)
            # Skip any activity that has at least one external recipient.
            if not all(is_internal(r) for r in recipients):
                continue
            # Recipients unknown to users_file are dropped; the sender is never
            # linked to themselves.
            recipients_userids = {email2user_id[r] for r in recipients if r in email2user_id}
            recipients_userids.discard(sender_userid)
            quarter = to_quarter(activity['ActivityCreatedAt'], format='str')
            edges[quarter] += [(sender_userid, r) for r in recipients_userids]

    return edges

def generate_network_embeddedness(G_directed, weighted, edges2weights):
    """
    Computes how embedded each node's local network is, using an E-I style index
    (outside ties - inside ties) / (outside ties + inside ties), where a tie is
    "inside" when it points at one of ego's direct alters. A value of 1 means no
    inside ties, -1 means only inside ties.
    Parameter
    ---------
    G_directed : NetworkX graph
        Directed graph; a node's network is the set of nodes it has an outgoing edge to
    weighted : bool
        Whether to engage in weighted computations
    edges2weights : dict of {tuple : int}
        Maps directed edges to weights
    Returns
    -------
    list of three dicts
        Node-keyed measures at the ego, alter, and alter's alter levels. A node
        with nothing to measure at a level is mapped to np.nan.
    """
    node2embed_ego = defaultdict(lambda : None)
    node2embed_alter = defaultdict(lambda : None)
    node2embed_alter2 = defaultdict(lambda : None)

    def has_no_alters(node):
        return len(list(G_directed.neighbors(node))) == 0

    def tally(node, ego_network, inside=0, outside=0):
        # Adds node's outgoing ties to the running inside/outside totals,
        # depending on whether each tie lands in ego's local network.
        for target in G_directed.neighbors(node):
            amount = edges2weights[node, target] if weighted else 1
            if target in ego_network:
                inside += amount
            else:
                outside += amount
        return inside, outside

    def combine(alter2ei, alter2weight):
        # Collapses per-alter indices into one value: weighted by each alter's
        # share of the outgoing tie weight, or a plain mean when unweighted.
        if weighted:
            total_weight = sum(alter2weight.values())
            share = {alter : weight / total_weight for alter, weight in alter2weight.items()}
            score = 0
            for alter, ei in alter2ei.items():
                score += (share[alter] * ei)
            return score
        return sum(alter2ei.values()) / len(alter2ei)

    # ego level measure: pool the ties of all of ego's alters together
    for ego in G_directed:
        ego_network = set(G_directed.neighbors(ego))
        inside, outside = 0, 0
        # As G is a directed network, ego's network only includes those who ego has sent an email to
        for alter in G_directed.neighbors(ego):
            inside, outside = tally(alter, ego_network, inside, outside)
        if (outside + inside) == 0:
            node2embed_ego[ego] = np.nan
        else:
            node2embed_ego[ego] = (outside - inside) / (outside + inside)

    # alter level measure: one index per alter, then combined
    for ego in G_directed:
        ego_network = set(G_directed.neighbors(ego))
        alter2ei, alter2weight = {}, {}
        for alter in G_directed.neighbors(ego):
            alter2weight[alter] = edges2weights[ego, alter]
            # an alter without alters of its own has no index
            if has_no_alters(alter):
                continue
            inside, outside = tally(alter, ego_network)
            alter2ei[alter] = (outside - inside) / (outside + inside)

        if len(alter2ei) == 0:
            node2embed_alter[ego] = np.nan
        else:
            node2embed_alter[ego] = combine(alter2ei, alter2weight)

    # alter alter level measure: one index per alter's alter, combined per alter,
    # then combined per ego
    for ego in G_directed:
        ego_network = set(G_directed.neighbors(ego))
        alter2ei, alter2weight = {}, {}
        for alter in G_directed.neighbors(ego):
            alter2weight[alter] = edges2weights[ego, alter]
            if has_no_alters(alter):
                continue
            second2ei, second2weight = {}, {}
            for second in G_directed.neighbors(alter):
                second2weight[second] = edges2weights[alter, second]
                if has_no_alters(second):
                    continue
                inside, outside = tally(second, ego_network)
                second2ei[second] = (outside - inside) / (outside + inside)

            if len(second2ei) > 0:
                alter2ei[alter] = combine(second2ei, second2weight)

        if len(alter2ei) == 0:
            node2embed_alter2[ego] = np.nan
        else:
            node2embed_alter2[ego] = combine(alter2ei, alter2weight)

    return [node2embed_ego, node2embed_alter, node2embed_alter2]

def generate_community_ei(G_directed, node2community, weighted, edges2weights):
    """
    Calculate EI index based on community structure.

    The index is (outside ties - inside ties) / (outside ties + inside ties). A tie
    counts as "inside" when its target is one of ego's direct alters or belongs to
    ego's community. The "filter" variants only take into account alters that are in
    the same community as ego.
    Parameter
    ---------
    G_directed : NetworkX graph
        Directed graph; a node's network is the set of nodes it has an outgoing edge to
    node2community : dict of {str : int}
        A dictionary mapping user IDs to integers that represent distinct communities
    weighted : bool
        Whether to engage in weighted computations
    edges2weights : dict of {tuple : int}
        Maps directed edges to weights
    Returns
    -------
    list of six dicts
        Node-keyed measures in the order: ego, ego filtered, alter, alter filtered,
        alter's alter, alter's alter filtered. A node with nothing to measure for
        a given entry is mapped to np.nan.
    """
    (node2embed_ego, node2embed_ego_filter, node2embed_alter,
     node2embed_alter_filter, node2embed_alter2, node2embed_alter2_filter) = [
        defaultdict(lambda : None) for _ in range(6)]

    def has_no_alters(node):
        return len(list(G_directed.neighbors(node))) == 0

    def tally(node, local_network, local_community):
        # Splits node's outgoing ties into within-cluster and without-cluster totals
        # relative to ego's local network and community.
        within, without = 0, 0
        for target in G_directed.neighbors(node):
            add = edges2weights[node, target] if weighted else 1
            if target in local_network or node2community[target] == local_community:
                within += add
            else:
                without += add
        return within, without

    def ei_or_nan(within, without):
        if (without + within) == 0:
            return np.nan
        return (without - within) / (without + within)

    def proportions(alter2weight):
        # Share of the total outgoing tie weight that goes to each alter.
        total = sum(alter2weight.values())
        return {alter : weight / total for alter, weight in alter2weight.items()}

    def combine(alter2ei, alter2prop):
        # np.nan when no alter contributed; otherwise the proportion-weighted sum
        # (weighted) or the plain mean (unweighted) of the per-alter indices.
        if len(alter2ei) == 0:
            return np.nan
        if weighted:
            return sum(alter2prop[alter] * ei for alter, ei in alter2ei.items())
        return sum(alter2ei.values()) / len(alter2ei)

    # ego level measure: pool the ties of all of ego's alters together
    for u in G_directed:
        local_community = node2community[u]
        local_network = set(G_directed.neighbors(u))
        within_cluster, within_cluster_filter, without_cluster, without_cluster_filter = 0, 0, 0, 0
        for v in G_directed.neighbors(u):
            v_within, v_without = tally(v, local_network, local_community)
            within_cluster += v_within
            without_cluster += v_without
            if node2community[v] == local_community:
                within_cluster_filter += v_within
                without_cluster_filter += v_without

        node2embed_ego[u] = ei_or_nan(within_cluster, without_cluster)
        node2embed_ego_filter[u] = ei_or_nan(within_cluster_filter, without_cluster_filter)

    # alter level measure
    for u in G_directed:
        local_community = node2community[u]
        local_network = set(G_directed.neighbors(u))
        v2ei, v2ei_filtered, uv2weight = {}, {}, {}
        # As G is a directed network, u's network only includes those who u has sent an email to
        for v in G_directed.neighbors(u):
            uv2weight[v] = edges2weights[u, v]
            # skip any alter who doesn't have an alter
            if has_no_alters(v):
                continue
            within_cluster, without_cluster = tally(v, local_network, local_community)
            v_ei = (without_cluster - within_cluster) / (without_cluster + within_cluster)
            v2ei[v] = v_ei
            if node2community[v] == local_community:
                v2ei_filtered[v] = v_ei

        # The filtered measure reuses the proportions computed over all alters,
        # not only over same-community alters.
        uv2prop = proportions(uv2weight) if (weighted and v2ei) else {}
        node2embed_alter[u] = combine(v2ei, uv2prop)
        node2embed_alter_filter[u] = combine(v2ei_filtered, uv2prop)

    # alter alter level measure
    for u in G_directed:
        local_network = set(G_directed.neighbors(u))
        local_community = node2community[u]
        v2ei, v2ei_filtered, uv2weight = {}, {}, {}
        for v in G_directed.neighbors(u):
            uv2weight[v] = edges2weights[u, v]
            # skip any alter who doesn't have an alter
            if has_no_alters(v):
                continue
            w2ei, vw2weight = {}, {}
            for w in G_directed.neighbors(v):
                vw2weight[w] = edges2weights[v, w]
                if has_no_alters(w):
                    continue
                within_cluster, without_cluster = tally(w, local_network, local_community)
                w2ei[w] = (without_cluster - within_cluster) / (without_cluster + within_cluster)

            if len(w2ei) > 0:
                v_ei = combine(w2ei, proportions(vw2weight) if weighted else {})
                v2ei[v] = v_ei
                if node2community[v] == local_community:
                    v2ei_filtered[v] = v_ei

        uv2prop = proportions(uv2weight) if (weighted and v2ei) else {}
        node2embed_alter2[u] = combine(v2ei, uv2prop)
        # The filtered value is computed from scratch and stored in its own
        # dictionary. It must neither overwrite the unfiltered measure nor be
        # accumulated on top of it, and its mean is taken over the filtered
        # alters only.
        node2embed_alter2_filter[u] = combine(v2ei_filtered, uv2prop)

    return [node2embed_ego, node2embed_ego_filter, node2embed_alter, node2embed_alter_filter, node2embed_alter2, node2embed_alter2_filter]

def generate_community_embeddedness(G_directed, community_algorithm, weighted, edges2weights):
    """
    Detects communities and measures the degree to which one's local network is embedded
    in one's community or outside of one's community
    Parameter
    ---------
    G_directed : NetworkX DiGraph
    community_algorithm : str
        Indicates type of algorithm to use for community detection; one of
        'cnm', 'leiden' or 'surprise'
    weighted : bool
        Whether to engage in weighted computations
    edges2weights : dict of {tuple : int}
        Maps directed edges to weights
    Returns
    -------
    measures : list
        The measures returned by generate_community_ei followed by the number of
        communities found. When the algorithm is not supported, or leiden/surprise
        raises AmbiguousSolution, a defaultdict yielding np.nan is returned instead.
    """
    sys.stderr.write("Computing communities using {}'s algorithm at {}.\n".format(community_algorithm, datetime.now()))

    if community_algorithm == 'cnm':
        communities = algorithms.greedy_modularity(G_directed, weight='weight').communities
    elif community_algorithm in ('leiden', 'surprise'):
        if community_algorithm == 'leiden':
            detect = algorithms.leiden
        else:
            detect = algorithms.surprise_communities
        try:
            communities = detect(G_directed, weights='weight').communities
        except nx.exception.AmbiguousSolution:
            print('No community found using {} due to AmbiguousSolution error.'.format(community_algorithm))
            return defaultdict(lambda : np.nan)
    else:
        print("Community detection algorithm {} not supported".format(community_algorithm))
        return defaultdict(lambda : np.nan)

    # Label every node with the index of the community that contains it.
    node2community = {}
    for label, members in enumerate(communities):
        for node in members:
            node2community[node] = label

    measures = generate_community_ei(G_directed, node2community, weighted, edges2weights)
    measures.append(len(communities))
    return measures

def compute_threshold(edges2weights):
    """
    Computes, for every node with outgoing edges, the 20th percentile of the
    weights of those outgoing edges
    Parameters
    ----------
    edges2weights : dict of {tuple : int}
        Maps all directed edges to the weight of the edge
    Returns
    -------
    node2threshold : dict of {str : float}
        Maps each source node to its 20th percentile threshold; looking up any
        other node yields None
    """
    # Group edge weights by the edge's source node.
    outgoing = defaultdict(list)
    for edge in edges2weights:
        outgoing[edge[0]].append(edges2weights[edge])

    node2threshold = defaultdict(lambda : None)
    for node in outgoing:
        node2threshold[node] = np.percentile(outgoing[node], 20)
    return node2threshold

def generate_network_measures(timekey, edges, test_mode):
    """
    Builds a weighted directed network from the edges of one time period and
    computes degree and clustering measures for every node
    Parameters
    ----------
    timekey : str
        A string that represents the time period for which network measures are being computed
    edges : list
        A list of directed edges represented by 2-tuples; repeated edges are
        collapsed into a single edge whose weight is the repeat count
    test_mode : bool
        Accepted for the caller's convenience; not used inside this function
    Returns
    -------
    dict
        Maps (node, timekey) to [weighted degree, unweighted degree,
        weighted clustering, unweighted clustering]. Empty when fewer than
        10 edges are supplied.
    """
    n_edges = len(edges)
    if n_edges < 10:
        sys.stderr.write('Returning empty network at %s with %d edges at %s.\n' % (timekey, n_edges, datetime.now()))
        return {}

    G_directed = nx.DiGraph()
    sys.stderr.write('Generating weighted network measures for %s with %d edges at %s.\n' % (timekey, n_edges, datetime.now()))
    edges2weights = Counter(edges)
    weighted_edges = []
    for edge, weight in edges2weights.items():
        weighted_edges.append((edge[0], edge[1], weight))
    G_directed.add_weighted_edges_from(weighted_edges)

    degree_w = G_directed.degree(weight='weight')
    degree_u = G_directed.degree(weight=None)
    clustering_w = nx.clustering(G_directed, weight='weight')
    clustering_u = nx.clustering(G_directed, weight=None)

    return {
        (n, timekey): [degree_w[n], degree_u[n], clustering_w[n], clustering_u[n]]
        for n in G_directed
    }

def time_edges_to_df(time_edges, test_mode=False):
    """
    Calculates network measures using edge lists
    Parameters
    ----------
    time_edges : dict
        A dictionary that maps quarters (quarters only) to a list of edges belonging to that time period
    test_mode : bool, optional
        If true, only generate one network, for a randomly chosen quarter that has
        more than 5 edges. Otherwise all quarters are processed in parallel using
        num_cores worker processes.
    Returns
    -------
    df : pd.DataFrame
        A dataframe of network measures with user id and quarter as index, rounded
        to 5 decimal places
    """
    # Imported locally because the notebook's import cell does not bring in
    # multiprocessing, which the parallel branch below needs.
    import multiprocessing

    if test_mode:
        time_edges = {quarter:edges for quarter, edges in time_edges.items() if len(edges) > 5}
        # NOTE: random.choice raises IndexError when no quarter passes the filter.
        test_timekey = random.choice(list(time_edges))
        sys.stderr.write("Testing timekey %s out of %d time periods.\n" % (test_timekey, len(time_edges)))
        network_measures = generate_network_measures(test_timekey, time_edges[test_timekey], test_mode)
    else:
        # The context manager releases the worker processes even if submitting
        # a task raises.
        with multiprocessing.Pool(processes = num_cores) as pool:
            results = [pool.apply_async(generate_network_measures, args=(timekey, edges, test_mode, )) for timekey, edges in time_edges.items()]
            pool.close()
            pool.join()
        # Merge the per-quarter dictionaries; their keys are (node, timekey) pairs.
        network_measures = defaultdict(list)
        for r in results:
            network_measures.update(r.get())

    cols = ['weighted_degree', 'unweighted_degree', 'weighted_clustering', 'unweighted_clustering']
    df = dict_to_df(network_measures, cols, index_name=['user_id', 'quarter'])
    return df.round(5)

def extract_network_measures(test_mode=False):
    """
    Main workhorse function for computing network measures and writing them to file. Note that
    this function only computes measures quarterly.

    Edges are cached in a text file under output_dir, one (quarter, sender, recipient)
    tuple per line. When that file exists it is read back instead of recomputing the edges.
    Parameters
    ----------
    test_mode : bool, optional
        If testing, use the test variants of the edge cache file and of the final output file
    """
    if test_mode:
        edges_name, file_name = 'edges_test_corrected.txt', 'staffing_network_embedded_test.csv'
    else:
        edges_name, file_name = 'edges_corrected.txt', 'staffing_network_embedded.csv'
    edges_file = os.path.join(output_dir, edges_name)

    if not os.path.exists(edges_file):
        sys.stderr.write("Computing edges at %s.\n" % str(datetime.now()))
        quarterly_edges = get_quarterly_edges(test_mode)
        sys.stderr.write("Writing edges to edge file at %s.\n" % str(datetime.now()))
        with open(edges_file, 'w') as f:
            for quarter, edges in quarterly_edges.items():
                for e in edges:
                    f.write(str((quarter, e[0], e[1]))+'\n')
    else:
        sys.stderr.write("Reading edges from edge file at %s.\n" % str(datetime.now()))
        quarterly_edges = defaultdict(list)
        with open(edges_file, 'r') as f:
            for line in f:
                # each line is the repr of a (quarter, sender, recipient) tuple
                tup = literal_eval(line)
                quarterly_edges[tup[0]].append((tup[1], tup[2]))

    sys.stderr.write("Calculating network measures at %s.\n" % str(datetime.now()))
    df = time_edges_to_df(quarterly_edges, test_mode)
    df.to_csv(os.path.join(output_dir, file_name))

    sys.stderr.write("Finished outputting network measures at %s.\n" % str(datetime.now()))

In [None]:
# Entry point: time the whole run and switch to test mode when the first
# command-line argument is "test" (case-insensitive).
starttime = datetime.now()
test_mode = len(sys.argv) > 1 and sys.argv[1].lower() == 'test'

sys.stderr.write('Generating Network Measures at %s.\n' % datetime.now())
extract_network_measures(test_mode)

sys.stderr.write(
    "Finished running at %s, with a duration of %s.\n"
    % (str(datetime.now()), str(datetime.now() - starttime))
)