**Company** : <br>
Staffing Firm

**Notebook Function** : <br>
    This notebook processes the identification measures

**Output File(s)** : <br>
    embeddings_high_prob_eng_08_quarterly_50d_mincount{}_v2.csv - A file containing the final identification measures for each person-quarter (`08` is the linguistic threshold 0.8 with the dot removed; `{}` is the mincount value)

**Author(s)** : <br>
Lara Yang, Sarayu Anshuman

Install packages and import libraries

In [None]:
pip install -U mittens

In [None]:
pip install -U gensim

In [None]:
import os
import sys
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
from mittens import Mittens
import csv
from operator import itemgetter
import ujson as json
import re
from gensim.matutils import cossim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from statistics import mean 
from sklearn.decomposition import PCA
import multiprocessing
from collections import defaultdict
from gensim.matutils import cossim, any2sparse
import random
from statistics import mean 
import ujson as json

In [None]:
#########################################################################
########### Helper Functions for Generating Mittens Embeddings ##########
#########################################################################

# Generator feeding the co-occurrence matrix: for every focal token it yields
# (focal word, context word, weight) for the token itself and for each
# neighbour inside the context window.
def _window_based_iterator(toks, window_size, weighting_function):
    """
    Yield weighted co-occurrence triples for one token sequence.
    Parameters
    ----------
    toks : list of str
        Tokens of one document, in order.
    window_size : int
        Number of positions looked at on each side of the focal token.
    weighting_function : function
        Called with the index of the context token inside `toks`.
    Yields
    ------
    (str, str, number)
        Focal word, context word and weight. The focal word paired with
        itself is always yielded with weight 1.
    """
    # NOTE(review): the weight is computed from the context token's absolute
    # position in `toks`, not from its distance to the focal token, so it
    # depends on where in the document the pair occurs. Confirm this is intended.
    for pos, focal in enumerate(toks):
        yield focal, focal, 1
        first_left = max(0, pos - window_size)
        end_right = min(pos + 1 + window_size, len(toks))
        neighbours = list(range(first_left, pos)) + list(range(pos + 1, end_right))
        for ctx in neighbours:
            yield focal, toks[ctx], weighting_function(ctx)

def glove2dict(glove_filename):
    """
    Reads word vectors into a dictionary
    Parameters
    ----------
    glove_filename : str
        Name of file that contains vectors. Expected layout is one word per
        line followed by its vector values, all separated by single spaces.
    Returns
    -------
    data : dict
        A dictionary matching words to their vectors (numpy arrays of floats)
    """
    # utf-8 is stated explicitly so decoding does not depend on the machine's
    # locale; newline='' is what the csv module asks for when reading files.
    with open(glove_filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        # csv yields an empty list for a blank line; skip those instead of
        # failing on row[0].
        data = {row[0]: np.array([float(value) for value in row[1:]])
                for row in reader if row}
    return data

# Inspired by original build_weighted_matrix in utils.py in the Mittens paper source codebase
def build_weighted_matrix(emails,
        mincount=300, vocab_size=None, window_size=10,
        weighting_function=lambda x: 1 / (x + 1),
        email_type='internal'):
    """
    Builds a count matrix based on a co-occurrence window of
    `window_size` elements before and `window_size` elements after the
    focal word, where each co-occurrence contributes the weight produced
    by `_window_based_iterator`.
    Parameters
    ----------
    emails : list of dicts
        Emails converted from JSON formats
    mincount : int
        Only words with at least this many occurrences in the corpus are
        included in the vocabulary.
    vocab_size : int or None
        If this is an int above 0, then, the top `vocab_size` words
        by frequency are included in the matrix, and `mincount`
        is ignored.
    window_size : int
        Size of the window before and after. (So the total window size
        is 2 times this value, with the focal word at the center.)
    weighting_function : function from ints to floats
        Passed through to `_window_based_iterator`, which decides what
        argument it is called with.
    email_type : str, optional
        Specifies which types of emails to include when building embeddings
    Returns
    -------
    X : pd.DataFrame
        Co-occurrence matrix with the sorted vocabulary as both index and
        columns. An empty DataFrame is returned when no email survives the
        `email_type` filter.
        NOTE(review): whether X is symmetric depends on the weights yielded
        by `_window_based_iterator` - confirm before relying on it.
    """
    # corpus is a list of emails, each email being a list of tokens
    corpus = read_corpus(emails, email_type=email_type, sentence_delim=False)
    # read_corpus returns a list, so an empty corpus shows up as [] rather than None
    if not corpus:
        print("These emails are empty\t{}.\nEmpty corpus returned for email type {}".format(str(emails), email_type))
        return pd.DataFrame()

    # Token frequencies over the whole corpus
    wc = defaultdict(int)
    for toks in corpus:
        for tok in toks:
            wc[tok] += 1

    # Vocabulary: either the `vocab_size` most frequent words or every word
    # that occurs at least `mincount` times
    if vocab_size:
        srt = sorted(wc.items(), key=itemgetter(1), reverse=True)
        vocab_set = {w for w, c in srt[: vocab_size]}
    else:
        vocab_set = {w for w, c in wc.items() if c >= mincount}
    vocab = sorted(vocab_set)
    n_words = len(vocab)
    word2idx = {w: i for i, w in enumerate(vocab)}

    # Accumulate the weighted counts straight into the matrix. Pairs with a
    # word outside the vocabulary are ignored. Writing by index avoids both a
    # vocabulary-squared Python loop and materialising every word pair in a dict.
    X = np.zeros((n_words, n_words))
    for toks in corpus:
        for w, w_c, val in _window_based_iterator(toks, window_size, weighting_function):
            row = word2idx.get(w)
            col = word2idx.get(w_c)
            if row is not None and col is not None:
                X[row, col] += val
    X = pd.DataFrame(X, columns=vocab, index=pd.Index(vocab))
    return X

def read_corpus(emails, email_type, sentence_delim=False):
    """
    Convert emails into lists of tokens, optionally filtered by email type.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats. The text is read from
        the 'hb' key and the type from the 'email_type' key.
    email_type : str
        Specifies which types of emails to include when building embeddings
        'internal' keeps emails whose 'email_type' is 'int', 'external' keeps
        'ext' and 'mixed' emails, and anything else does not filter
    sentence_delim : bool, optional
        If true, co-occurrences across sentence boundaries are ignored.
    Returns
    -------
    list of list of str
        Corpus converted from emails.
        If sentence_delim is false, returns a list of emails, which are represented as lists of tokens
        If sentence_delim is true, returns a list of sentences, which are represented as lists of tokens
    """
    if email_type == 'internal':
        wanted_types = {'int'}
    elif email_type == 'external':
        wanted_types = {'ext', 'mixed'}
    else:
        # agnostic to email type; 'email_type' is not even read in this case
        wanted_types = None

    # The same per-email filter is applied in both modes. Previously the
    # sentence mode compared the `email_type` argument itself against 'int',
    # which is never true, so it always returned an empty corpus.
    selected = [email for email in emails
                if wanted_types is None or email['email_type'] in wanted_types]

    # split with no argument splits on whitespaces and newlines
    if not sentence_delim:
        return [email['hb'].replace('\n', ' ').replace("SENT_END", "").strip().split()
                for email in selected]
    return [sent.strip().split()
            for email in selected
            for line in email['hb'].split('\n')
            for sent in line.split('SENT_END')
            if len(sent) > 0]

def output_embeddings(mittens_df, filename, compress=False):
    """
    Write embeddings to disk as space-separated text without a header row.
    Parameters
    ----------
    mittens_df : pd.DataFrame
        Dataframe to write; its index is written as the first field of each line
    filename : str
        Destination path. '.gz' is appended when `compress` is true.
    compress : bool, optional
        If true, the file is gzip-compressed
    """
    target = filename + '.gz' if compress else filename
    extra_options = {'compression': 'gzip'} if compress else {}
    mittens_df.to_csv(target, quoting=csv.QUOTE_NONE, header=False, sep=" ", **extra_options)
    return

#########################################################################
############# Helper Functions for Working with JSON Emails #############
#########################################################################
# Slightly different from get_recipients for spacespace emails
def get_recipients(msg):
    """
    Return the set of recipients of `msg`, i.e. everything listed under
    'To', 'Cc' and 'Bcc' (each optional) minus the sender.
    'From' may be either a plain value or a list, in which case its first
    element is taken as the sender.
    """
    sender = msg['From']
    if type(sender) is list:
        sender = sender[0]
    addressed = msg.get('To', []) + msg.get('Cc', []) + msg.get('Bcc', [])
    return set(addressed) - {sender}

def slice_user_corpus(emails, train_mode):
    """
    Group emails by the time period(s) selected through `train_mode`.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats to dictionaries.
        The time period is derived from each email's 'ActivityCreatedAt' string.
    train_mode : str
        One of 'annual', 'quarterly', 'halfyear', 'all'.
        'all' files every email under its year, its quarter and its half year.
        Any other value leaves the result empty.
    Returns
    -------
    timekey2emails : defaultdict(list)
        Matches time keys to the emails that fall into them
    """
    converters_by_mode = {
        'annual': (to_year,),
        'quarterly': (to_quarter,),
        'halfyear': (to_halfyear,),
        'all': (to_year, to_quarter, to_halfyear),
    }
    converters = converters_by_mode.get(train_mode, ())
    timekey2emails = defaultdict(list)
    for email in emails:
        for to_timekey in converters:
            timekey2emails[to_timekey(email['ActivityCreatedAt'], format='str')].append(email)
    return timekey2emails

#########################################################################
############# Helper Functions for Working with Date Objects ############
#########################################################################

def to_quarter(date, format):
    """
    Return the quarter of `date` as a string such as '2020Q3'.
    `format` is 'str' (year at date[0:4], month at date[5:7]) or 'datetime'
    (an object with .year and .month). Any other value gives '0Q0'.
    """
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    quarter = (int(month) - 1) // 3 + 1
    return f"{year}Q{quarter}"

def to_halfyear(date, format):
    """
    Return the half year of `date` as a string such as '2020HY1'.
    `format` is 'str' (year at date[0:4], month at date[5:7]) or 'datetime'
    (an object with .year and .month). Any other value gives '0HY0'.
    """
    if format == 'str':
        year, month = date[0:4], date[5:7]
    elif format == 'datetime':
        year, month = date.year, date.month
    else:
        year, month = 0, 0
    halfyear = (int(month) - 1) // 6 + 1
    return f"{year}HY{halfyear}"

def to_year(date, format):
    """
    Return the year of `date` as a string.
    `format` is 'str' (year at date[0:4]) or 'datetime' (an object with
    .year). Any other value returns None.
    """
    if format == 'datetime':
        return str(date.year)
    if format == 'str':
        return date[0:4]
    return None

def datetime_to_timekey(date, time_key):
    """
    Convert a datetime-like `date` into a year ('year') or quarter ('quarter')
    time key string. Any other `time_key` returns None.
    """
    if time_key == 'quarter':
        return to_quarter(date, format='datetime')
    if time_key == 'year':
        return to_year(date, format='datetime')
    return None

def is_month_before_equal(datetime1, datetime2):
    """
    Return 1 if the (year, month) of datetime1 is earlier than or equal to
    that of datetime2, otherwise 0. Days are ignored.
    """
    first = (datetime1.year, datetime1.month)
    second = (datetime2.year, datetime2.month)
    return 1 if first <= second else 0

def num_months_between_dates(datetime1, datetime2):
    """Return the absolute number of calendar months separating the two dates (days ignored)."""
    year_gap = datetime1.year - datetime2.year
    month_gap = datetime1.month - datetime2.month
    return abs(12 * year_gap + month_gap)

def num_quarters_between_dates(datetime1, datetime2):
    """
    Return the number of whole three-month spans between the two dates:
    the absolute month difference floor-divided by 3.
    """
    months_apart = abs(12 * (datetime1.year - datetime2.year) + (datetime1.month - datetime2.month))
    return months_apart // 3

def num_years_between_dates(datetime1, datetime2):
    """Return the absolute difference between the two dates' calendar years."""
    year_gap = datetime2.year - datetime1.year
    return abs(year_gap)

def time_between_dates(datetime1, datetime2, time_key):
    """
    Return the distance between two dates in the unit named by `time_key`:
    'monthly', 'quarterly' or 'annual'. Any other value returns None.
    """
    counters = {
        'monthly': num_months_between_dates,
        'quarterly': num_quarters_between_dates,
        'annual': num_years_between_dates,
    }
    counter = counters.get(time_key)
    if counter is None:
        return None
    return counter(datetime1, datetime2)

#########################################################################
############## Helper Functions for Working with Dataframes #############
#########################################################################

#function to convert a dictionary to a dataframe
def dict_to_df(index2rows, cols, index_name):
    """
    Build a dataframe, sorted by index, from a dictionary of rows.
    Parameters
    ----------
    index2rows : dict
        Dictionary mapping index values to the rows to be converted.
        Keys are tuples when more than one index level is wanted.
    cols : list
        List of column names of type str
    index_name : list
        List of index level names. A single name produces a plain named
        index; otherwise a MultiIndex is built from the tuple keys.
    Returns
    -------
    df : pd.DataFrame or None
        Constructed dataframe, or None when `index2rows` is None or empty
    """
    if index2rows is None or len(index2rows) == 0:
        return None
    frame = pd.DataFrame.from_dict(index2rows, orient='index', columns=cols)
    if len(index_name) == 1:
        # one row per key, e.g. counts over the entire time span of a user
        frame.index.name = index_name[0]
    else:
        # tuple keys, e.g. counts per user and year/quarter
        levels = pd.MultiIndex.from_tuples(frame.index, names=index_name)
        frame = pd.DataFrame(frame, levels)
    frame.sort_index(axis=0, inplace=True)
    return frame

#########################################################################
########### Helper Functions for Working with Embedding Output ##########
#########################################################################

def remove_empty_embeddings(embeddings_dir):
    """
    Delete every zero-byte entry directly inside `embeddings_dir`
    (such files were produced when vocab size was 0).
    Parameters
    ----------
    embeddings_dir : str
        Full path to directory where embedding files are located
    """
    entries = (os.path.join(embeddings_dir, name) for name in os.listdir(embeddings_dir))
    for path in entries:
        if not os.path.getsize(path):
            os.remove(path)
    return

def extract_company_embedding(company_embeddings_filename, tmp_dir, words):
    """
    Look up the company-level vectors for a list of words.
    Parameters
    ----------
    company_embeddings_filename : str
        File path of the company embeddings (GloVe text format)
    tmp_dir : str
        Path to the directory for gensim to output its tmp files in order to load embeddings into word2vec format
    words : list
        A list of strings for which to retrieve vectors
    Returns
    -------
    vecs : list
        One entry per word, in the same order: the word's vector
        (numpy.ndarray), or np.nan when the word is not in the embeddings.
        A message is printed for every missing word.
    """
    # Convert the GloVe file to word2vec text format so gensim can load it
    word2vec_path = get_tmpfile(os.path.join(tmp_dir, "mittens_embeddings_all_word2vec.txt"))
    glove2word2vec(company_embeddings_filename, word2vec_path)
    model = KeyedVectors.load_word2vec_format(word2vec_path)

    vecs = []
    for w in words:
        if w in model.key_to_index:
            vecs.append(model.vectors[model.key_to_index[w]])
        else:
            vecs.append(np.nan)
            print('%s not in company embeddings' % w)
    return vecs

def word_similarity(model, w1, w2):
    """
    Compare one word (or word list) with another word (or word list).
    Each argument is wrapped into a list if it is not one already, words
    missing from the model are dropped, and what remains is handed to
    `model.n_similarity`. Per the original authors' note, n_similarity takes
    the mean vector of each word set and returns the cosine similarity
    between the two means, which for two single words is the same as
    similarity(w1, w2).
    Parameters
    ----------
    model : KeyedVectors
        The model that contains all the words and vectors
    w1 : str or list
        The first word or word list to be compared
    w2 : str or list
        The second word or word list to be compared
    Returns
    -------
    float or None
        Cosine similarity between w1 and w2; None when either side has no
        word known to the model
    """
    first = w1 if isinstance(w1, list) else [w1]
    second = w2 if isinstance(w2, list) else [w2]
    # keep only the words the model has a vector for
    first = [w for w in first if w in model.key_to_index]
    second = [w for w in second if w in model.key_to_index]
    if not first or not second:
        return None
    return model.n_similarity(first, second)

def cossim_with_none(vec1, vec2, vec_format='sparse'):
    """
    Cosine similarity that returns None instead of erroring out when a vector is missing.
    Parameters
    ----------
    vec1 : list of (int, float) in gensim sparse vector format, or numpy array
    vec2 : list of (int, float) in gensim sparse vector format, or numpy array
    vec_format : str, optional
        Either 'sparse' or 'dense'. If sparse, the gensim cossim function is used.
        If dense, cosine similarity is calculated by hand with numpy.
    Returns
    -------
    float or None
        Cosine similarity between vec1 and vec2. None when either vector is
        null according to isnull_wrapper, or when a dense vector is empty.
    Raises
    ------
    ValueError
        If neither vector is null and vec_format is not 'sparse' or 'dense'
    """
    if isnull_wrapper(vec1) or isnull_wrapper(vec2):
        return None
    if vec_format == 'sparse':
        return cossim(vec1, vec2)
    if vec_format != 'dense':
        raise ValueError()
    if len(vec1) == 0 or len(vec2) == 0:
        return None
    dot_product = np.dot(vec1, vec2)
    return dot_product / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def calculate_pairwise_cossim(col1, col2=None, reference=False, reference_group=None, anon_ids=None, vec_format='sparse'):
    """
    For every row vector in col1, average its cosine similarity with the row vectors of col2
    (or of col1 itself when no col2 is given). The vector at the same position is always skipped,
    also when col2 is provided, and null vectors are ignored. The two columns should have equal length.
    Parameters
    ----------
    col1 : pd.Series
        A column where each row is a word vector (gensim BoW format for 'sparse', numpy array for 'dense').
    col2 : pd.Series, optional
        A column in the same format as col1.
    reference : bool, optional
        Indicator variable for whether filtering for reference groups is needed
    reference_group : pd.Series, optional
        If filtering for reference groups, the reference group members for every employee in col1.
        Entries that are not of type set (e.g. np.nan) match nobody.
    anon_ids : pd.Series, optional
        If filtering for reference groups, the anon_ids of the employees the vectors are compared against
    vec_format : str, optional
        'sparse' or 'dense'; anything else raises ValueError once a pair is actually compared
    Returns
    -------
    results : list
        The ith element is the mean cosine similarity between the ith vector in col1 and every eligible
        vector j != i, or None when the ith vector is null or no vector was eligible.
    """
    focal_vectors = col1.tolist()
    other_vectors = col1.tolist() if col2 is None else col2.tolist()
    reference_group = reference_group.tolist() if reference else None
    anon_ids = None if anon_ids is None else anon_ids.tolist()

    results = []
    for i, focal in enumerate(focal_vectors):
        if isnull_wrapper(focal):
            results.append(None)
            continue
        sims = []
        for j, other in enumerate(other_vectors):
            if i == j or isnull_wrapper(other):
                continue
            # when filtering, keep j only if it belongs to i's reference group
            if reference and not (type(reference_group[i]) == set and anon_ids[j] in reference_group[i]):
                continue
            if vec_format == 'sparse':
                sims.append(cossim(focal, other))
            elif vec_format == 'dense':
                sims.append(np.dot(focal, other)/(np.linalg.norm(focal) * np.linalg.norm(other)))
            else:
                raise ValueError()
        results.append(mean(sims) if len(sims) > 0 else None)
    return results

def isnull_wrapper(x):
    """
    Null check that works for scalars and for collections: returns pd.isnull(x)
    directly when that is a plain bool, otherwise whether any element is null.
    """
    flags = pd.isnull(x)
    return flags if type(flags) is bool else flags.any()

def vector_mean(col):
    """
    Average the row vectors of a column, ignoring rows that are NA
    Parameters
    ----------
    col : pd.Series
        The column to be averaged
    Returns
    -------
    np.array
        A vector that is the numerical average of all non-NA vectors in col
    """
    present = col[col.notna()]
    stacked = np.array(present.tolist())
    return stacked.mean(axis=0)

def extract_hr_survey_df(survey_hr_file, user_qualtrics_file, users_file, perf_likert_file, perf_percentage_file):
    """
    Loads various datasets to prepare survey and hr data for merging with embeddings df.
    Parameters
    ----------
    survey_hr_file : str
        CSV with one row per employee; must contain a 'uid' column
    user_qualtrics_file : str
        CSV with a 'UID' column, left-merged onto the survey data
    users_file : str
        Text file with one JSON object per line, each having 'Emails' (list) and 'UserId'
    perf_likert_file : str
        CSV with 'UID', '2019 Perf_Type(Rating)' and '2020 Perf_Type(Rating)' columns
    perf_percentage_file : str
        CSV with 'UID', '2019_Perf_Type(Percentage)' and '2020_Perf_Type(Percentage)' columns
    Returns
    -------
    survey_hr_df : pd.DataFrame
        Dataframe indexed by user_id. Only rows whose 'Email' appears in
        `users_file` are kept.
    """
    perf_likert_df = pd.read_csv(perf_likert_file)[
        ['UID', '2019 Perf_Type(Rating)', '2020 Perf_Type(Rating)']
    ].rename(columns={
        '2019 Perf_Type(Rating)': 'perf_rating_2019',
        '2020 Perf_Type(Rating)': 'perf_rating_2020',
    })
    perf_percentage_df = pd.read_csv(perf_percentage_file)[
        ['UID', '2019_Perf_Type(Percentage)', '2020_Perf_Type(Percentage)']
    ].rename(columns={
        '2019_Perf_Type(Percentage)': 'perf_percentage_2019',
        '2020_Perf_Type(Percentage)': 'perf_percentage_2020',
    })

    user_qualtrics_df = pd.read_csv(user_qualtrics_file)
    user_qualtrics_df = user_qualtrics_df.merge(perf_likert_df, on='UID', how='left').merge(perf_percentage_df, on="UID", how='left')

    survey_hr_df = pd.read_csv(survey_hr_file)
    survey_hr_df = survey_hr_df.merge(user_qualtrics_df, left_on='uid', right_on='UID', how='left')

    # Map every known email address to its user id
    email2uid = {}
    with open(users_file, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            user = json.loads(line)
            for e in user['Emails']:
                email2uid[e] = user['UserId']

    # we lose two employees whose emails are not included in the crawled email data
    # .copy() makes the filtered frame independent, so adding a column below
    # is not a write on a slice of the merged frame
    survey_hr_df = survey_hr_df[survey_hr_df['Email'].isin(list(email2uid))].copy()
    survey_hr_df['user_id'] = survey_hr_df['Email'].map(email2uid)
    survey_hr_df.set_index('user_id', inplace=True)
    return survey_hr_df

def extract_variables_from_file(file):
    """
    Extract relevant information from name of embedding file, with format: {}(_{})?_(internal|external).txt
    Returns (user id, time key); the time key (e.g. 2020, 2020Q1, 2020HY1) is
    None unless the name splits into exactly three '_'-separated parts.
    """
    stem = file[0:-4]  # drop the 4-character extension
    parts = stem.split('_')
    time_key = None
    if len(parts) == 3:
        time_key = parts[1]
    return (parts[0], time_key)

def project(word, dimension):
    """
    Returns the scalar projection of word on dimension (dot product divided by
    the norm of dimension). Word and dimension are both assumed to be vectors
    """
    dimension_length = np.linalg.norm(dimension)
    return np.dot(word, dimension) / dimension_length

def drop(u, v):
    """
    Returns the component in u that is orthogonal to v, also known as the vector rejection of u from v
    """
    along_v = v * u.dot(v) / v.dot(v)
    return u - along_v

def remove_frequency(company_embeddings, embedding_dim):
    """
    Remove frequency dimension from company embeddings (assumed to be the top PCA dimension)
    Parameters
    ----------
    company_embeddings : dict
        A dictionary mapping words to their embeddings
    embedding_dim : int
        The dimension of word vectors. Used as the number of components that go into PCA.
    Returns
    -------
    new_company_embeddings : dict
        A dictionary mapping words to their embeddings, where embeddings are demeaned and the
        first PCA component, hypothesized to represent the frequency dimension, is removed
    """
    vectors = np.array(list(company_embeddings.values()))
    centroid = np.mean(vectors, axis=0)
    pca = PCA(n_components = embedding_dim)
    pca.fit(vectors - centroid)
    frequency_dim = pca.components_[0]
    new_company_embeddings = {}
    for word, vec in company_embeddings.items():
        new_company_embeddings[word] = drop(vec - centroid, frequency_dim)
    return new_company_embeddings

def doPCA(words_start, words_end):
    """
    Performs PCA on differences between pairs of words and returns the first component
    Based on function doPCA in Bolukbasi et al. (2016) source code at https://github.com/tolga-b/debiaswe/blob/master/debiaswe/we.py
    Parameter
    ---------
    words_start : list
        List of word vectors at one end of interested dimension
    words_end: list
        List of word vectors at the other end of dimension, paired by position with words_start
    Returns
    -------
    ndarray
        First component of PCA of differences between pairs of words
    """
    matrix = []
    for i, start in enumerate(words_start):
        start = np.asarray(start, dtype=float)
        end = np.asarray(words_end[i], dtype=float)
        # each pair contributes its two offsets from the pair's midpoint
        center = (start + end)/2
        matrix.append(end - center)
        matrix.append(start - center)
    matrix = np.array(matrix)
    # PCA cannot have more components than samples (two per pair) nor more
    # than features (the embedding dimension); with few pairs this is still
    # 2 * len(words_start), as before.
    num_components = min(matrix.shape)
    pca = PCA(n_components = num_components)
    pca.fit(matrix)
    return pca.components_[0]

def build_dimension(words_start, words_end):
    """
    This method builds a dimension defined by words at separate end of a dimension.
    Multiple methods exist in previous literature when building such a dimension.
    1) Kozlowski et al. (2019) averages across differences between different word pairs, noted to be interchangeable with averaging words on each side of the dimension and
    then taking the difference between averages. They are empirically verified to be identical.
    2) Bolukbasi et al. (2016) defines gender direction using a simple difference between man and woman in the corresponding tutorial. In the same tutorial,
    racial direction is defined as difference between two clusters of words that are each sum of the embeddings of its corresponding dimensions
    normalized by the L2 norm. Wang et al. (2020) note that normalization is unnecessary. If unnormalized, this method should be equivalent to #3.
    3) Bolukbasi et al. (2016) defines gender direction also by taking the differences across multiple pairs, doing PCA on these differences, and
    taking the first component as the gender direction.
    Parameter
    ---------
    words_start : list
        List of word vectors at the positive end of the dimension, where positive implies more likely to affect identification positively.
        Entries containing NaN (words without an embedding) are allowed.
    words_end: list
        List of word vectors at the other end of dimension, paired by position with words_start
    Returns
    -------
    (mean_dim, pca_dimension) : 2-tuple of numpy vector
        Two vector that represents the dimension of interest calculated using method #1 and #3.
    Raises
    ------
    ValueError
        If no pair has a vector on both ends
    """
    assert len(words_start) == len(words_end)
    # Keep only pairs where both words have an embedding. The same filtered
    # pairs feed the mean, the PCA and the orientation check; previously only
    # the mean was filtered, so a single missing word passed NaN into PCA.
    valid_pairs = [(np.array(start), np.array(end))
                   for start, end in zip(words_start, words_end)
                   if not np.isnan(start).any() and not np.isnan(end).any()]
    if not valid_pairs:
        raise ValueError("no word pair has embeddings on both ends of the dimension")
    starts = [start for start, _ in valid_pairs]
    ends = [end for _, end in valid_pairs]

    differences = [start - end for start, end in valid_pairs]
    mean_dim = np.array(differences).mean(axis=0)
    pca_dim = doPCA(starts, ends)
    if project(starts[0], pca_dim) < 0:
        # convention used in the current script is that words_start should represent the positive dimension
        pca_dim = pca_dim * -1
    return (mean_dim, pca_dim)

#ending here-------------------

Print current directory

In [None]:
import os
# Working directory of the notebook; tmp_dir and output_dir below are built relative to it.
current_dir = os.getcwd()
# Bare expression so the notebook displays the path as the cell output.
current_dir

Set hyperparameters and input and output directories

In [None]:
# Set hyperparameters. All three values are embedded in the output file names below.
embedding_dim = 50
mincount = 50
ling_thres = 0.8  # rendered as "08" in the file names (the dot is stripped)

# Set file paths (absolute paths on the project file system)
home_dir = "/zfs/projects/faculty/amirgo-identification/"
email_dir = os.path.join(home_dir, "email_data/")
mittens_dir = os.path.join(home_dir, "mittens")
utils_dir = os.path.join(mittens_dir, "utils")
# Note: there are two copies of the embeddings folder; use this copy
# (embeddings_high_prob_eng_08_50d_mincount50_v2), not embeddings_high_prob_eng_08_50d_mincount50
embeddings_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/staffing/fine-tuning/embeddings_high_prob_eng_08_50d_mincount50_v2"

# Raw email data
email_file = os.path.join(email_dir, 'MessagesHashed.jsonl')
users_file = os.path.join(email_dir, 'Users.json')
activity_file = os.path.join(email_dir, 'Activities.json')
# Survey / HR data
survey_dir = os.path.join(home_dir, "survey_hr_data")
user_qualtrics_file = os.path.join(survey_dir, "UsersQualtrics.csv")
# Other additional survey files
perf_percentage = os.path.join(survey_dir, "perf_rating_percentages.csv")
perf_likert = os.path.join(survey_dir, "perf_rating_likert.csv")

analyses_data_dir = "/zfs/projects/faculty/amirgo-transfer/spacespace/spacespace/staffing/analyses_data/"
survey_filename = os.path.join(analyses_data_dir, "preprocessed_survey_hr.csv")
company_embeddings_filename = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/staffing/training/vectors_high_prob_eng_08_50d.txt"

# The mincount value is hard-coded in these two directory names; change them
# by hand whenever `mincount` above changes.
tmp_dir = os.path.join(current_dir, "tmp_mincount50_v2")
output_dir = os.path.join(current_dir, "staffing_email_idtf_data_mincount50_v2")

# The main output files with the similarity measures. The measures are created at the
# person, person-year, person-quarter, and person-halfyear levels.
user_output_filename = os.path.join(output_dir, "embeddings_high_prob_eng_{}_users_{}d_mincount{}_v2.csv".format(str(ling_thres).replace(".", ""), embedding_dim, mincount))
annual_output_filename = os.path.join(output_dir, "embeddings_high_prob_eng_{}_annual_{}d_mincount{}_v2.csv".format(str(ling_thres).replace(".", ""), embedding_dim, mincount))
quarterly_output_filename = os.path.join(output_dir, "embeddings_high_prob_eng_{}_quarterly_{}d_mincount{}_v2.csv".format(str(ling_thres).replace(".", ""), embedding_dim, mincount))
halfyearly_output_filename = os.path.join(output_dir, "embeddings_high_prob_eng_{}_halfyearly_{}d_mincount{}_v2.csv".format(str(ling_thres).replace(".", ""), embedding_dim, mincount))

# Echo the resolved paths so they can be checked before the long-running cells start
print(embeddings_dir)
print(company_embeddings_filename)
print(tmp_dir)
print(user_output_filename)
print(annual_output_filename)
print(quarterly_output_filename)
print(halfyearly_output_filename)

In [None]:
# Constants describing the anonymised pronoun tokens and the company's hashed email domains

# Names of the time-period index levels used by the panel dataframes
year_colname = 'year'
quarter_colname = 'quarter'
halfyear_colname = 'halfyear'

# Token as it appears in the corpus and embedding files -> the pronoun it stands for
hash2word = {
    '09f83385': 'mine',
    '20019fa4': 'i',
    '20b60145': 'us',
    '28969cb1': 'them',
    '3828d3d2': 'me',
    '4dd6d391': 'their',
    '5b4e27db': 'my',
    '64a505fc': 'ourselves',
    '6935bb23': 'ours',
    '6f75419e': 'myself',
    '86df0c8d': 'themselves',
    'a7383e72': 'we',
    'a9193217': 'theirs',
    'b72a9dd7': 'our',
    'fd0ccf1c': 'they',
}
# Reverse lookup: pronoun -> corpus token
word2hash = dict(zip(hash2word.values(), hash2word.keys()))

# First-person pronouns: the five i-words first, then the five matching we-words in the same order
pronouns = ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves']
single_pronouns = ['i', 'we']
# Positions in `pronouns` where the i-words and the we-words start
i_index = pronouns.index('i')
we_index = pronouns.index('we')
hash_pronouns = list(map(word2hash.__getitem__, pronouns))
hash_single_pronouns = list(map(word2hash.__getitem__, single_pronouns))

file_name_re = re.compile("5([a-z0-9]+)_(2020(Q3)?_)?internal.txt") #all the generated email slices
num_cores = 10 #note yen1 has only 12 cores

# Company email domain -> hash; recipients are matched against the hash values
domain_hash = {
    'collabera.com':                     '509c8f6b1127bceefd418c023533d653',
    'collaberainc.mail.onmicrosoft.com': 'ec5b67548b6ec06f1234d198efec741e',
    'collaberainc.onmicrosoft.com':      '86160680578ee9258f097a67a5f25af9',
    'collaberasp.com':                   '6bf3934d19f1acf5b9295b63e0e7f66e',
    'g-c-i.com':                         '3444d1f7d5e46443080f2d069e41a10c',
}
collabera_hashes = set(domain_hash.values())

Run the functions to generate the identification measure

In [None]:
#########################################################################
######### Functions for Loading Raw Counts as Control Variables #########
#########################################################################

def read_raw_counts(activity_file, email_file):
    """
    Tally token (word) counts and message counts per user, to be used as control variables.

    An email is counted only if its body is non-empty, it is labelled English ("__label__en")
    with a confidence above ``ling_thres``, and the domain part of every recipient address
    is one of ``collabera_hashes`` (i.e. the message is purely internal).

    Parameters
    ----------
    activity_file : str
        The full filepath that contains all email metadata, where each line is a JSON object that represents one email
    email_file : str
        The full filepath that contains all email content, where each line is a JSON object that represents one email
    Returns
    -------
    tuple
        A 4-tuple of dataframes with columns ``num_tokens`` and ``num_messages``, indexed by
        user, (user, year), (user, quarter) and (user, halfyear) respectively
    """
    cols = ['num_tokens', 'num_messages']
    tok_slot, msg_slot = 0, 1
    # one [token_count, message_count] accumulator per aggregation level
    per_user, per_year, per_quarter, per_halfyear = (defaultdict(lambda : [0, 0]) for _ in range(4))

    # metadata lookup: message id -> activity record (a later duplicate id replaces an earlier one)
    with open(activity_file, encoding='utf-8') as f:
        sid2activity = {record['MailSummarySid']: record for record in map(json.loads, f)}

    with open(email_file, encoding='utf-8') as f:
        for line in f:
            email = json.loads(line)
            lang = email['l']
            is_confident_english = len(email['hb']) > 0 and lang[0] == "__label__en" and lang[1] > ling_thres
            if not is_confident_english:
                continue
            activity = sid2activity[email['sid']]
            user = activity['UserId']
            sent_at = activity['ActivityCreatedAt']
            year = to_year(sent_at, format='str')
            quarter = to_quarter(sent_at, format='str')
            halfyear = to_halfyear(sent_at, format='str')
            # number of whitespace-separated tokens once newlines and the SENT_END marker are removed
            num_toks = len(email['hb'].replace('\n', ' ').replace("SENT_END", "").strip().split())
            # skip the message as soon as one recipient lies outside the company domains
            if any(r.split('@')[1] not in collabera_hashes for r in get_recipients(activity)):
                continue
            # only purely internal messages reach this point
            for counter, key in ((per_user, user),
                                 (per_year, (user, year)),
                                 (per_quarter, (user, quarter)),
                                 (per_halfyear, (user, halfyear))):
                entry = counter[key]
                entry[tok_slot] += num_toks
                entry[msg_slot] += 1

    # one dataframe per aggregation level (dict_to_df is a project helper)
    usr2counts_df = dict_to_df(per_user, cols, index_name=['user_id'])
    usr_year2counts_df = dict_to_df(per_year, cols, index_name=['user_id', year_colname])
    usr_quarter2counts_df = dict_to_df(per_quarter, cols, index_name=['user_id', quarter_colname])
    usr_halfyear2counts_df = dict_to_df(per_halfyear, cols, index_name=['user_id', halfyear_colname])
    return (usr2counts_df, usr_year2counts_df, usr_quarter2counts_df, usr_halfyear2counts_df)

#########################################################################
#### Functions for Measuring Within-Person Similarities in Embeddings ###
#########################################################################

def embeddings_similarities(model):
    """
    Returns the embeddings of i, we, centroid of i-words, centroid of we-words, and their respective cosine similarities
    Parameters
    ----------
    model : gensim.models.KeyedVectors
        Model that stores the embeddings for each (hashed) word
    Returns
    -------
    embeds : list
        Always 11 entries, in the column order used by reading_embeddings:
        [i, we, i_we, me_us, my_our, mine_ours, myself_ourselves, i_cluster, we_cluster, i_we_cluster, i_we_symmetric].
        Embeddings and centroids are None when the relevant words are missing from the model.
        The last entry is None when no (i-word, we-word) pair is fully present in the model;
        previously it was omitted in that case, giving rows of inconsistent length.
    """
    vocab = model.key_to_index
    i_hashes = hash_pronouns[i_index:we_index]
    we_hashes = hash_pronouns[we_index:]

    def vector_or_none(key):
        # embedding of `key`, or None when the word never made it into this model
        return model[key] if key in vocab else None

    def centroid_or_none(keys):
        # mean embedding over the words of `keys` that exist in the model
        vectors = [model[key] for key in keys if key in vocab]
        return np.mean(vectors, axis=0) if vectors else None

    i_embed = vector_or_none(hash_single_pronouns[0])
    we_embed = vector_or_none(hash_single_pronouns[1])
    i_cluster = centroid_or_none(i_hashes)
    we_cluster = centroid_or_none(we_hashes)

    # word-by-word similarities between matching pronouns (i/we, me/us, my/our, ...); word_similarity is a project helper
    paired_similarities = [word_similarity(model, i_word, we_word) for i_word, we_word in zip(i_hashes, we_hashes)]
    embeds = ([i_embed, we_embed] + paired_similarities +
        [i_cluster, we_cluster, word_similarity(model, i_hashes, we_hashes)])

    # "symmetric" similarity: only keep pronoun pairs for which BOTH members are in the vocabulary,
    # so that the two centroids compared by n_similarity are built from matching words
    symmetric_pairs = [(i_word, we_word) for i_word, we_word in zip(i_hashes, we_hashes)
                       if i_word in vocab and we_word in vocab]
    if symmetric_pairs:
        symmetric_i_words = [pair[0] for pair in symmetric_pairs]
        symmetric_we_words = [pair[1] for pair in symmetric_pairs]
        symmetric_similarity = model.n_similarity(symmetric_i_words, symmetric_we_words)
    else:
        # n_similarity cannot be called with empty lists; keep the slot so the row length stays fixed
        symmetric_similarity = None
    embeds.append(symmetric_similarity)
    return embeds


def process_single_embedding_file(i, num_files, embeddings_dir, file):
    """
    Convert one GloVe-format embedding file to word2vec format, load it, and compute its similarity measures.

    Parameters
    ----------
    i : int
        Position of this file in the work list; progress is reported for every 100th file
    num_files : int
        Total number of files to process, used in the progress message
    embeddings_dir : str
        Directory in which embedding files reside
    file : str
        Name of the embedding file to open and process
    Returns
    -------
    embeds : list or None
        The list produced by embeddings_similarities, or None if anything failed
        (the failure is reported on stderr instead of being raised).
    """
    source_path = os.path.join(embeddings_dir, file)
    if i % 100 == 0:
        sys.stderr.write("Processing \t%d/%d -'%s', at %s.\n" % (i, num_files, source_path, datetime.now()))
        print('searching the directory')
        print(embeddings_dir)
    # drop the last four characters of the file name (presumably the ".txt" extension)
    base_name = file[:-4]
    converted_path = os.path.join(tmp_dir, base_name + "_word2vec.txt")
    try:
        # the word2vec-format copy is written under tmp_dir, then loaded with gensim
        word2vec_path = get_tmpfile(converted_path)
        glove2word2vec(source_path, word2vec_path)
        keyed_vectors = KeyedVectors.load_word2vec_format(word2vec_path)
        return embeddings_similarities(keyed_vectors)
    except Exception as err:
        # best effort: one unreadable file must not abort the whole batch
        sys.stderr.write('File %s caused an error: %s.\n' % (source_path, str(err)))
    return None
   
#within-person similarities: individual's i embedding to we embedding
def self_similarities(files, num_files, embeddings_dir):
    """
    Main workhorse function for calculating within-person similarities (an individual's i-embedding
    versus their we-embedding, both for the single words and for the centroids of i-words and we-words).
    Every file is processed by process_single_embedding_file in a worker process; the result is filed
    according to the length of the time key extracted from the file name.

    Parameters
    ----------
    files : list of str
        Embedding files to process
    num_files : int
        Total number of files to process, used to keep track of progress
    embeddings_dir : str
        Directory in which embedding files reside
    Return
    ------
    tuple
        4-tuple of dictionaries (user, user-year, user-quarter, user-halfyear) mapping the user and
        optional time key to the row of within-person embeddings and similarities
    """
    usr2distances = defaultdict(list)
    usr_year2distances = defaultdict(list)
    usr_quarter2distances = defaultdict(list)
    usr_halfyear2distances = defaultdict(list)
    # length of the time key -> dictionary that collects rows of that granularity
    bucket_by_key_length = {
        4: usr_year2distances,      #ex. 2020
        6: usr_quarter2distances,   #ex. 2020Q1
        7: usr_halfyear2distances,  #ex. 2020HY1
    }

    # fan the files out over a pool of worker processes
    pool = multiprocessing.Pool(processes = num_cores)
    pending = {}
    for position, file in enumerate(files, 1): #start indexing from 1
        usr, time_key = extract_variables_from_file(file)
        pending[(usr, time_key)] = pool.apply_async(process_single_embedding_file, args=(position, num_files, embeddings_dir, file, ))
    pool.close()
    pool.join()

    for (usr, time_key), handle in pending.items():
        row = handle.get()
        if not row:
            # nothing came back for this file (the worker reported an error)
            continue
        if not time_key:
            # no time key: the file covers the user's whole corpus
            usr2distances[(usr)] = row
            continue
        bucket = bucket_by_key_length.get(len(time_key))
        if bucket is None:
            sys.stderr.write('Embedding file format does not conform to expectations. Extracted time key %s for user %s.\n' % (time_key, usr)) 
        else:
            bucket[(usr, time_key)] = row
    return (usr2distances, usr_year2distances, usr_quarter2distances,usr_halfyear2distances)

#########################################################################
### Functions for Measuring Between-Person Similarities in Embeddings ###
#########################################################################
def pairwise_cossim(df, i_col, we_col, reference_tag='', reference=False, reference_group=None, anon_ids=None, vec_format='sparse'):
    """
    Add two columns to df, 'avg_i_i' + reference_tag and 'avg_we_we' + reference_tag, holding the
    result of calculate_pairwise_cossim applied to the i_col and we_col columns respectively.
    The remaining arguments are forwarded unchanged to calculate_pairwise_cossim.
    """
    forwarded = dict(reference=reference, reference_group=reference_group, anon_ids=anon_ids, vec_format=vec_format)
    for prefix, source_col in (('avg_i_i', i_col), ('avg_we_we', we_col)):
        df[prefix + reference_tag] = calculate_pairwise_cossim(df[source_col], **forwarded)
    return df

def sparsify(df, to_sparse, sparse):
    """
    For each column named in to_sparse, store a gensim sparse version of its vectors in the
    column at the same position of `sparse`. Entries that are None or contain a non-finite
    value (nan/inf) become None. The dataframe is modified in place and returned.
    """
    assert len(to_sparse) == len(sparse)

    def sparse_or_none(vec):
        if vec is None or not np.isfinite(vec).all():
            return None
        return any2sparse(vec)

    for source_col, target_col in zip(to_sparse, sparse):
        df[target_col] = df[source_col].apply(sparse_or_none)
    return df
    
def cossimify(df, names, vec_format='sparse'):
    """
    Append row-wise similarity columns computed with cossim_with_none.

    Parameters
    ----------
    df : pd.DataFrame
    names : list
        A list of tuples (first column name, second column name, result column name)
    vec_format : str, optional
        Passed through to cossim_with_none
    Returns
    -------
    df : pd.DataFrame
        The same dataframe with one result column added (or overwritten) per tuple
    """
    for spec in names:
        left_col, right_col = spec[0], spec[1]
        similarities = df.apply(
            lambda row, a=left_col, b=right_col: cossim_with_none(row[a], row[b], vec_format),
            axis=1)
        df[spec[2]] = similarities
    return df

#between-person similarities: individual's embeddings to the average embeddings of everyone else in the company
def self_other_similarities(df, company_embeddings, company_cluster_embeddings, panel_data):
    """
    Main workhorse function for calculating between-person similarities. Compares an individual's embeddings to
    the average embeddings of everyone else in the company, as well as GloVe embeddings built on the entire company's corpus.
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame produced by self_similarities that includes both embeddings and embedding similarities at the individual level.
        Must contain the columns 'i_embed', 'we_embed', 'i_cluster' and 'we_cluster'; when panel_data is True the
        second index level is used as the time key.
    company_embeddings : 2-tuple of numpy array
        Company embeddings of "i" and "we". Only positions 0 and 1 are read.
        NOTE(review): the caller in this file appears to pass a list ordered like hash_pronouns, in which
        position 1 would be "me" rather than "we" -- confirm what is actually passed.
    company_cluster_embeddings : 2-tuple of numpy array
        Company embeddings of centroid of "i" words and centroid of "we" words
    panel_data : bool
        Indicates whether df is in a panel data format or a cross-sectional format. If in panel format,
        grouping by timekey is needed before averaging.
    Returns
    -------
    df : pd.DataFrame or None
        Dataframe that includes both within- and between-person similarities, rounded to 5 decimals.
        None is returned when df is None or empty.
    """
    if df is None or df.empty:
        return
    # Company-wide reference vectors in gensim's sparse format: index 0 is used as "i", index 1 as "we"
    sparse_company_embeddings = [any2sparse(company_embeddings[0]), any2sparse(company_embeddings[1])]
    sparse_company_cluster_embeddings = [any2sparse(company_cluster_embeddings[0]), any2sparse(company_cluster_embeddings[1])]

    # any2sparse converts a numpy or scipy vector into the sparse vector format used by gensim
    # (plain Python lists are not accepted by it, per the original author's note).
    # A sparse vector is a list of (index, value) pairs in which the index refers to the position
    # in the original dense vector; zero entries are left out to save space.
    # Missing embeddings (as judged by the project helper isnull_wrapper) stay None.
    df['i_sparse'] = df['i_embed'].apply(lambda x : any2sparse(x) if not isnull_wrapper(x) else None)
    df['we_sparse'] = df['we_embed'].apply(lambda x: any2sparse(x) if not isnull_wrapper(x) else None)
    df['i_cluster_sparse'] = df['i_cluster'].apply(lambda x : any2sparse(x) if not isnull_wrapper(x) else None)
    df['we_cluster_sparse'] = df['we_cluster'].apply(lambda x : any2sparse(x) if not isnull_wrapper(x) else None)

    if not panel_data:
        # Cross-sectional data: one average vector over all rows, repeated on every row
        i_mean = vector_mean(df['i_embed'])
        df['i_embed_avg'] = df.apply(lambda x : i_mean, axis=1)
        we_mean = vector_mean(df['we_embed'])
        df['we_embed_avg'] = df.apply(lambda x : we_mean, axis=1)
        # i_mean / we_mean are reused below for the cluster (centroid) columns
        i_mean = vector_mean(df['i_cluster'])
        df['i_cluster_avg'] = df.apply(lambda x : i_mean, axis=1)
        we_mean = vector_mean(df['we_cluster'])
        df['we_cluster_avg'] = df.apply(lambda x : we_mean, axis=1)
        # adds avg_i_i / avg_we_we and avg_i_i_cluster / avg_we_we_cluster
        df = pairwise_cossim(df, 'i_sparse', 'we_sparse', vec_format='sparse')
        df = pairwise_cossim(df, 'i_cluster_sparse', 'we_cluster_sparse', reference_tag='_cluster', vec_format='sparse')
    else:
        # Panel data: the averages are computed per time period (second index level) and joined back;
        # the joined series has the same name as an existing column, so it receives the '_avg' suffix
        # (giving i_embed_avg, we_embed_avg, i_cluster_avg, we_cluster_avg).
        #  If i_embed or we_embed is not defined for any anon_id during this period,
        # then vector_mean will return np.nan
        df = df.join(df['i_embed'].groupby(level=1).apply(vector_mean), rsuffix='_avg') 
        df = df.join(df['we_embed'].groupby(level=1).apply(vector_mean), rsuffix='_avg')
        df = df.join(df['i_cluster'].groupby(level=1).apply(vector_mean), rsuffix='_avg') 
        df = df.join(df['we_cluster'].groupby(level=1).apply(vector_mean), rsuffix='_avg')
        # Pairwise similarities are computed separately within each time period and the pieces re-stacked
        new_df = pd.DataFrame()
        for time_chunk, time_df in df.groupby(level=1):
            time_df = pairwise_cossim(time_df, 'i_sparse', 'we_sparse', vec_format='sparse')
            time_df = pairwise_cossim(time_df, 'i_cluster_sparse', 'we_cluster_sparse', reference_tag='_cluster', vec_format='sparse')
            # NOTE(review): DataFrame._append is a private pandas method and may change without notice;
            # collecting the pieces and calling pd.concat once is the public alternative -- check that
            # the resulting index (names/levels) is unchanged before switching.
            new_df = new_df._append(time_df)
        df = new_df
    # Sparse versions of the average vectors; sparsify turns None and non-finite entries
    # (e.g. np.nan, which has class float) into None
    df = sparsify(df, ['i_embed_avg', 'we_embed_avg', 'i_cluster_avg', 'we_cluster_avg'],
        ['i_avg_sparse', 'we_avg_sparse', 'i_cluster_avg_sparse', 'we_cluster_avg_sparse'])
    
    # Individual vs. average-of-everyone similarities.
    # These averages are already defined based on time periods when we are running using panel data due to the inherent structure of the data
    df = cossimify(df,
        [('i_sparse', 'i_avg_sparse', 'i_avg_i'), ('we_sparse', 'we_avg_sparse', 'we_avg_we'),
        ('i_cluster_sparse', 'i_cluster_avg_sparse', 'i_avg_i_cluster'), ('we_cluster_sparse', 'we_cluster_avg_sparse', 'we_avg_we_cluster'),
        ('i_sparse', 'we_avg_sparse', 'i_avg_we'), ('i_cluster_sparse', 'we_cluster_avg_sparse', 'i_avg_we_cluster')], vec_format='sparse')

    # Individual vs. company-corpus embedding similarities (index 0 = i, index 1 = we of the company vectors)
    df['i_company_i'] = df.apply(lambda row : cossim_with_none(row['i_sparse'], sparse_company_embeddings[0], vec_format='sparse'), axis=1) 
    df['we_company_we'] = df.apply(lambda row : cossim_with_none(row['we_sparse'], sparse_company_embeddings[1], vec_format='sparse'), axis=1) 
    df['i_company_i_cluster'] = df.apply(lambda row : cossim_with_none(row['i_cluster_sparse'], sparse_company_cluster_embeddings[0], vec_format='sparse'), axis=1) 
    df['we_company_we_cluster'] = df.apply(lambda row : cossim_with_none(row['we_cluster_sparse'], sparse_company_cluster_embeddings[1], vec_format='sparse'), axis=1) 
    df['i_company_we'] = df.apply(lambda row : cossim_with_none(row['i_sparse'], sparse_company_embeddings[1], vec_format='sparse'), axis=1) 
    df['i_company_we_cluster'] = df.apply(lambda row : cossim_with_none(row['i_cluster_sparse'], sparse_company_cluster_embeddings[1], vec_format='sparse'), axis=1) 
    return df.round(5)

def compare_internal_external(df):
    """
    Add the similarities between a person's internal "i" vectors and their external "we" vectors
    (individual and average, single word and cluster), then drop every intermediate embedding and
    sparse-vector column for both the '_internal' and '_external' suffix so that only similarity
    measures remain. Expects the suffixed columns produced by joining the internal and external results.
    """
    cross_comparisons = [
        ('i_sparse_internal', 'we_sparse_external', 'i_int_we_ext'),
        ('i_cluster_sparse_internal', 'we_cluster_sparse_external', 'i_int_we_ext_cluster'),
        ('i_sparse_internal', 'we_avg_sparse_external', 'i_int_we_avg_ext'),
        ('i_cluster_sparse_internal', 'we_cluster_avg_sparse_external', 'i_int_we_avg_ext_cluster'),
    ]
    df = cossimify(df, cross_comparisons, vec_format='sparse')

    # vector-valued helper columns that should not end up in the output
    vector_columns = [
        'i_embed', 'we_embed', 'i_sparse', 'we_sparse',
        'i_embed_avg', 'we_embed_avg', 'i_avg_sparse', 'we_avg_sparse',
        'i_cluster', 'we_cluster', 'i_cluster_sparse', 'we_cluster_sparse',
        'i_cluster_avg', 'we_cluster_avg', 'i_cluster_avg_sparse', 'we_cluster_avg_sparse',
    ]
    for suffix in ('_internal', '_external'):
        df.drop([name + suffix for name in vector_columns], axis=1, inplace=True)
    return df

#main entry point: reads the generated embedding files and computes all within- and between-person similarity measures
def reading_embeddings(embeddings_dir, company_embeddings, company_cluster_embeddings, test_mode=False):
    """
    Calculates embedding similarities within-person and between-person, separately for the
    internal and the external embedding files, and joins the two sets of results.

    Parameters
    ----------
    embeddings_dir : str
        Directory where all embedding files exist (GloVe text format)
    company_embeddings : tuple of numpy array
        Embeddings of "i" and "we" in the whole company email corpus
    company_cluster_embeddings : tuple of numpy array
        Embeddings of average of all "i" words and average of all "we" words in the whole company email corpus
    test_mode : bool, optional
        If testing, only a random sample (with replacement) of about 1/50 of the files is processed
    Returns
    -------
    tuple
        User, annual, quarterly and half-yearly dataframes that include both within- and between-person embedding similarities
    """
    all_files = os.listdir(embeddings_dir)
    if test_mode:
        sample_size = len(all_files)//50
        all_files = [all_files[random.randint(0, len(all_files)-1)] for _ in range(sample_size)]

    internal_re = re.compile(".+_internal.txt")
    external_re = re.compile(".+_external.txt")

    # split the file names into internal and external email slices; anything else is ignored
    internal_files, external_files = [], []
    for name in all_files:
        if internal_re.match(name):
            internal_files.append(name)
        elif external_re.match(name):
            external_files.append(name)
    print(len(internal_files))
    print(len(external_files))

    cols = ['i_embed', 'we_embed', 'i_we', 'me_us', 'my_our', 'mine_ours', 'myself_ourselves', 'i_cluster', 'we_cluster', 'i_we_cluster', 'i_we_symmetric']
    index_names = (['user_id'], ['user_id', year_colname], ['user_id', quarter_colname], ['user_id', halfyear_colname])
    # only the user-level frame is cross-sectional; the other three are panels
    panel_flags = (False, True, True, True)

    result_dfs = []
    # iteration 1 handles the internal files, iteration 2 the external files
    for iteration, files in enumerate((internal_files, external_files), 1):
        num_files = len(files)
        sys.stderr.write('Iteration %d: Calculate within-person similarities for %d files at %s.\n' % (iteration, num_files, str(datetime.now())))
        usr2distances, usr_year2distances, usr_quarter2distances, usr_halfyear2distances = self_similarities(files, num_files, embeddings_dir)
        print('reached here')
        distance_dicts = (usr2distances, usr_year2distances, usr_quarter2distances, usr_halfyear2distances)
        distance_frames = [dict_to_df(distances, cols, index_name=names) for distances, names in zip(distance_dicts, index_names)]

        sys.stderr.write('Iteration %d: Calculate between-person similarities for %d files at %s.\n' % (iteration, num_files, str(datetime.now())))
        # one worker task per aggregation level
        pool = multiprocessing.Pool(processes = num_cores)
        pending = [pool.apply_async(self_other_similarities, args=(frame, company_embeddings, company_cluster_embeddings, is_panel,))
            for frame, is_panel in zip(distance_frames, panel_flags)]
        pool.close()
        pool.join()
        result_dfs.append([task.get() for task in pending])
        sys.stderr.write('Iteration %d: Successfully read and computed cosine similarities for %d embedding files at %s.\n' % (iteration, num_files, str(datetime.now())))  

    # result_dfs is always a list of lists here (one inner list per iteration), so report it as a 2D structure
    print('dimensions of result------')
    print("Dimensions: {} rows, {} columns".format(len(result_dfs), len(result_dfs[0])))

    # join internal and external results level by level, then compare internal against external
    internal_results, external_results = result_dfs[0], result_dfs[1]
    joined = [internal_results[level].join(external_results[level], lsuffix='_internal', rsuffix='_external', how='outer')
        for level in range(4)]
    usr_df, usr_year_df, usr_quarter_df, usr_halfyear_df = [compare_internal_external(frame) for frame in joined]
    return (usr_df, usr_year_df, usr_quarter_df, usr_halfyear_df)

if __name__ == '__main__':
    # Script entry point: loads the control counts, the company-level and per-person embeddings,
    # merges everything with the HR/survey data and writes one CSV per aggregation level.
    starttime = datetime.now()
    # Passing "test" as the first command-line argument runs on a random sample of the embedding files
    test = len(sys.argv) > 1 and sys.argv[1].lower() == 'test'
    if test:
        user_output_filename = os.path.join(output_dir, "test_embeddings_users.csv")
        annual_output_filename = os.path.join(output_dir, "test_embeddings_annual.csv")
        quarterly_output_filename = os.path.join(output_dir, "test_embeddings_quarterly.csv")
        # the half-yearly file must be redirected too, otherwise a test run overwrites the real output
        halfyearly_output_filename = os.path.join(output_dir, "test_embeddings_halfyearly.csv")
    for d in [output_dir, tmp_dir]:
        # makedirs also creates missing parent directories and tolerates an existing directory
        os.makedirs(d, exist_ok=True)

    #note the start time
    sys.stderr.write('Loading corpus counts at %s.\n' % datetime.now())
    # four dataframes (user, annual, quarterly, half-yearly) with the two control variables: token (word) count and email count
    usr2counts, usr2annual_counts, usr2quarterly_counts, usr2halfyearly_counts = read_raw_counts(activity_file, email_file) 
    sys.stderr.write('Reading embeddings at %s.\n' % datetime.now())
    # company-wide embeddings of the pronouns, requested in the order of hash_pronouns; the converted file is stored in tmp_dir
    company_embeddings = extract_company_embedding(company_embeddings_filename, tmp_dir, hash_pronouns)
    # centroid of the company-wide i-word embeddings and centroid of the we-word embeddings
    company_cluster_embeddings = (vector_mean(pd.Series(company_embeddings[i_index:we_index])), vector_mean(pd.Series(company_embeddings[we_index:])))
    # self_other_similarities reads positions 0 and 1 of its company_embeddings argument as "i" and "we".
    # In the full pronoun list position 1 is "me", so pass the two single-word embeddings explicitly.
    company_i_we_embeddings = (company_embeddings[i_index], company_embeddings[we_index])
    #the read embeddings function is the main function
    usr2measures, usr2annual_measures, usr2quarterly_measures, usr2halfyearly_measures = reading_embeddings(embeddings_dir, company_i_we_embeddings, company_cluster_embeddings, test)
    
    # different embedding files should be matched with the same hr file as hr data is not in panel format
    sys.stderr.write('Reading HR and Survey data at %s.\n' % datetime.now())
    hr_df = extract_hr_survey_df(survey_filename, user_qualtrics_file, users_file, perf_likert, perf_percentage)

    # merge HR data, similarity measures and control counts, one CSV per aggregation level
    sys.stderr.write('Outputting dataframe at %s.\n' % datetime.now())
    if usr2measures is not None: hr_df.join(usr2measures).join(usr2counts).to_csv(user_output_filename)
    if usr2annual_measures is not None: hr_df.join(usr2annual_measures).join(usr2annual_counts).to_csv(annual_output_filename)
    if usr2quarterly_measures is not None: hr_df.join(usr2quarterly_measures).join(usr2quarterly_counts).to_csv(quarterly_output_filename)
    if usr2halfyearly_measures is not None: hr_df.join(usr2halfyearly_measures).join(usr2halfyearly_counts).to_csv(halfyearly_output_filename)
    
    sys.stderr.write("Finished outputting measures at %s, with a duration of %s.\n"
        % (str(datetime.now()), str(datetime.now() - starttime)))