**Company** : <br>
Tech Firm

**Notebook Function** : <br>
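    This notebook computes the identification measures (the cosine similarity between each person's "i" and "we" embeddings) together with raw email-count control variables for each person-quarter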

**Output File(s)** : <br>
     embeddings_quarterly_50d_mincount50.csv - A file containing the final identification measures for each person-quarter

**Author(s)** : <br>
Lara Yang, Sarayu Anshuman

Install some libraries

In [None]:
pip install -U mittens

In [None]:
pip install -U gensim

In [None]:
pip install ujson

Define the helper functions used to calculate the cosine similarity between the "i" and "we" embedding vectors

In [None]:
import os
import sys
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
from mittens import Mittens
import csv
from operator import itemgetter
import ujson as json
import re
from gensim.matutils import cossim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from statistics import mean 
from sklearn.decomposition import PCA

#########################################################################
########### Helper Functions for Generating Mittens Embeddings ##########
#########################################################################
def _window_based_iterator(toks, window_size, weighting_function):
    for i, w in enumerate(toks):
        yield w, w, 1
        left = max([0, i-window_size])
        for x in range(left, i):
            yield w, toks[x],weighting_function(x)
        right = min([i+1+window_size, len(toks)])
        for x in range(i+1, right):
            yield w, toks[x], weighting_function(x)
    return

def glove2dict(glove_filename):
    """
    Reads word vectors into a dictionary
    Parameters
    ----------
    glove_filename : str
        Name of a space-delimited text file with one word per line followed
        by its vector components
    Returns
    -------
    data : dict
        A dictionary matching words to their vectors (numpy arrays of floats)
    """
    data = {}
    # Decode explicitly as UTF-8 so tokens with non-ASCII characters are read
    # the same way on every platform; newline='' is what the csv module
    # expects from the file object it is given.
    with open(glove_filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in reader:
            # csv yields an empty list for blank lines; skip them instead of
            # failing on row[0]
            if not row:
                continue
            data[row[0]] = np.array(list(map(float, row[1:])))
    return data

# Inspired by original build_weighted_matrix in utils.py in the Mittens paper source codebase
def build_weighted_matrix(emails,
        mincount=300, vocab_size=None, window_size=10,
        weighting_function=lambda x: 1 / (x + 1)):
    """
    Builds a count matrix based on a co-occurrence window of
    `window_size` elements before and `window_size` elements after the
    focal word, where each co-occurrence is weighted by `weighting_function`.
    Parameters
    ----------
    emails : list of dicts
        Emails converted from JSON formats
    mincount : int
        Only words with at least this many tokens will be included.
    vocab_size : int or None
        If this is an int above 0, then, the top `vocab_size` words
        by frequency are included in the matrix, and `mincount`
        is ignored.
    window_size : int
        Size of the window before and after. (So the total window size
        is 2 times this value, with the focal word at the center.)
    weighting_function : function from ints to floats
        How to weight each co-occurrence. It is forwarded unchanged to
        `_window_based_iterator`, which decides what integer it receives.
    Returns
    -------
    X : pd.DataFrame
        Square co-occurrence matrix whose rows and columns are the sorted
        vocabulary. An empty DataFrame is returned if no corpus is available.
        NOTE(review): symmetry of X depends on how `_window_based_iterator`
        weights the two directions of a pair - confirm before relying on it.
    """
    corpus = read_corpus(emails, sentence_delim=False)
    if corpus is None:
        print("These emails are empty\t{}.\n".format(str(emails)))
        return pd.DataFrame()

    # Raw token frequencies, used only to select the vocabulary
    wc = defaultdict(int)
    for toks in corpus:
        for tok in toks:
            wc[tok] += 1
    if vocab_size:
        srt = sorted(wc.items(), key=itemgetter(1), reverse=True)
        vocab_set = {w for w, c in srt[: vocab_size]}
    else:
        vocab_set = {w for w, c in wc.items() if c >= mincount}
    vocab = sorted(vocab_set)
    n_words = len(vocab)

    # Weighted counts, restricted to pairs where both words are in-vocabulary
    counts = defaultdict(float)
    for toks in corpus:
        for w, w_c, val in _window_based_iterator(toks, window_size, weighting_function):
            if w in vocab_set and w_c in vocab_set:
                counts[(w, w_c)] += val

    # Fill the dense matrix from the observed pairs only. Looking up every
    # (w1, w2) combination in the defaultdict would insert a zero entry for
    # each unseen pair and grow `counts` to n_words ** 2 keys.
    position = {w: i for i, w in enumerate(vocab)}
    X = np.zeros((n_words, n_words))
    for (w, w_c), val in counts.items():
        X[position[w], position[w_c]] = val
    return pd.DataFrame(X, columns=vocab, index=pd.Index(vocab))

def read_corpus(emails, sentence_delim=False):
    """
    Tokenizes email bodies on whitespace.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats; only the 'body' field is read
    sentence_delim : bool, optional
        If true, each sentence (delimited by newlines and SENT_END markers)
        becomes its own token list, so co-occurrences across sentence
        boundaries are ignored.
    Returns
    -------
    list of list of str
        If sentence_delim is false, one token list per email with SENT_END markers removed
        If sentence_delim is true, one token list per non-empty sentence
    """
    if sentence_delim:
        sentences = []
        for email in emails:
            for line in email['body'].split('\n'):
                for sent in line.split('SENT_END'):
                    # Only zero-length fragments are dropped; a whitespace-only
                    # fragment still produces an (empty) token list
                    if len(sent) > 0:
                        sentences.append(sent.strip().split())
        return sentences

    documents = []
    for email in emails:
        flattened = email['body'].replace('\n', ' ').replace("SENT_END", "")
        documents.append(flattened.strip().split())
    return documents

def output_embeddings(mittens_df, filename, compress=False):
    """
    Writes embeddings to disk as space-separated text without a header row.
    Parameters
    ----------
    mittens_df : pd.DataFrame
        Dataframe to write; its index is written as the first field of each line
    filename : str
        Destination path; '.gz' is appended when compress is true
    compress : bool, optional
        If true, the output is gzip-compressed
    """
    write_options = {'quoting': csv.QUOTE_NONE, 'header': False, 'sep': " "}
    target = filename
    if compress:
        target = filename + '.gz'
        write_options['compression'] = 'gzip'
    mittens_df.to_csv(target, **write_options)
    return

def isnull_wrapper(x):
    """
    Reports whether x is null or contains any null element.
    pd.isnull returns a plain bool for scalars and an array-like mask for
    containers; the mask is collapsed with any().
    """
    null_mask = pd.isnull(x)
    return null_mask if isinstance(null_mask, bool) else null_mask.any()

def cossim_with_none(vec1, vec2, vec_format='sparse'):
    """
    Auxiliary function that computes cosine similarity but returns None instead
    of erroring out when the similarity is undefined.
    Parameters
    ----------
    vec1 : list of (int, float) in gensim sparse vector format, or numpy array
    vec2 : list of (int, float) in gensim sparse vector format, or numpy array
    vec_format : str, optional
        Either sparse or dense. If sparse, vec1 and vec2 are in gensim sparse vector format; use cossim function from gensim.
        If dense, vec1 and vec2 are numpy arrays and cosine similarity is hand calculated
    Returns
    -------
    float or None
        Cosine similarity between vec1 and vec2. None if either vector is null
        or contains a null, or (dense only) is empty or has zero norm.
    Raises
    ------
    ValueError
        If vec_format is neither 'sparse' nor 'dense' (only checked when both
        vectors are non-null)
    """
    if isnull_wrapper(vec1) or isnull_wrapper(vec2):
        return None
    if vec_format == 'sparse':
        return cossim(vec1, vec2)
    if vec_format == 'dense':
        if len(vec1) == 0 or len(vec2) == 0:
            return None
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        # A zero vector has no direction; report "undefined" rather than
        # dividing by zero and returning NaN with a runtime warning
        if denominator == 0:
            return None
        return np.dot(vec1, vec2) / denominator
    raise ValueError("vec_format must be 'sparse' or 'dense', got {!r}".format(vec_format))

#########################################################################
############# Helper Functions for Working with JSON Emails #############
#########################################################################

def slice_user_corpus(emails, train_mode):
    """
    Groups emails by the time keys stored on each email.
    Parameters
    ----------
    emails : list of dict
        A list of emails converted from JSON formats to dictionaries
    train_mode : str
        One of 'annual', 'quarterly', 'all'
        'annual' groups by email['year'], 'quarterly' by email['quarter'],
        and 'all' files every email under both keys.
        Any other value produces an empty result.
    Returns
    -------
    timekey2emails : defaultdict of list
        Matches quarters and/or years to their respective emails
    """
    if train_mode == 'annual':
        fields = ('year',)
    elif train_mode == 'quarterly':
        fields = ('quarter',)
    elif train_mode == 'all':
        fields = ('year', 'quarter')
    else:
        fields = ()

    timekey2emails = defaultdict(list)
    for email in emails:
        for field in fields:
            timekey2emails[email[field]].append(email)
    return timekey2emails

#########################################################################
############# Helper Functions for Working with Date Objects ############
#########################################################################
def str_to_datetime(date):
    """
    Parses an email date header into a datetime.
    Parameters
    ----------
    date : str or None
        Date string. A parenthesised upper-case suffix of three or more
        letters, e.g. "(PST)", is removed before parsing.
    Returns
    -------
    datetime or None
        The datetime produced by the first matching format, or None if date is
        None or matches none of the known formats. Whether the result is
        timezone-aware depends on the format that matched.
    """
    if date is None:
        return None
    known_formats = (
        '%a, %d %b %Y %H:%M:%S %z',
        '%d %b %Y %H:%M:%S %z',
        '%a %d, %b %Y %H:%M:%S %z',
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y %H:%M:%S (%Z)',
        '%Y-%m-%d %H:%M:%S%z',
    )
    date = re.sub(r"(\([A-Z]{3,}\))", "", date).strip()
    for fmt in known_formats:
        try:
            # Stop at the first format that parses instead of trying (and
            # possibly being overwritten by) the remaining ones
            return datetime.strptime(date, fmt)
        except ValueError:
            continue
    return None

def to_quarter(date, format):
    """
    Returns the quarter of date as a str such as '2018Q2'.
    Parameters
    ----------
    date : str, datetime or None
        The date to convert
    format : str
        'str' if date is a string to be parsed with str_to_datetime,
        'datetime' if date already exposes .year and .month
    Returns
    -------
    str or None
        Year, the letter Q and the quarter number. None if date is None or a
        string date cannot be parsed. Any other value of format leaves year
        and month at 0, which yields '0Q0'.
    Notes
    -----
    If both to_quarter and to_year are needed, converting the string to a
    datetime once and passing format='datetime' avoids parsing it twice.
    """
    if date is None:
        return None

    year, month = 0, 0
    if format == 'str':
        parsed = str_to_datetime(date)
        if parsed is None:
            return None
        year, month = parsed.year, parsed.month
    elif format == 'datetime':
        year, month = date.year, date.month

    # Months 1-3 map to quarter 1, 4-6 to quarter 2, and so on
    quarter = (int(month) - 1) // 3 + 1
    return '{}Q{}'.format(year, quarter)

def to_year(date, date_type):
    """
    Returns the year of date as a str.
    Parameters
    ----------
    date : str, datetime or None
        The date to convert
    date_type : str
        'str' if date is a string to be parsed with str_to_datetime,
        'datetime' if date already exposes .year
    Returns
    -------
    str or None
        The year. None if date is None, a string date cannot be parsed, or
        date_type is not one of the two supported values.
    Notes
    -----
    If both to_quarter and to_year are needed, converting the string to a
    datetime once and passing date_type='datetime' avoids parsing it twice.
    """
    if date is None:
        return None
    if date_type == 'str':
        # Share the parser with to_quarter so both functions accept exactly
        # the same set of date formats
        dt = str_to_datetime(date)
        return None if dt is None else str(dt.year)
    if date_type == 'datetime':
        return str(date.year)
    return None

def word_similarity(model, w1, w2):
    """
    Auxiliary function for comparing one word or word list against another
    word or word list. Words that are not in the model's vocabulary
    (model.key_to_index) are dropped before the comparison, and the remaining
    words are handed to model.n_similarity.
    Per the original author's notes, n_similarity on two single words equals
    similarity(w1, w2), and for word sets it compares the mean vectors.
    Parameters
    ----------
    model : KeyedVectors
        The model that contains all the words and vectors
    w1 : str or list
        The first word or word list to be compared
    w2 : str or list
        The second word or word list to be compared
    Returns
    -------
    float or None
        Result of model.n_similarity, or None if either side has no
        in-vocabulary word left
    """
    def _in_vocabulary(words):
        candidates = words if isinstance(words, list) else [words]
        return [w for w in candidates if w in model.key_to_index]

    first = _in_vocabulary(w1)
    second = _in_vocabulary(w2)
    if not first or not second:
        return None
    return model.n_similarity(first, second)

#########################################################################
############## Helper Functions for Working with Dataframes #############
#########################################################################
def dict_to_df(index2rows, cols, index_name):
    """
    Converts a dictionary of rows into a dataframe sorted by its index.
    Parameters
    ----------
    index2rows : dict
        Dictionary mapping index values to rows to be converted. Keys are
        tuples when more than one index level is requested.
    cols : list
        List of column names of type str
    index_name : list
        List of index level names; one name gives a plain index, several
        names give a MultiIndex
    Returns
    -------
    df : pd.DataFrame or None
        Constructed dataframe, or None if index2rows is None or empty
    """
    if index2rows is None or not len(index2rows):
        return None

    frame = pd.DataFrame.from_dict(index2rows, orient='index', columns=cols)
    if len(index_name) == 1:
        frame.index.name = index_name[0]
    else:
        # Rebuild on a named MultiIndex derived from the tuple keys
        named_index = pd.MultiIndex.from_tuples(frame.index, names=index_name)
        frame = pd.DataFrame(frame, named_index)
    frame.sort_index(axis=0, inplace=True)
    return frame

def get_recipients(msg):
    """
    Return the set of recipients ('to', 'cc' and 'bcc' combined) of the
    current message, with the sender removed if present.
    The 'from' field is expected to be a list (per the original author's
    visual inspection of the data), in which case its first entry is the
    sender; any other type is used as the sender directly.
    """
    origin = msg['from']
    sender = origin[0] if type(origin) == list else origin

    addressed = msg.get('to', [])
    addressed = addressed + msg.get('cc', [])
    addressed = addressed + msg.get('bcc', [])
    return set(addressed) - {sender}

def is_internal_msg(msg):
    """
    Returns True if every recipient of msg starts with at least one digit,
    False as soon as one recipient does not. A message without recipients
    counts as internal.
    NOTE(review): presumably internal addresses are anonymized numeric ids -
    confirm against the data.
    """
    return all(re.match(r'\d+', r) for r in get_recipients(msg))
    
def extract_variables_from_file(file):
    """
    Extract relevant information from name of embedding file, with format: {}(_{})?.txt
    The last four characters (the extension) are dropped and the rest is split
    on underscores. Returns (usr, time_key); time_key is None unless the name
    consists of exactly two underscore-separated parts.
    """
    parts = file[:-4].split('_')
    if len(parts) == 2:
        return (parts[0], parts[1])
    return (parts[0], None)

def month2timekey(month, time_key):
    """
    Converts month numbers to actual year or quarter or halfyear
    Parameters
    ----------
    month : int
        Month counter; 2006 is the starting year, so month // 12 is the number
        of whole years elapsed and month % 12 the offset within the year
    time_key : str
        One of 'year', 'quarter', 'halfyear'
    Returns
    -------
    str
        The year, optionally followed by Q1-Q4 or HY1/HY2. Empty string for
        any other time_key.
    """
    if time_key not in ('year', 'quarter', 'halfyear'):
        return ''

    year = 2006 + month // 12
    offset = month % 12
    # Offsets 0-2 stay in the current year; everything after rolls into the next one
    if offset > 2:
        year += 1

    if time_key == 'year':
        return str(year)

    if time_key == 'quarter':
        if offset <= 2:
            label = 'Q4'
        elif offset <= 5:
            label = 'Q1'
        elif offset <= 8:
            label = 'Q2'
        else:
            label = 'Q3'
    else:
        # NOTE(review): offsets 0-2 are labelled Q4 above but HY1 here, and the
        # HY1/HY2 split falls at offset 6 rather than on a quarter boundary -
        # confirm this matches the intended fiscal calendar.
        label = 'HY1' if offset <= 6 else 'HY2'
    return str(year) + label

def extract_hr_df(hr_file, time_key=None):
    """
    Extract and preprocess Tech firm HR data
    Parameters
    ----------
    hr_file : str
        CSV file of HR observations; the first column is used as the index.
        NOTE(review): assumes rows are ordered by employee and month so that
        each employee's block starts with the spell == 1 row - confirm.
    time_key : str or None, optional
        If None, one row per employee (the last observation) is returned.
        Otherwise the value is passed to month2timekey (e.g. 'quarter') and
        one row per employee and period is returned.
    Returns
    -------
    hr : pd.DataFrame
        Indexed by anon_id, or by (anon_id, time_key) when time_key is given
    """
    hr = pd.read_csv(hr_file, index_col=0)

    # spell is a counter of the number of observations, not tenure: the month
    # observed at spell == 1 is taken as the hire month and carried forward
    # onto the following rows.
    hr['hire_month'] = hr['month'].where(hr['spell'] == 1).ffill()
    hr['tenure_months'] = hr['month'] - hr['hire_month'] + 1
    hr['tenure_days'] = hr['tenure_months'] * 365 / 12
    hr['employeeid'] = hr['employeeid'].astype(str)
    hr['bonus_eligible'] = hr['bonus_eligible'].astype(int)

    value_cols = ['hire_month', 'sales', 'marketing', 'tech', 'vol_exit_event', 'invol_exit_event',
                  'manager', 'female', 'fav_rating', 'bonus', 'bonus_eligible', 'age', 'age2',
                  'cumbonus', 'tenure_months', 'tenure_days']

    if not time_key:
        hr = hr.drop_duplicates(subset='employeeid', keep='last')
        hr = hr[['employeeid'] + value_cols].rename(columns={"employeeid": "anon_id"})
        return hr.set_index('anon_id')

    hr[time_key] = hr['month'].apply(lambda month: month2timekey(month, time_key))
    group_keys = ['employeeid', time_key]
    # Collapse the monthly observations of a period: bonuses are summed, the
    # event/status indicators take their maximum within the period.
    hr['bonus'] = hr.groupby(group_keys)['bonus'].transform('sum')
    for col in ('vol_exit_event', 'invol_exit_event', 'manager', 'fav_rating', 'bonus_eligible'):
        hr[col] = hr.groupby(group_keys)[col].transform('max')
    hr = hr.drop_duplicates(subset=group_keys, keep='last')
    hr = hr[group_keys + value_cols].rename(columns={"employeeid": "anon_id"})
    return hr.set_index(['anon_id', time_key])

Obtain the current working directory

In [None]:
import numpy as np
import os
# Remember the directory the notebook is running from; the temp and output
# directories in the configuration cell are built relative to it.
current_dir = os.getcwd()
# Bare expression so the notebook echoes the path as the cell output
current_dir

Load additional libraries

In [None]:
import os
import sys
import multiprocessing
from collections import defaultdict
import pandas as pd
import numpy as np
import csv
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.matutils import cossim, any2sparse
from utils import *
import re
import random
from statistics import mean 
import ujson as json
from pytz import timezone
from tqdm.auto import tqdm

Set the hyperparameters, input files, and output directories

In [None]:
# Hyperparameters. Both values are only used here to build the name of the
# quarterly output file.
embedding_dim = 50
mincount = 50 
home_dir = current_dir

# Input locations for the Tech firm: per-user email files, the fine-tuned
# embeddings directory and the HR file.
# NOTE(review): these are absolute, machine-specific paths - adjust them
# before running elsewhere.
corpus_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/tech/training/cleaned_email_data_v2"
mittens_dir = os.path.join(home_dir, "mittens")
embeddings_dir = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/tech/fine-tuning/embeddings_50d_mincount50"
hr_file = "/zfs/projects/faculty/amirgo-identification/identification-Sarayu/tech/cossim/tech_ph2_data_for_analysis.csv"

# Temp directory and output directory, both under the current working
# directory, plus the full path of the quarterly output file.
tmp_dir = os.path.join(current_dir, "tmp")
output_dir = os.path.join(current_dir, "tech_idtf_output_data")
quarterly_output_filename = os.path.join(output_dir, "embeddings_quarterly_{}d_mincount{}.csv".format(embedding_dim, mincount))

# Other global variables: time-key column names, the two pronouns whose
# embeddings are compared, and the size of the worker pool.
year_colname, quarter_colname = 'year', 'quarter'
single_pronouns = ['i', 'we']
num_cores = 10

# Echo the configured locations as a sanity check
print(corpus_dir)
print(embeddings_dir)
print(tmp_dir)
print(output_dir)

Now run the following code to generate the identification measures

In [None]:
#########################################################################
######### Functions for Loading Raw Counts as Control Variables #########
#########################################################################
def scale(x):
    """
    Standardizes x to zero mean and unit standard deviation.
    The standard deviation uses N-1 in the denominator (ddof=1), which gives the same result as R's scale;
    sklearn's preprocessing.scale is slightly different as it uses N instead of N-1.
    """
    center = np.mean(x)
    spread = np.std(x, ddof=1)
    return (x - center) / spread

def _reply_delay_seconds(reply_time, original_time):
    """
    Returns the seconds elapsed between a message and the message it replies to.
    """
    try:
        return (reply_time - original_time).total_seconds()
    except TypeError:
        # Subtracting a timezone-aware and a naive datetime raises TypeError;
        # fall back to comparing the two with timezone information dropped
        return (reply_time.replace(tzinfo=None) - original_time.replace(tzinfo=None)).total_seconds()


def read_raw_counts(corpus_dir, hr_df_quarterly, test_mode=False):
    """
    The main workhorse function for obtaining raw message and token counts as control variables.
    Parameter
    ---------
    corpus_dir : str
        Directory containing one JSON file of emails per user; the file name
        without '.txt' is used as the user id
    hr_df_quarterly : pd.DataFrame
        Tech firm HR data, joined onto the counts and used for making department adjusted effort calculations
    test_mode : bool, optional
        If true, only a random 1% of the files is read and processing of each
        file stops after the email at position 10
    Returns
    -------
    pd.DataFrame
        One row per (anon_id, quarter) with raw counts, peer-standardized and
        department-standardized effort measures, and the joined HR columns
    """
    # Per (user, quarter) counters, by position:
    #   0 num_tokens, 1 num_messages, 2 num_messages_weekend,
    #   3 num_messages_post_work, 4 dates worked on weekends (set, later its size),
    #   5 network_peers (set), 6 response times (list, later their mean),
    #   7 max_response_time
    usr_quarter2counts = defaultdict(lambda: [0, 0, 0, 0, set(), set(), [], 0.0])
    msg2time = {}

    files = os.listdir(corpus_dir)
    if test_mode:
        # Sample without replacement so no file is counted twice
        files = random.sample(files, len(files) // 100)

    # Pass 1: remember when every message was sent so that replies can be timed
    for filename in tqdm(files):
        with open(os.path.join(corpus_dir, filename), encoding='utf-8') as f:
            emails = json.load(f)
        for e in emails:
            dt = str_to_datetime(e['date'])
            if dt is not None:
                msg2time[e['message-id']] = dt

    # Pass 2: accumulate the raw counters from internal messages
    for filename in tqdm(files):
        usr = filename.replace('.txt', '')
        with open(os.path.join(corpus_dir, filename), encoding='utf-8') as f:
            emails = json.load(f)
        for i, e in enumerate(emails):
            if not is_internal_msg(e):
                continue
            if test_mode and i > 10:
                break
            counts = usr_quarter2counts[(usr, e['quarter'])]
            counts[0] += len(e['body'].replace('\n', ' ').replace("SENT_END", "").strip().split())
            counts[1] += 1
            for r in get_recipients(e):
                assert r != usr, "Created a self-loop"
                counts[5].add(r)
            dt = str_to_datetime(e['date'])
            if dt is None:
                # Without a timestamp only tokens, messages and peers can be counted
                continue
            weekday = dt.weekday()
            hour = dt.hour
            if (weekday == 5) or (weekday == 6):
                counts[2] += 1
                counts[4].add(dt.strftime('%Y-%m-%d'))
            elif (hour < 8) or (hour > 17):
                counts[3] += 1
            replied_to = e['in-reply-to']
            if replied_to is not None and replied_to in msg2time:
                diff = _reply_delay_seconds(dt, msg2time[replied_to])
                counts[6].append(diff)
                if diff > counts[7]:
                    counts[7] = diff

    # Pass 3: make the peer relation symmetric. This creates entries for
    # (peer, quarter) pairs that sent nothing themselves, so it must run
    # before the counters are finalized.
    for usr, quarter in list(usr_quarter2counts):
        for peer in list(usr_quarter2counts[(usr, quarter)][5]):
            usr_quarter2counts[(peer, quarter)][5].add(usr)

    # Pass 4: collapse the collections into scalars for every entry, including
    # the ones created in pass 3
    for counts in usr_quarter2counts.values():
        counts[4] = len(counts[4])
        counts[6] = sum(counts[6]) / len(counts[6]) if counts[6] else None

    # Pass 5: standardize each user's weekend messages, post-work messages and
    # working weekends against their network peers in the same quarter. The
    # user's own value is placed last, so [-1] is the user's standardized score.
    for key in list(usr_quarter2counts):
        counts = usr_quarter2counts[key]
        quarter = key[1]
        peers = counts[5]
        for slot in (2, 3, 4):
            values = [usr_quarter2counts[(p, quarter)][slot] for p in peers] + [counts[slot]]
            counts.append(scale(values)[-1])

    cols = (['num_tokens', 'num_messages', 'num_messages_weekend', 'num_messages_post_work', 'num_working_weekends', 'network_peers', 'avg_response_time', 'max_response_time',
        'peer_standardized_num_messages_weekend', 'peer_standardized_num_messages_post_work', 'peer_standardized_num_working_weekends'])
    usr_quarter2counts_df = dict_to_df(usr_quarter2counts, cols, index_name=['anon_id', quarter_colname])

    usr_quarter2counts_df = usr_quarter2counts_df.join(hr_df_quarterly)
    usr_quarter2counts_df['department'] = np.select([(usr_quarter2counts_df['sales'] == 1), (usr_quarter2counts_df['tech'] == 1), (usr_quarter2counts_df['marketing'] == 1)], ['sales', 'tech', 'marketing'], default='other')
    # num_working_weekends is an integer for every row after pass 4, so it can
    # be standardized directly alongside the other two effort measures
    for col in ('num_messages_weekend', 'num_messages_post_work', 'num_working_weekends'):
        usr_quarter2counts_df['department_standardized_' + col] = usr_quarter2counts_df.groupby(['quarter', 'department'])[col].transform(scale)

    return usr_quarter2counts_df

#########################################################################
#### Functions for Measuring Within-Person Similarities in Embeddings ###
#########################################################################

def process_single_embedding_file(i, num_files, embeddings_dir, file):
    """
    Reading from one embedding file
    Parameters
    ----------
    i : int
        Index used for progress tracking; progress is logged for every 50th file
    num_files : int
        Total number of files to process used for progress tracking
    embeddings_dir : str
        Directory in which embedding files reside
    file : str
        Embedding file to open and process
    Returns
    -------
    float or None
        Cosine similarity between i and we. None if either pronoun is missing
        from the file's vocabulary or the file could not be processed.
    """
    mittens_file = os.path.join(embeddings_dir, file)
    if i % 50 == 0:
        sys.stderr.write("Processing \t%d/%d -'%s', at %s.\n" % (i, num_files, mittens_file, datetime.now()))
    try:
        model = KeyedVectors.load_word2vec_format(mittens_file, binary=False, no_header=True)
        return word_similarity(model, single_pronouns[0], single_pronouns[1])
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run, so
        # the failure is reported and the file contributes no measure
        sys.stderr.write('File %s caused an error: %s.\n' % (mittens_file, str(e)))
        return None
   
def self_similarities(files, num_files, embeddings_dir):
    """
    Main workhorse function for calculating within-person similarities by comparing an individual's i embedding to we embedding in the focal quarter
    Parameters
    ----------
    files : list of str
        Embedding files to process; files whose name carries no time key are skipped
    num_files : int
        Total number of files to process, used to keep track of progress
    embeddings_dir : str
        Directory in which embedding files reside
    Return
    ------
    dict
        Maps (usr, quarter) to the within-person embedding similarity.
        Only time keys of length 6 (such as '2018Q1') are kept.
    """
    results = {}
    # The context manager guarantees the workers are released even if
    # submitting a task raises
    with multiprocessing.Pool(processes=num_cores) as pool:
        for i, file in enumerate(files, 1):
            usr, time_key = extract_variables_from_file(file)
            if time_key:
                results[(usr, time_key)] = pool.apply_async(process_single_embedding_file, args=(i, num_files, embeddings_dir, file, ))
        pool.close()
        pool.join()

    # Kept as a defaultdict so callers see the same mapping type as before
    usr_quarter2distances = defaultdict(list)
    for (usr, time_key), r in results.items():
        curr_row = r.get()
        # None means the file errored out or lacked one of the pronouns. Test
        # for None explicitly so a similarity of exactly 0.0 is not discarded.
        if curr_row is not None and len(time_key) == 6:
            usr_quarter2distances[(usr, time_key)] = curr_row
    return usr_quarter2distances

def reading_embeddings(embeddings_dir, test_mode=False):
    """
    Calculates the within-person embedding similarity for every embedding file in a directory
    Parameters
    ----------
    embeddings_dir : str
        Directory where all embedding files exist
    test_mode : bool, optional
        If testing, reduce number of files to process to a random draw
        (with replacement) of one fiftieth of the files
    Returns
    -------
    pd.DataFrame or None
        Quarter level dataframe indexed by (anon_id, quarter) with the
        similarity between i and we in column 'i_we'; None when no similarity
        could be computed
    """
    files = os.listdir(embeddings_dir)
    if test_mode:
        last_position = len(files) - 1
        sampled = []
        for _ in range(len(files) // 50):
            sampled.append(files[random.randint(0, last_position)])
        files = sampled

    usr_quarter2distances = self_similarities(files, len(files), embeddings_dir)
    return dict_to_df(usr_quarter2distances, ['i_we'], index_name=['anon_id', quarter_colname])

if __name__ == '__main__':
    starttime = datetime.now()
    # An optional first command-line argument "test" switches to a reduced
    # run that writes to a separate output file
    test = len(sys.argv) > 1 and sys.argv[1].lower() == 'test'
    if test:
        quarterly_output_filename = os.path.join(output_dir, "test_embeddings_quarterly.csv")
    for d in [output_dir, tmp_dir]:
        # Creates missing parents too and does not fail if the directory exists
        os.makedirs(d, exist_ok=True)

    sys.stderr.write('Reading HR and Survey data at %s.\n' % datetime.now())
    hr_df_quarterly = extract_hr_df(hr_file, 'quarter')
    print('Finished reading the hr data')

    sys.stderr.write('Loading corpus counts at %s.\n' % datetime.now())
    usr2quarterly_counts = read_raw_counts(corpus_dir, hr_df_quarterly, test)

    sys.stderr.write('Reading embeddings at %s.\n' % datetime.now())
    usr2quarterly_measures = reading_embeddings(embeddings_dir, test)

    sys.stderr.write('Outputting dataframe at %s.\n' % datetime.now())
    if usr2quarterly_measures is None:
        # reading_embeddings returns None when no similarity could be
        # computed; still write the counts rather than crashing in join
        sys.stderr.write('No embedding similarities were computed; writing counts only.\n')
        usr2quarterly_output = usr2quarterly_counts
    else:
        usr2quarterly_output = usr2quarterly_counts.join(usr2quarterly_measures)
    usr2quarterly_output.to_csv(quarterly_output_filename)

    sys.stderr.write("Finished outputting measures at %s, with a duration of %s.\n"
        % (str(datetime.now()), str(datetime.now() - starttime)))