**Company** : Tech Firm

**Notebook Function** :
    This notebook builds the Tech firm email corpus that can be passed to GloVe to train company embeddings from scratch.

**Input File(s)** : 
    tech_email_data.zip - The zipped folder containing the Tech firm data
    lid.176.ftz - Model for language identification

**Output File(s)** :
    corpus_high_prob_eng_08_tech.txt - Tech firm corpus (the `08` is the English-probability threshold 0.8 with the dot removed)

**Author(s)** : Lara Yang, Sarayu Anshuman

Install the required packages, then unzip the processed email data.

In [None]:
pip install ujson

In [None]:
pip install fasttext

In [None]:
pwd

In [None]:
import os
# Remember the kernel's working directory; later cells use it as the base for
# both the unzipped input folder and the output corpus file.
current_dir = os.getcwd()
current_dir

Unzip the file. The zipped file is 1 GB; the unzipped data is 132 GB, so make sure enough disk space is available.

In [None]:
import os
import zipfile

# Extract every member of the archive into the kernel's working directory.
# The context manager closes the archive even if extraction fails part-way.
current_dir = os.getcwd()
with zipfile.ZipFile("tech_email_data.zip", mode="r") as zip_ref:
    zip_ref.extractall(path=current_dir)

Import Libraries

In [None]:
import pickle
import glob
import logging
import os
import random
import re
import sys
import ujson as json
import pandas as pd
import numpy as np
from numpy import random
from collections import defaultdict
from datetime import datetime
from utils import *
import multiprocessing
import fasttext
# Load the fastText language-identification model once at import time; it is
# read as a module-level name by load_user_emails to keep only English emails.
model = fasttext.load_model('lid.176.ftz') # model to identify the language of a piece of text

Set hyperparameters

In [1]:
# Hyperparameters. In the cells visible in this notebook only `ling_thres` is
# read (for the corpus file name and the English filter).
# NOTE(review): the remaining values are presumably consumed by the downstream
# embedding-training code - confirm before changing or removing them.
mittens_params = 0.1
# moving this larger moves out of the default window we have seen in papers (1-10)
# 5 was originally used in 2yp; if unspecified, 10 is the default
window_size = 10
# seems like smallest embedding dim works best given that there might not be that many dimensions needed to capture the difference between i and we
# 100 was originally used in 2yp; if unspecified, 50 is the default
embedding_dim = 50
mincount = 150
max_iter = 100
num_cores = 10
num_users_to_test = 60
vocab_size = 2500
max_iter_all = 3000
# Minimum fastText probability (strictly greater-than) for an email labelled
# English to be kept in the corpus.
ling_thres = 0.8

Set output directory

In [None]:
# Input directory: the per-user email files are read from the
# "cleaned_email_data_v2" folder under the working directory.
# NOTE(review): presumably this is the folder created by unzipping
# tech_email_data.zip - confirm against the archive contents.
home_dir = current_dir
corpus_dir = os.path.join(home_dir, "cleaned_email_data_v2")
print(corpus_dir)

In [None]:
# Output file: written to the working directory. The threshold is embedded in
# the name with its dot removed, e.g. ling_thres = 0.8 gives
# "corpus_high_prob_eng_08_tech.txt".
out_dir = current_dir
corpus_file = os.path.join(out_dir, 'corpus_high_prob_eng_{}_tech.txt'.format(str(ling_thres).replace(".", "")))
print(corpus_file)

Generate the corpus

In [None]:
# Module-level counters that load_user_emails declares `global`; they are reset
# here so that re-running this cell starts counting from zero.
total_emails = 0
non_english = 0
english = 0
ling_thres = 0.8 # English-probability threshold; re-assigns the value from the hyperparameter cell (original note: same value as for the Staffing firm)

def load_user_emails(corpus_dir, out_file):
    """Keep each user's confidently-English emails and write them to a corpus file.

    Every file in ``corpus_dir`` is parsed as JSON into a list of email dicts;
    the file name with ``.txt`` removed is used as the user id. Each email's
    ``'body'`` has its newlines replaced by spaces and, if non-empty, is
    classified with the module-level fastText ``model``. Emails labelled
    ``__label__en`` with probability strictly above the module-level
    ``ling_thres`` are kept and their flattened body is written to ``out_file``
    followed by ``' \\n '``.

    Side effects: increments the module-level counters ``total_emails`` (every
    email seen), ``english`` (emails kept) and ``non_english`` (non-empty emails
    rejected by the filter). Emails with an empty body only count towards
    ``total_emails``.

    Parameters
    ----------
    corpus_dir : str
        Directory containing one JSON file of emails per user.
    out_file : str
        Path of the corpus text file to create (overwritten if it exists).

    Returns
    -------
    collections.defaultdict
        Maps each user id to the list of that user's emails that passed the
        English filter.
    """
    global total_emails, english, non_english
    uid2emails = defaultdict(list)
    # The context manager closes the corpus file even if a user file fails to
    # parse; UTF-8 is set explicitly to match how the inputs are read rather
    # than relying on the platform default encoding.
    with open(out_file, 'w', encoding='utf-8') as output_file:
        # os.listdir order is arbitrary; sorting makes the corpus reproducible.
        for filename in sorted(os.listdir(corpus_dir)):
            usr = filename.replace('.txt', '')  # extract the user number
            with open(os.path.join(corpus_dir, filename), encoding='utf-8') as f:
                emails = json.load(f)  # all emails of this user
            eng_emails = []
            for e in emails:
                total_emails += 1
                # Flatten the body to a single line: fastText's predict handles
                # one line at a time, and the corpus stores one email per line.
                clean_e = ' '.join(e['body'].split('\n'))
                if len(clean_e) == 0:
                    continue
                r = model.predict(clean_e)
                lang = (r[0][0], r[1][0])  # (top label, its probability)
                if lang[0] == '__label__en' and lang[1] > ling_thres:
                    eng_emails.append(e)
                    output_file.write(clean_e + ' \n ')
                    english += 1
                else:
                    # Previously a bare `non_english` expression, so discarded
                    # emails were never counted.
                    non_english += 1
            uid2emails[usr] = eng_emails
    return uid2emails

if __name__ == '__main__':
    # Build the corpus; the returned per-user dictionary is not needed here,
    # only the corpus file and the module-level counters the call updates.
    load_user_emails(corpus_dir, corpus_file)
    # Report the actual output path rather than a hard-coded "corpus.txt".
    print(
        "Out of {} emails processed, {} emails were non-empty.\n"
        "{} English emails were written to {}. {} non-English emails were discarded.".format(
            total_emails, english + non_english, english, corpus_file, non_english))