### Reformat Enron Dataset as train and test
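##### Intended split: 10% test, 90% train (as written, `copyf` keeps roughly 19% of the `sent` files and routes roughly 14% of those to test)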

In [1]:
import shutil
import os
import re
from random import randint

def copyf(source, destination, depth=None, sample_cutoff=80, test_cutoff=85):
    """Recursively copy a random sample of ``sent`` mail into train/test folders.

    The directory ``source + depth`` is listed.  A file is eligible only when
    the last path component of ``depth`` is exactly ``sent``.  An eligible file
    is kept when ``randint(0, 99) > sample_cutoff`` (19 of 100 values with the
    default) and a kept file is written to ``<destination>/test`` when a second
    draw exceeds ``test_cutoff`` (14 of 100 values with the default), otherwise
    to ``<destination>/train``.

    The copy is named ``<parent>_<dir>_<file>txt`` where ``<parent>_<dir>`` are
    the last two components of ``depth`` with every run of non-alphanumeric
    characters collapsed to ``_``.

    Args:
        source: Root directory to walk (string-concatenated with ``depth``).
        destination: Directory that receives the ``train`` and ``test`` folders.
        depth: Path suffix of the directory being visited; ``None`` or ``""``
            means the root itself.  Recursive calls pass the full sub-directory
            path here together with an empty ``source``.
        sample_cutoff: A file is sampled when a 0-99 draw is greater than this.
        test_cutoff: A sampled file goes to ``test`` when a second 0-99 draw is
            greater than this.
    """
    if not depth:
        depth = ""
    current = source + depth

    # Components are split on "/" exactly as the file names were always built.
    parts = depth.split("/")[-2:]
    # Fewer than two components (e.g. files sitting directly in the root) used
    # to raise IndexError on parts[1]; such files are simply not eligible.
    in_sent = len(parts) == 2 and parts[1] == "sent"
    prefix = re.sub('[^0-9a-zA-Z]+', '_', "_".join(parts))

    for name in os.listdir(current):
        path = os.path.join(current, name)
        if os.path.isfile(path):
            if not in_sent:
                continue
            if randint(0, 99) > sample_cutoff:
                subset = "test" if randint(0, 99) > test_cutoff else "train"
                dest = os.path.join(destination, subset)
                # Create the target folder on demand instead of failing.
                os.makedirs(dest, exist_ok=True)
                # 'txt' is appended without a dot; the listed results such as
                # allen_p_sent_120.txt suggest the source names already end
                # in "." -- TODO confirm against the raw maildir.
                shutil.copyfile(path, os.path.join(dest, prefix + '_' + name + 'txt'))
        elif os.path.isdir(path):
            # Only real directories are descended into; other entries
            # (e.g. broken symlinks) are skipped rather than listed.
            copyf('', destination, path, sample_cutoff, test_cutoff)

In [2]:
# Sample the raw maildir tree into <destination>/train and <destination>/test.
source = "/home/wk/myProjects/data/maildir"
destination = "/home/wk/myProjects/data/Enron"
copyf(source, destination)

#### Read in the Tag lookup

In [3]:
import pandas as pd
import numpy as np
# Semicolon-separated lookup table; the preview below shows the columns
# Token, FirstName, LastName and Email.
fn = "/home/wk/myProjects/data/Enron/Enron_name_lookup.csv"
df_tag = pd.read_csv(fn, delimiter=";")

In [4]:
# Preview the first rows of the lookup table.
df_tag.head()

Unnamed: 0,Token,FirstName,LastName,Email
0,allen_p,"Phillip K , Phillip",Allen,phillip.allen@enron.com
1,arnold_j,John,Arnold,john.arnold@enron.com
2,arora_h,Harry,Arora,harry.arora@enron.com
3,bass_e,Eric,Bass,eric.bass@enron.com
4,beck_s,Sally,Beck,sally.beck@enron.com


#### Read in each file

In [5]:
# Root of the prepared data set; the sub-folder names below are appended to it.
PATH='/home/wk/myProjects/data/Enron/'

TRN_PATH = 'train/'
VAL_PATH = 'test/'
# Full paths of the training and test folders (PATH already ends with "/").
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

# IPython magic: list the contents of the data root.
%ls {PATH}

Enron_name_lookup.csv  oper/  TagSamples.ipynb  tmp/
models/                tag/   test/             train/


In [6]:
# IPython shell capture: fname holds one entry per line printed by `ls` for the
# training folder.
fname = !ls {TRN}
#trn_files = !dir /w {TRN}
# Show a small slice of the listing as a sanity check.
fname[7:17]

['allen_p_sent_120.txt',
 'allen_p_sent_125.txt',
 'allen_p_sent_134.txt',
 'allen_p_sent_137.txt',
 'allen_p_sent_142.txt',
 'allen_p_sent_156.txt',
 'allen_p_sent_159.txt',
 'allen_p_sent_15.txt',
 'allen_p_sent_161.txt',
 'allen_p_sent_164.txt']

In [7]:
def get_tokens(flist):
    """Return the mailbox-owner token for each file name in ``flist``.

    File names look like ``<owner>_sent_<n>.txt`` (e.g.
    ``allen_p_sent_120.txt`` -> ``allen_p``).  The token is everything before
    the first ``_sent_`` marker, so owners whose name has more than two
    ``_``-separated parts are no longer truncated to their first two fields.
    Names without the marker fall back to the first two ``_`` fields (or the
    whole name when there is no underscore) instead of raising IndexError.

    Args:
        flist: Iterable of file-name strings.

    Returns:
        A list of token strings, one per input name, in the same order.
    """
    tokens = []
    for name in flist:
        owner, marker, _ = name.partition("_sent_")
        if not marker:
            # No "_sent_" marker: keep the previous two-field behaviour.
            owner = "_".join(name.split("_")[:2])
        tokens.append(owner)
    return tokens

In [8]:
# One token per training file name, in listing order.
Token = get_tokens(fname)

In [9]:
# Spot-check the first few tokens.
Token[0:4]

['allen_p', 'allen_p', 'allen_p', 'allen_p']

In [10]:
# Pair every training file name with its token so it can be joined to df_tag.
df_f = pd.DataFrame({'Filename': fname, 'Token': Token})
df_f.head()

Unnamed: 0,Filename,Token
0,allen_p_sent_103.txt,allen_p
1,allen_p_sent_106.txt,allen_p
2,allen_p_sent_110.txt,allen_p
3,allen_p_sent_113.txt,allen_p
4,allen_p_sent_117.txt,allen_p


In [11]:
# Attach the lookup columns to every file row through the shared Token column.
# This is pandas' default inner join, so rows whose Token has no lookup entry
# are dropped.
df_proc = df_f.merge(df_tag, on="Token")
df_proc.head(3)

Unnamed: 0,Filename,Token,FirstName,LastName,Email
0,allen_p_sent_103.txt,allen_p,"Phillip K , Phillip",Allen,phillip.allen@enron.com
1,allen_p_sent_106.txt,allen_p,"Phillip K , Phillip",Allen,phillip.allen@enron.com
2,allen_p_sent_110.txt,allen_p,"Phillip K , Phillip",Allen,phillip.allen@enron.com


In [12]:
# Row/column counts before and after the lookup join.
for frame in (df_f, df_proc):
    print(frame.shape)

(9466, 2)
(9429, 5)


#### Search and Replace

In [13]:
def search_and_replace_text_in_file(fn_in, fn_out, src_str, rpc_str):
    """Copy ``fn_in`` to ``fn_out`` replacing every ``src_str`` with ``rpc_str``.

    The input is read completely before the output is opened, so ``fn_in`` and
    ``fn_out`` may be the same path.  Undecodable bytes are dropped while
    reading (``errors="ignore"``).

    Args:
        fn_in: Path of the text file to read.
        fn_out: Path of the text file to write (overwritten).
        src_str: Literal text to search for.  An empty string leaves the text
            unchanged.
        rpc_str: Replacement text.
    """
    # Read in the file
    with open(fn_in, 'r', errors="ignore") as file:
        filedata = file.read()
    # Replace the target string.  str.replace("", x) would insert x between
    # every character, so an empty search string is treated as "nothing to do".
    if src_str != "":
        filedata = filedata.replace(src_str, rpc_str)
    # Write the file out again
    with open(fn_out, 'w') as file:
        file.write(filedata)

In [14]:
import re
def search_and_replace_regex_in_file(fn_in, fn_out):
    """Copy ``fn_in`` to ``fn_out`` with pattern matches replaced by tags.

    The substitutions are applied in the listed order, each on the result of
    the previous one: e-mail addresses, phone numbers, web addresses, numeric
    dates, ``DD Mon YYYY`` dates and finally times.  The whole input is read
    before the output is opened, so both paths may be identical.

    Args:
        fn_in: Path of the text file to read (undecodable bytes are ignored).
        fn_out: Path of the text file to write (overwritten).
    """
    # (pattern, replacement tag) pairs; order matters because later patterns
    # see the text produced by earlier ones.
    substitutions = (
        (
            r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
            " @@othr_em@@ ",
        ),
        (
            r"""(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?""",
            " @@othr_ph@@ ",
        ),
        (
            r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
            " @@othr_ws@@ ",
        ),
        (
            r"""\d{1,2}[/-]\d{1,2}[/-]\d{2,4}""",
            " @@othr_dt@@ ",
        ),
        (
            r"""\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}""",
            " @@othr_dt@@ ",
        ),
        (
            r"""\d{1,2}(?:(?:am|pm)|(?::\d{1,2})(?:am|pm)?)""",
            " @@othr_tm@@ ",
        ),
    )

    # Read in the file
    with open(fn_in, 'r', errors="ignore") as handle:
        text = handle.read()

    for pattern, tag in substitutions:
        text = re.sub(pattern, tag, text)

    # Write the tagged text out
    with open(fn_out, 'w') as handle:
        handle.write(text)

In [15]:
# Untagged training files are read from dir_in; tagged copies are written to
# dir_out under the same file name.
dir_in = "/home/wk/myProjects/data/Enron/train/"
dir_out = "/home/wk/myProjects/data/Enron/tag/train/"

#### Define the replacement rules

In [16]:
# Tag every training file listed in df_proc:
#   1. the owner's e-mail address      -> @@Email@@
#   2. "<first name> <last name>"      -> "@@FirstName@@ @@LastName@@ "
#   3. each first-name alias on its own -> @@FirstName@@
#   4. generic e-mail/phone/web/date/time patterns -> @@othr_*@@ tags
# Row values are read directly while iterating instead of re-scanning the whole
# frame for every file. Unlike the old per-file max(), each row is used as-is,
# so Filename is assumed to be unique -- TODO confirm.
os.makedirs(dir_out, exist_ok=True)

for fn, Email, first_field, LastName in zip(
    df_proc["Filename"], df_proc["Email"], df_proc["FirstName"], df_proc["LastName"]
):
    src_path = dir_in + fn
    out_path = dir_out + fn

    # Missing lookup values are not strings (pandas yields NaN); treat them as
    # empty so they are skipped instead of being used as a search string.
    Email = Email if isinstance(Email, str) else ""
    LastName = LastName if isinstance(LastName, str) else ""
    first_field = first_field if isinstance(first_field, str) else ""

    if Email:
        search_and_replace_text_in_file(src_path, out_path, Email, "@@Email@@")
    else:
        # Nothing to replace: still create the output copy that the
        # following in-place steps operate on.
        shutil.copyfile(src_path, out_path)

    # FirstName may hold several comma-separated aliases; drop blank entries
    # because replacing an empty string would corrupt the text.
    FirstNames = [a.strip(' ') for a in first_field.split(",")]
    FirstNames = [a for a in FirstNames if a]
    for firstName in FirstNames:
        if LastName:
            fullName = firstName + ' ' + LastName
            search_and_replace_text_in_file(out_path, out_path, fullName, "@@FirstName@@ @@LastName@@ ")
        search_and_replace_text_in_file(out_path, out_path, firstName, "@@FirstName@@")

    # The pattern pass does not depend on the alias, so it runs once per file
    # after all name replacements instead of once per alias.
    search_and_replace_regex_in_file(out_path, out_path)

In [17]:
# Disabled code kept as a string literal: it repeats the tagging steps for the
# test folder (VAL). Note that, unlike the training loop above, this copy
# never calls search_and_replace_regex_in_file.
'''
fname = !ls {VAL}

Token = get_tokens(fname)

df_f = pd.DataFrame({'Filename': fname, 'Token': Token})
df_f.head()

df_proc = pd.merge(df_f, df_tag, on="Token")

dir_in = "/home/wk/myProjects/data/Enron/test/"
dir_out = "/home/wk/myProjects/data/Enron/tag/test/"

for fn in df_proc["Filename"]:    
    cond = (df_proc.Filename == fn)
    Email = df_proc.Email.where(cond, "").max()
    search_and_replace_text_in_file(dir_in + fn, dir_out + fn, Email, "@@Email@@")
    
    FirstNames = df_proc.FirstName.where(cond, "").max().split(",")
    FirstNames = [a.lstrip(' ').rstrip(' ') for a in FirstNames]
    LastName = df_proc.LastName.where(cond, "").max()
    LastName
    for firstName in FirstNames:
        fullName = firstName + ' ' + LastName
        search_and_replace_text_in_file(dir_out + fn, dir_out + fn, fullName, "@@FirstName@@ @@LastName@@ ")
        search_and_replace_text_in_file(dir_out + fn, dir_out + fn, firstName, "@@FirstName@@") 
'''


'\nfname = !ls {VAL}\n\nToken = get_tokens(fname)\n\ndf_f = pd.DataFrame({\'Filename\': fname, \'Token\': Token})\ndf_f.head()\n\ndf_proc = pd.merge(df_f, df_tag, on="Token")\n\ndir_in = "/home/wk/myProjects/data/Enron/test/"\ndir_out = "/home/wk/myProjects/data/Enron/tag/test/"\n\nfor fn in df_proc["Filename"]:    \n    cond = (df_proc.Filename == fn)\n    Email = df_proc.Email.where(cond, "").max()\n    search_and_replace_text_in_file(dir_in + fn, dir_out + fn, Email, "@@Email@@")\n    \n    FirstNames = df_proc.FirstName.where(cond, "").max().split(",")\n    FirstNames = [a.lstrip(\' \').rstrip(\' \') for a in FirstNames]\n    LastName = df_proc.LastName.where(cond, "").max()\n    LastName\n    for firstName in FirstNames:\n        fullName = firstName + \' \' + LastName\n        search_and_replace_text_in_file(dir_out + fn, dir_out + fn, fullName, "@@FirstName@@ @@LastName@@ ")\n        search_and_replace_text_in_file(dir_out + fn, dir_out + fn, firstName, "@@FirstName@@") \n'