# In [57]:
# preprocessing data and making it suitable for csv

import os
import csv

def read_file(file_path, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale setting.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

raw_data_dir = '/Users/nik/personal_projects/BizCase-Classifier/data/raw/UC Berkeley Enron Email Analysis Project'

csv_headers = ['category_first_level', 'category_second_level', 'frequency', 'mail_text']
csv_data = []

# Walk every sub-folder of the raw data directory. A message is stored as
# "<name>.txt" and its labels as "<name>.cats" in the same folder; each
# non-empty line of the '.cats' file yields one CSV row.
# Listings are sorted so the resulting row order is reproducible
# (os.listdir returns entries in arbitrary order).
for folder in sorted(os.listdir(raw_data_dir)):
    folder_path = os.path.join(raw_data_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt'):
            continue

        # Swap only the trailing extension; str.replace would also rewrite
        # any '.txt' that happens to appear earlier in the file name.
        cats_name = file_name[:-len('.txt')] + '.cats'
        cats_path = os.path.join(folder_path, cats_name)

        # Messages without a label file are skipped before reading them.
        if not os.path.exists(cats_path):
            continue

        file_path = os.path.join(folder_path, file_name)
        # Flatten the message to a single line once per file. Newlines become
        # spaces so the last word of a line is not glued to the first word of
        # the next one.
        mail_text = read_file(file_path).replace('\n', ' ')

        cats_content = read_file(cats_path)
        for line_number, cat in enumerate(cats_content.splitlines(), start=1):
            if not cat.strip():
                continue

            # Expected line layout: first_level,second_level,frequency
            subcats = [field.strip() for field in cat.split(',')]
            if len(subcats) < 3:
                raise ValueError(
                    'malformed category line {} in {}: {!r}'.format(
                        line_number, cats_path, cat))

            csv_data.append([subcats[0], subcats[1], subcats[2], mail_text])

# In [58]:
# replacing the category numbers with categories names and writing to csv

# Names of the top-level annotation categories, keyed by their numeric code.
# Codes are assigned by position, starting at 1.
first_level_categories = dict(enumerate((
    'Coarse genre',
    'Included/forwarded information',
    'Primary topics',
    'Emotional tone',
), start=1))

# Names of the second-level categories: outer key is the first-level code,
# inner key is the second-level code (again positional, starting at 1).
second_level_categories = {
    # 1 - Coarse genre
    1: dict(enumerate((
        'Company Business, Strategy, etc. (elaborate in Section 3 [Topics])',
        'Purely Personal',
        'Personal but in professional context',
        'Logistic Arrangements (meeting scheduling, technical support, etc)',
        'Employment arrangements (job seeking, hiring, recommendations, etc)',
        'Document editing/checking (collaboration)',
        'Empty message (due to missing attachment)',
        'Empty message',
    ), start=1)),
    # 2 - Included/forwarded information
    2: dict(enumerate((
        'Includes new text in addition to forwarded material',
        'Forwarded email(s) including replies',
        'Business letter(s) / document(s)',
        'News article(s)',
        'Government / academic report(s)',
        'Government action(s) (such as results of a hearing, etc)',
        'Press release(s)',
        'Legal documents (complaints, lawsuits, advice)',
        'Pointers to url(s)',
        'Newsletters',
        'Jokes, humor (related to business)',
        'Jokes, humor (unrelated to business)',
        'Attachment(s) (assumed missing)',
    ), start=1)),
    # 3 - Primary topics
    3: dict(enumerate((
        'regulations and regulators (includes price caps)',
        'internal projects -- progress and strategy',
        'company image - current',
        'company image - changing/influencing',
        'political influence / contributions / contacts',
        'california energy crisis / california politics',
        'internal company policy',
        'internal company operations',
        'alliances / partnerships',
        'legal advice',
        'talking points',
        'meeting minutes',
        'trip reports',
    ), start=1)),
    # 4 - Emotional tone
    4: dict(enumerate((
        'jubilation',
        'hope / anticipation',
        'humor',
        'camaraderie',
        'admiration',
        'gratitude',
        'friendship / affection',
        'sympathy / support',
        'sarcasm',
        'secrecy / confidentiality',
        'worry / anxiety',
        'concern',
        'competitiveness / aggressiveness',
        'triumph / gloating',
        'pride',
        'anger / agitation',
        'sadness / despair',
        'shame',
        'dislike / scorn',
    ), start=1)),
}

# Replace the numeric category codes held in the first two columns of every
# row with their human-readable names (the rows are updated in place).
for row in csv_data:
    first_cat_key = int(row[0])
    second_cat_key = int(row[1])
    # Resolve both names before touching the row, so an unknown code cannot
    # leave a row half-translated, and report which code pair was at fault.
    try:
        first_cat_name = first_level_categories[first_cat_key]
        second_cat_name = second_level_categories[first_cat_key][second_cat_key]
    except KeyError as err:
        raise KeyError(
            'unknown category code {}.{}'.format(first_cat_key, second_cat_key)
        ) from err
    row[0] = first_cat_name
    row[1] = second_cat_name

csv_path = '/Users/nik/personal_projects/BizCase-Classifier/data/processed/data_struct.csv'

# open(..., 'w') does not create missing parent directories.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# Pipe-delimited output with every field quoted; newline='' lets the csv
# module control line endings itself.
with open(csv_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f, delimiter='|',
                        quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(csv_headers)
    writer.writerows(csv_data)
