In [21]:
# Installs pytz through a shell call from inside the notebook.
# NOTE(review): none of the visible cells import pytz directly — confirm it is still needed.
!pip install pytz  
# pip install unicodecsv



In [1]:
import pywintypes  
from datetime import datetime
import re  
from bs4 import BeautifulSoup, NavigableString
import html 
import os
from dateutil import parser
  
def format_pywintypes_datetime(pywin_dt):
    """Format a datetime-like value as e.g. 'Monday, January 15, 2024 02:30 PM'.

    Args:
        pywin_dt: Object exposing ``timestamp()`` and ``tzinfo``.

    Returns:
        The formatted string (weekday, month name, day, year, 12-hour time).
    """
    # Rebuild a plain datetime from the POSIX timestamp, keeping the source tzinfo.
    seconds_since_epoch = pywin_dt.timestamp()
    plain_dt = datetime.fromtimestamp(seconds_since_epoch, pywin_dt.tzinfo)
    return plain_dt.strftime("%A, %B %d, %Y %I:%M %p")
  
def convert_to_timestamp(datetime_str):
    """Parse the text of a "Sent" header into a ``datetime``.

    Finnish weekday and month names are first replaced by their English
    equivalents. The strict layout ``<weekday> <day>. <month> <year> <HH>.<MM>``
    is then tried, and only if that does not match is the string handed to the
    lenient ``dateutil`` parser.

    Args:
        datetime_str: Date/time text, in English or Finnish.

    Returns:
        The parsed ``datetime``, or ``None`` (after printing the error) when
        neither strategy can parse the text.
    """
    datetime_str = datetime_str.strip()

    # Dictionary to translate Finnish to English
    finnish_to_english = {
        'maanantai': 'Monday', 'tiistai': 'Tuesday', 'keskiviikko': 'Wednesday',
        'torstai': 'Thursday', 'perjantai': 'Friday', 'lauantai': 'Saturday',
        'sunnuntai': 'Sunday', 'tammikuuta': 'January', 'helmikuuta': 'February',
        'maaliskuuta': 'March', 'huhtikuuta': 'April', 'toukokuuta': 'May',
        'kesäkuuta': 'June', 'heinäkuuta': 'July', 'elokuuta': 'August',
        'syyskuuta': 'September', 'lokakuuta': 'October', 'marraskuuta': 'November',
        'joulukuuta': 'December'
    }

    # Replace Finnish words with English words
    for finnish, english in finnish_to_english.items():
        datetime_str = datetime_str.replace(finnish, english)

    # The exact layout is unambiguous, so it takes precedence over the
    # heuristic parser (which is only consulted when this strict match fails).
    try:
        return datetime.strptime(datetime_str, '%A %d. %B %Y %H.%M')
    except ValueError:
        pass

    try:
        return parser.parse(datetime_str)
    except (ValueError, OverflowError) as e:
        # dateutil documents OverflowError in addition to its ValueError subclass.
        print(f"Error parsing datetime string: {e}")
        return None

def process_element(element, depth=0):
    """Flatten a parsed HTML node into plain text by walking its children.

    Rules, checked in this order:
      * text nodes: HTML-unescaped and stripped;
      * ``<br>``: a single newline;
      * ``<p>`` / ``<div>``: stripped child text wrapped in blank lines;
      * ``<h1>``..``<h6>``: same, upper-cased;
      * any tag whose ``src`` starts with ``cid:``: a ``[cid:...]`` marker;
      * everything else: the concatenated text of its children.

    Args:
        element: A text node or a tag-like object (``name``, ``attrs``, ``children``).
        depth: Recursion depth, incremented on each level (not used otherwise).

    Returns:
        The text rendering of ``element``.

    NOTE(review): each text node is stripped on its own, so whitespace next to
    an inline tag is lost (``a <b>b</b>`` renders as ``ab``).
    """
    if isinstance(element, NavigableString):
        return html.unescape(str(element)).strip()

    def render_children():
        # Text of all direct children, rendered one level deeper.
        return ''.join(process_element(child, depth + 1) for child in element.children)

    tag_name = element.name
    if tag_name == 'br':
        return '\n'

    if tag_name in ('p', 'div'):
        return '\n\n' + render_children().strip() + '\n\n'

    if tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        return '\n\n' + render_children().strip().upper() + '\n\n'

    if 'src' in element.attrs:
        source = element['src']
        if source.startswith('cid:'):
            # Everything after the first ':' is the content id.
            cid = source.partition(':')[2]
            return f'\n\n[cid:{cid}]\n'

    return render_children()


def clean_email_content(text):
    """Strip markup remnants from an email body.

    Applied in order: remove ``<...>`` tags, collapse runs of CR/LF/TAB into a
    single newline, drop ``[if ...]...[endif]`` sections (which may span
    lines), drop ``[cid:...]`` markers, then trim surrounding whitespace.

    Args:
        text: Raw email body text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement, flags) applied sequentially; order matters.
    substitutions = (
        (r'<[^>]+>', '', 0),
        (r'[\r\n\t]+', '\n', 0),
        (r'\[if.*?\].*?\[endif\]', '', re.DOTALL),
        (r'\[cid:.*?\]', '', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
  
def html_to_clean_text_with_ordered_cids(email_obj):
    """Convert a mail item's HTML body to text and prepend a header block.

    The HTML is parsed, ``<style>``/``<script>`` tags are removed, and the rest
    is flattened with ``process_element``. Newline runs of three or more and
    repeated spaces are collapsed, and every line starting with ``From:`` is
    preceded by a ``-------------`` separator so quoted messages can be split
    later. Finally a ``From/Sent/To/Cc/Subject`` header built from the item's
    own fields is put in front.

    Args:
        email_obj: Object exposing ``HTMLBody``, ``SenderName``, ``SentOn``,
            ``To``, ``CC`` and ``Subject``.

    Returns:
        The header block followed by the cleaned body text.
    """
    # Tolerate a missing/empty HTML body instead of failing inside the parser.
    Hbody = email_obj.HTMLBody or ''
    soup = BeautifulSoup(Hbody, 'html.parser')

    # Remove style and script tags
    for tag in soup(['style', 'script']):
        tag.decompose()

    # html.parser does not invent a <body>; for fragments soup.body is None,
    # so fall back to the document root rather than crashing in process_element.
    root = soup.body if soup.body is not None else soup
    raw_text = process_element(root)

    clean_text = re.sub(r'\n{3,}', '\n\n', raw_text)  # Replace 3 or more newlines with 2
    clean_text = re.sub(r' +', ' ', clean_text)  # Remove extra spaces
    clean_text = clean_text.strip()  # Remove leading/trailing whitespace

    # Mark the start of each quoted message so the thread can be split later
    clean_text = re.sub(r'(\nFrom:)', r'-------------\1', clean_text)

    clean_text = f"""
From:{email_obj.SenderName}
Sent:{format_pywintypes_datetime(email_obj.SentOn)}
To:{email_obj.To}
Cc:{email_obj.CC}
Subject:{email_obj.Subject}

{clean_text}
"""

    return clean_text

def _split_name_and_email(entry):
    """Split ``Name <address>`` into a dict; ``email`` is None without angle brackets."""
    entry = entry.strip()
    match = re.search(r'(.*?)\s*<([^>]+)>', entry)
    if match:
        return {'name': match.group(1).strip(), 'email': match.group(2).strip()}
    return {'name': entry, 'email': None}


def _parse_recipients(raw_value):
    """Parse a ';'-separated recipient list, skipping blank entries."""
    return [_split_name_and_email(part) for part in raw_value.split(';') if part.strip()]


def extract_email_info(email_text, folder_path=None):
    """Extract header fields, body and attachment references from one email text.

    Args:
        email_text: Text containing ``From:``, ``Sent:``, ``To:``, ``Cc:`` and
            ``Subject:`` lines followed by the body.
        folder_path: Optional folder identifier. When given (``0`` included),
            attachment names are turned into ``attachments\\<folder_path>\\<name>``.

    Returns:
        A dict holding whichever of these keys could be found: ``from``,
        ``time``, ``time_py``, ``to``, ``cc``, ``subject``, ``email_content``
        and ``attachment`` (a list, or ``None`` when the body has no
        ``[cid:name@...]`` marker).
    """
    email_info = {}

    # Extract From
    from_match = re.search(r'From:(.*?)\n', email_text, re.DOTALL)
    if from_match:
        email_info['from'] = _split_name_and_email(from_match.group(1))

    # Extract Sent time
    sent_match = re.search(r'Sent:(.*?)\n', email_text, re.DOTALL)
    if sent_match:
        email_info['time'] = sent_match.group(1).strip()
        email_info['time_py'] = convert_to_timestamp(email_info['time'])

    # Extract To and Cc; an empty header now yields [] instead of a blank entry
    for key, label in (('to', 'To'), ('cc', 'Cc')):
        header_match = re.search(rf'{label}:(.*?)\n', email_text, re.DOTALL)
        if header_match:
            email_info[key] = _parse_recipients(header_match.group(1))

    # Extract Subject
    subject_match = re.search(r'Subject:(.*?)\n', email_text, re.DOTALL)
    if subject_match:
        email_info['subject'] = subject_match.group(1).strip()

    # Extract Content and Attachments
    content_match = re.search(r'Subject:.*?\n(.*)', email_text, re.DOTALL)
    if content_match:
        content = content_match.group(1).strip()
        # The name part may not contain '@' or ']', so a match can never start
        # inside one [cid:...] marker and end inside a later one.
        cid_pattern = r'\[cid:([^@\]]+)@[^\]]*\]'
        attachments = re.findall(cid_pattern, content)
        content = re.sub(cid_pattern, lambda m: f'****{m.group(1)}****', content)
        email_info['email_content'] = clean_email_content(content)
        email_info['attachment'] = attachments if attachments else None
        # Compare against None/'' explicitly: index 0 is a valid folder id.
        if attachments and folder_path not in (None, ''):
            email_info['attachment'] = [
                f"attachments\\{folder_path}\\{name}" for name in attachments
            ]

    return email_info

def get_email_list(clean_text, folder_path=None):
    """Split a thread text on the ``-------------`` separator and parse each part.

    Args:
        clean_text: Thread text with ``-------------`` between messages.
        folder_path: Passed through unchanged to ``extract_email_info``.

    Returns:
        One ``extract_email_info`` result per non-blank fragment, in order.
    """
    fragments = clean_text.split('-------------')
    # Whitespace-only fragments carry no message; parsing them would only add
    # empty dicts to the result.
    return [
        extract_email_info(fragment, folder_path)
        for fragment in fragments
        if fragment.strip()
    ]



def custom_encoder(obj):
    """``default`` hook for ``json.dump(s)``: serialize datetimes as ISO 8601.

    Args:
        obj: The value the JSON encoder could not serialize itself.

    Returns:
        ``obj.isoformat()`` for ``datetime`` instances.

    Raises:
        TypeError: For every other type.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
    return obj.isoformat()

def custom_decoder(dct):
    """``object_hook`` for ``json.load(s)``: revive the ``time_py`` field.

    A string stored under ``time_py`` is replaced in place by the ``datetime``
    parsed from it; strings that are not valid ISO format are left untouched.

    Args:
        dct: A decoded JSON object.

    Returns:
        The same ``dct`` object.
    """
    stored = dct.get('time_py')
    if isinstance(stored, str):
        try:
            dct['time_py'] = datetime.fromisoformat(stored)
        except ValueError:
            # Not ISO formatted: keep the original string.
            pass
    return dct

def process_new_dict(existing_list, item):
    """Decide how ``item`` relates to already collected entries with the same subject.

    Only the first entry of ``existing_list`` whose ``Subject`` equals
    ``item.Subject`` is considered.

    Args:
        existing_list: Iterable of objects exposing ``Subject`` and ``SentOn``.
        item: Candidate object exposing ``Subject`` and ``SentOn``.

    Returns:
        ``('REPLACE', entry)`` when ``item`` was sent later than that entry,
        ``('IGNORE', None)`` when it was not, and ``('ADD', None)`` when no
        entry shares the subject.
    """
    subject, sent_on = item.Subject, item.SentOn
    first_match = next(
        (entry for entry in existing_list if entry.Subject == subject),
        None,
    )
    if first_match is None:
        return 'ADD', None
    if sent_on > first_match.SentOn:
        return 'REPLACE', first_match
    return 'IGNORE', None

In [None]:
import win32com.client
import json

def find_pst_folder(OutlookObj, pst_filepath):
    """Return the root folder of the data-file store located at ``pst_filepath``.

    Args:
        OutlookObj: Object exposing an iterable ``Stores``.
        pst_filepath: Path compared for equality with each store's ``FilePath``.

    Returns:
        ``GetRootFolder()`` of the first store that is a data-file store with a
        matching path, or ``None`` when there is none.
    """
    candidates = (
        store
        for store in OutlookObj.Stores
        if store.IsDataFileStore and store.FilePath == pst_filepath
    )
    matching_store = next(candidates, None)
    if matching_store is None:
        return None
    return matching_store.GetRootFolder()

def enumerate_folders(FolderObj, save_path=None):
    """Process a folder tree depth-first: all sub-folders first, then the folder itself.

    Args:
        FolderObj: Object exposing an iterable ``Folders`` of child folders; it
            is also handed to ``iterate_messages``.
        save_path: Output directory forwarded to ``iterate_messages``, which
            requires it.

    Raises:
        ValueError: If ``save_path`` is not supplied.

    NOTE(review): ``iterate_messages`` names attachment folders by the message
    index within one folder, so walking several folders into the same
    ``save_path`` can reuse folder names — confirm before relying on this.
    """
    if save_path is None:
        raise ValueError("enumerate_folders requires save_path (forwarded to iterate_messages)")
    for ChildFolder in FolderObj.Folders:
        enumerate_folders(ChildFolder, save_path)
    iterate_messages(FolderObj, save_path)

def iterate_messages(FolderObj, save_path):
    """Export every item of a folder as one JSON line plus its attachments.

    For each item (numbered from 0) the body is converted with
    ``html_to_clean_text_with_ordered_cids`` and split with ``get_email_list``;
    attachments are saved under ``<save_path>/attachments/<index>/`` and the
    parsed list is appended as one line to ``<save_path>/data.json``. An item
    that raises is reported together with the error and skipped.

    Args:
        FolderObj: Object exposing an iterable ``Items``.
        save_path: Directory receiving ``data.json`` and ``attachments``.

    Returns:
        None.
    """
    attachment_path = os.path.join(save_path, "attachments")
    json_path = os.path.join(save_path, "data.json")
    for index, item in enumerate(FolderObj.Items):
        try:
            folder_path = os.path.join(attachment_path, str(index))
            clean_email = html_to_clean_text_with_ordered_cids(item)

            # Pass the index as text: it formats identically in the attachment
            # paths, but index 0 is no longer mistaken for "no folder".
            email_list = get_email_list(clean_email, str(index))

            attachments = item.Attachments
            count_attachments = attachments.Count
            if count_attachments > 0:
                # Creates missing parents and tolerates re-runs.
                os.makedirs(folder_path, exist_ok=True)
                for position in range(1, count_attachments + 1):  # 1-based access, as before
                    attachment = attachments.Item(position)
                    # basename() keeps a crafted file name inside folder_path.
                    file_name = os.path.basename(attachment.FileName)
                    attachment.SaveAsFile(os.path.join(folder_path, file_name))

            with open(json_path, 'a') as json_file:
                json_file.write(json.dumps(email_list, default=custom_encoder) + '\n')
            print('completed', index)

        except Exception as exc:
            # Best effort per item, but say why it was skipped.
            print('skipped', index, repr(exc))
        

    

# Attach the PST file as an Outlook store, export the messages of its root
# folder, and detach the store again.
Outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

pst = r"C:\Users\v-samomin\Downloads\PST\PDMLink_Admin_forLLM.pst"
save_path = r"C:\Users\v-samomin\Downloads\PST"
Outlook.AddStore(pst)
PSTFolderObj = find_pst_folder(Outlook, pst)
if PSTFolderObj is None:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise RuntimeError(f"No data-file store found for {pst}")

# NOTE(review): only the items directly in the root folder are exported here;
# sub-folders are not walked.
try:
    full_email = iterate_messages(PSTFolderObj, save_path)
finally:
    # Detach the store even if the export raised.
    Outlook.RemoveStore(PSTFolderObj)

 

In [19]:
def remove_duplicate_lists(input_file, output_file):
    """Drop email lists whose last entry repeats the content of an earlier list.

    ``input_file`` holds one JSON list of email dicts per line. Two lists are
    duplicates when the ``email_content`` of their last dict is equal; the
    first occurrence is kept. Empty lists, and lists whose last dict has no or
    empty ``email_content``, are always kept.

    Args:
        input_file: Path of the JSON-lines file to read.
        output_file: Path the de-duplicated JSON lines are written to.

    Returns:
        The de-duplicated list of lists, in their original order.
    """
    with open(input_file, 'r') as file:
        list_of_lists = [
            json.loads(line, object_hook=custom_decoder)
            for line in file
            if line.strip()  # tolerate blank lines
        ]

    seen_contents = set()
    unique_lists = []
    for email_list in list_of_lists:
        # Guard the [-1] access: an empty list has no last email to compare.
        last_content = email_list[-1].get('email_content') if email_list else None
        if last_content:
            if last_content in seen_contents:
                continue
            seen_contents.add(last_content)
        unique_lists.append(email_list)

    with open(output_file, 'w') as file:
        for email_list in unique_lists:
            json.dump(email_list, file, default=custom_encoder)
            file.write('\n')

    return unique_lists



# Input: the JSON-lines file that iterate_messages appends to (data.json under the export directory).
input_file = r"C:\Users\v-samomin\Downloads\PST\data.json"

# Output: the de-duplicated copy written by remove_duplicate_lists.
output_file = r"C:\Users\v-samomin\Downloads\PST\data_cleaned.json"



# `result` holds the de-duplicated list of lists returned by the function.
result = remove_duplicate_lists(input_file, output_file)


In [37]:
import json
# Reload the de-duplicated JSON-lines file: one list of email dicts per line,
# with string 'time_py' values turned back into datetime objects by custom_decoder.
with open(r"C:\Users\v-samomin\Downloads\PST\data_cleaned.json", 'r') as file:
    list_of_lists = [json.loads(line, object_hook=custom_decoder) for line in file]

In [38]:
# Display the 'attachment' value of the first dict in the first list.
list_of_lists[0][0]['attachment']#[0].split('\\')[1]

In [41]:
# Collect, in first-seen order, the folder component (second backslash-separated
# segment) of every attachment path still referenced by the loaded emails.
# The result feeds the cleanup step, so problems are reported instead of being
# silently swallowed.
attachments = []
for emails in list_of_lists:
    for email in emails:
        email_ = email  # module-level name kept from the original cell
        # A missing or None 'attachment' simply means there is nothing to collect.
        for attach in email.get('attachment') or []:
            parts = attach.split('\\')
            if len(parts) < 2:
                print(f"Unexpected attachment path: {attach!r}")
                continue
            folder = parts[1]
            if folder not in attachments:
                attachments.append(folder)



In [45]:
import os
import shutil

# Path to the parent directory containing folders
parent_dir = r"C:\Users\v-samomin\Downloads\PST\attachments"

# Folder names to keep (collected from the attachment paths of the loaded emails)
folders_to_keep = attachments

# Get a list of all directories in the parent directory
all_folders = [d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d))]

if not folders_to_keep:
    # An empty keep-list would delete every folder; treat it as an upstream
    # problem rather than as an instruction to wipe the directory.
    print("No folders to keep were collected; skipping deletion.")
else:
    keep_lookup = set(folders_to_keep)  # constant-time membership checks
    # Delete every folder that is not in the keep-list
    for folder in all_folders:
        if folder not in keep_lookup:
            folder_path = os.path.join(parent_dir, folder)
            try:
                shutil.rmtree(folder_path)  # Deletes the entire folder and its contents
                print(f"Deleted folder: {folder}")
            except Exception as e:
                print(f"Failed to delete {folder}: {e}")

    print("Cleanup complete.")


Deleted folder: 100
Deleted folder: 1000
Deleted folder: 1001
Deleted folder: 1002
Deleted folder: 1003
Deleted folder: 1004
Deleted folder: 1005
Deleted folder: 1006
Deleted folder: 1007
Deleted folder: 1008
Deleted folder: 1011
Deleted folder: 1012
Deleted folder: 1013
Deleted folder: 1014
Deleted folder: 1015
Deleted folder: 1016
Deleted folder: 1017
Deleted folder: 1018
Deleted folder: 1019
Deleted folder: 102
Deleted folder: 1023
Deleted folder: 1024
Deleted folder: 1025
Deleted folder: 1026
Deleted folder: 1027
Deleted folder: 1028
Deleted folder: 1029
Deleted folder: 103
Deleted folder: 1030
Deleted folder: 1031
Deleted folder: 1033
Deleted folder: 1037
Deleted folder: 1038
Deleted folder: 1039
Deleted folder: 1040
Deleted folder: 1042
Deleted folder: 1043
Deleted folder: 1044
Deleted folder: 1045
Deleted folder: 1046
Deleted folder: 1048
Deleted folder: 105
Deleted folder: 1050
Deleted folder: 1051
Deleted folder: 1052
Deleted folder: 1054
Deleted folder: 1057
Deleted folder: 1