In [None]:
import csv
import re
# Sample chat-history text used by the demo cells in this notebook.
# Layout visible in the text: an optional display-name line, then a
# "MM/DD/YYYY H:MM AM/PM" timestamp line, then one or more message lines.
# A bare "MM/DD/YYYY" line appears where the day changes.
test_history = '''test stub CTR USCOM TECH (USA)
06/24/2024 11:34 PM
login account
06/25/2024
Test User MR USCOM FINC (USA)
06/25/2024 12:15 AM
Good morning, Anchor is online
Stub test CIV USCOM OPS (USA)
06/25/2024 5:12 AM
account request
nsw socrates printer
Account, Test U MS USCOM CSTE (USA)
06/25/2024 8:08 AM
admin account
06/25/2024 8:10 AM
account-request
User, New CTR USCOM TECH (USA)
06/25/2024 8:24 AM
Is this thing on?
06/26/2024
Test User MRS USCOM OPS (USA)
06/26/2024 1:26 PM
Good afternoon, this is Anchor'''
print(test_history)

test stub CTR USCOM TECH (USA)
06/24/2024 11:34 PM
login account
06/25/2024
Test User MR USCOM FINC (USA)
06/25/2024 12:15 AM
Good morning, Anchor is online
Stub test CIV USCOM OPS (USA)
06/25/2024 5:12 AM
account request
nsw socrates printer
Account, Test U MS USCOM CSTE (USA)
06/25/2024 8:08 AM
admin account
06/25/2024 8:10 AM
account-request
User, New CTR USCOM TECH (USA)
06/25/2024 8:24 AM
Is this thing on?
06/26/2024
Test User MRS USCOM OPS (USA)
06/26/2024 1:26 PM
Good afternoon, this is Anchor


This notebook uses Python to reorganize a multi-line Jabber chat log into one row per entry.

Split the log by date, then by datetime. Then move each user name to its correct entry: after the split, the name is left at the end of the prior entry.

In [None]:
def split_string_by_dates(s):
    """Split a chat log into per-day sections at date-only header lines.

    A header is a line that consists solely of ``MM/DD/YYYY``.  Lines that
    carry anything after the date (such as a time of day) are not headers
    and stay inside their section.

    Args:
        s: The log text.

    Returns:
        A list of ``(date, section)`` tuples in order of appearance, where
        ``section`` is the whitespace-stripped text between that header and
        the next header (or the end of ``s``).  Text that precedes the first
        header is not returned; if ``s`` has no header the list is empty.
    """
    # The capture group makes re.split keep each matched date in its output,
    # so a single pass yields [leading, date1, section1, date2, section2, ...].
    date_header = r'^(\d{2}/\d{2}/\d{4})$'

    # flags is passed by keyword: positional maxsplit/flags are deprecated.
    pieces = re.split(date_header, s, flags=re.MULTILINE)

    # Dates sit at the odd indexes, each followed by its section.
    return [(pieces[i], pieces[i + 1].strip()) for i in range(1, len(pieces), 2)]

# Demo: a short synthetic string first, then the full sample log.
# Text before the first date-only line ('start ' here) is not part of the result.
s = 'start \n07/12/2024\n: I started my new job. \n07/13/2024\n: I had my first meeting.'
print(split_string_by_dates(s))
print(split_string_by_dates(test_history))

[('07/12/2024', ': I started my new job.'), ('07/13/2024', ': I had my first meeting.')]
[('06/25/2024', 'Test User MR USCOM FINC (USA)\n06/25/2024 12:15 AM\nGood morning, Anchor is online\nStub test CIV USCOM OPS (USA)\n06/25/2024 5:12 AM\naccount request\nnsw socrates printer\nAccount, Test U MS USCOM CSTE (USA)\n06/25/2024 8:08 AM\nadmin account\n06/25/2024 8:10 AM\naccount-request\nUser, New CTR USCOM TECH (USA)\n06/25/2024 8:24 AM\nIs this thing on?'), ('06/26/2024', 'Test User MRS USCOM OPS (USA)\n06/26/2024 1:26 PM\nGood afternoon, this is Anchor')]


In [None]:
def split_string_by_datetime(s):
    """Split one section of a chat log at its timestamp lines.

    A timestamp has the form ``MM/DD/YYYY H:MM AM`` (or ``PM``), with a
    one- or two-digit hour, and must span from a line start to a line end.

    Args:
        s: The text to split.

    Returns:
        A list of ``(timestamp, text)`` tuples.  The first tuple always has
        an empty timestamp and holds the stripped text that precedes the
        first timestamp.  Each following tuple pairs a timestamp with the
        stripped text up to the next timestamp or the end of ``s``.
    """
    # The capture group makes re.split keep each matched timestamp, so one
    # pass yields [leading, stamp1, text1, stamp2, text2, ...].
    stamp_line = r'^(\d{2}/\d{2}/\d{4}\s\d{1,2}:\d{2}\s[AP]M)$'

    # flags is passed by keyword: positional maxsplit/flags are deprecated.
    pieces = re.split(stamp_line, s, flags=re.MULTILINE)

    result = [("", pieces[0].strip())]
    # Timestamps sit at the odd indexes, each followed by its text.
    result.extend((pieces[i], pieces[i + 1].strip()) for i in range(1, len(pieces), 2))
    return result

# Demo: a short synthetic string first, then the first day section of the
# sample log.  The first tuple of each result has an empty timestamp and holds
# the text that precedes the first timestamp line.
s = 'start \n07/12/2024 12:15 PM\n: I started my new job. \n07/13/2024 6:15 AM\n: I had my first meeting.'
print(split_string_by_datetime(s))
print(split_string_by_datetime(split_string_by_dates(test_history)[0][1]))

[('', 'start'), ('07/12/2024 12:15 PM', ': I started my new job.'), ('07/13/2024 6:15 AM', ': I had my first meeting.')]
[('', 'Test User MR USCOM FINC (USA)'), ('06/25/2024 12:15 AM', 'Good morning, Anchor is online\nStub test CIV USCOM OPS (USA)'), ('06/25/2024 5:12 AM', 'account request\nnsw socrates printer\nAccount, Test U MS USCOM CSTE (USA)'), ('06/25/2024 8:08 AM', 'admin account'), ('06/25/2024 8:10 AM', 'account-request\nUser, New CTR USCOM TECH (USA)'), ('06/25/2024 8:24 AM', 'Is this thing on?')]


In [None]:
def cleanup_username_splits(entries, extracting_user="Lau, Peter M CTR USCOM OPS (USA)"):
    """Move each display name onto the entry it belongs to.

    Splitting on timestamp lines leaves a message's author as the LAST line
    of the previous entry, because the display name precedes the timestamp
    in the log.  This walks the entries from last to first and rewrites each
    one so that its own author is its last line.

    An entry that is not preceded by a display name (a line shaped like
    ``Last, First M Title USCOM``, comma and middle initial optional) is
    attributed to ``extracting_user``, whose name the log does not show.

    Args:
        entries: List of ``(timestamp, text)`` tuples whose first element is
            the leading ``("", text)`` tuple.  The list is modified in place.
        extracting_user: Display name to use for entries that have no
            preceding display-name line.

    Returns:
        The same list object, with the leading tuple removed and every
        remaining entry's text ending in its author's display name.
    """
    if not entries:
        return entries

    # Raw string so \s and \w reach the regex engine untouched; [A-Za-z]
    # rather than [A-z], which would also accept [ \ ] ^ _ and backtick.
    re_display_name = re.compile(r'^[A-Za-z]+,?\s[A-Za-z]+\s(?:\w\s)?[A-Za-z]{2,4}\sUSCOM')

    last_index = len(entries) - 1
    for i in range(last_index, 0, -1):  # skips i=0
        last_split = entries[i-1][1].split('\n')
        split = entries[i][1].split('\n')

        if re_display_name.match(last_split[-1]):
            user = last_split[-1]
        else:
            user = extracting_user
            # Give the previous entry a trailing placeholder line, so the next
            # iteration overwrites the placeholder and not its last message.
            last_split.append(extracting_user)
            entries[i-1] = (entries[i-1][0], '\n'.join(last_split))

        if i == last_index:
            # Nothing follows the final entry, so it has no trailing line to
            # overwrite; the author is appended instead.
            split.append(user)
        else:
            # The trailing line is the next author's name (already moved) or
            # a placeholder; either way it is replaced by this entry's author.
            split[-1] = user
        entries[i] = (entries[i][0], '\n'.join(split))

    entries.pop(0)  # the leading tuple only carried the first author
    return entries

# Demo: a short synthetic string first, then the first day section of the
# sample log.
# NOTE(review): the synthetic sample below has date-only lines and no
# "H:MM AM/PM" part, so split_string_by_datetime finds no timestamp, returns
# just its leading ("", text) tuple, and cleanup_username_splits removes that
# tuple - the first print shows [].  `single` is assigned here but not read in
# this cell.  Consider giving the sample real timestamps.
s = '''test, user
07/12/2023
I started my new job.
user, new
07/13/2023
I had my first meeting.'''
single = s.replace('\n', ' ')
print(cleanup_username_splits(split_string_by_datetime(s)))
print(cleanup_username_splits(split_string_by_datetime(split_string_by_dates(test_history)[0][1])))

[]
[('06/25/2024 12:15 AM', 'Good morning, Anchor is online\nTest User MR USCOM FINC (USA)'), ('06/25/2024 5:12 AM', 'account request\nnsw socrates printer\nStub test CIV USCOM OPS (USA)'), ('06/25/2024 8:08 AM', 'admin account\nAccount, Test U MS USCOM CSTE (USA)'), ('06/25/2024 8:10 AM', 'account-request\nLau, Peter M CTR USCOM OPS (USA)'), ('06/25/2024 8:24 AM', 'Is this thing on?\nUser, New CTR USCOM TECH (USA)')]


In [None]:
def extract_username(s):
    """Separate the trailing user line from the rest of an entry's text.

    Args:
        s: Entry text whose last line is the user's display name.

    Returns:
        A ``(user, events)`` tuple: ``user`` is the last line of ``s`` and
        ``events`` is everything before it with surrounding whitespace
        stripped.  For a single-line ``s`` the whole string is ``user`` and
        ``events`` is empty.
    """
    # Cut at the final newline only.  Removing the user's text with
    # str.replace would also delete it from inside earlier message lines.
    events, _, user = s.rpartition('\n')
    return (user, events.strip())

# Demo, three cases: a multi-line string (its last line comes back as the
# user), the same text on a single line (the whole string comes back as the
# user and nothing remains), and the first cleaned-up entry of the sample log.
s = '''07/12/2023
I started my new job.
07/13/2023
I had my first meeting.'''
single = s.replace('\n', ' ')
print(extract_username(s))
print(extract_username(single))
print(extract_username(cleanup_username_splits(split_string_by_datetime(split_string_by_dates(test_history)[0][1]))[0][1]))

('I had my first meeting.', '07/12/2023\nI started my new job.\n07/13/2023')
('07/12/2023 I started my new job. 07/13/2023 I had my first meeting.', '')
('Test User MR USCOM FINC (USA)', 'Good morning, Anchor is online')


In [None]:
def get_entries(text):
    """Yield one ``(day, datetime, user, event)`` tuple per message line.

    The log is cut into day sections at date-only header lines, each section
    is cut into timestamped entries, the display names are moved onto the
    entries they belong to, and every message line becomes its own tuple.

    Messages that appear before the first date-only header are yielded as
    well; they have no header, so their ``day`` is the date part of the
    message's own timestamp.

    Args:
        text: The full chat-history text.

    Yields:
        ``(day, datetime, user, event)`` tuples of strings, in log order.
    """
    def rows(day, section):
        # One section -> tuples; `day` is '' for the header-less lead-in.
        for (dt, u_events) in cleanup_username_splits(split_string_by_datetime(section)):
            (user, events) = extract_username(u_events)
            for event in events.split('\n'):
                yield (day or dt.split()[0], dt, user, event)

    # Everything ahead of the first date-only header line (or the whole text
    # when there is no header) would otherwise never reach the output.
    first_header = re.search(r'^\d{2}/\d{2}/\d{4}$', text, re.MULTILINE)
    leading = text[:first_header.start()] if first_header else text
    yield from rows('', leading.strip())

    for (day, du_events) in split_string_by_dates(text):
        yield from rows(day, du_events)

# Demo: print every (day, datetime, user, event) tuple produced from the
# sample log, one per line.
for entry in get_entries(test_history):
    print(entry)

('06/25/2024', '06/25/2024 12:15 AM', 'Test User MR USCOM FINC (USA)', 'Good morning, Anchor is online')
('06/25/2024', '06/25/2024 5:12 AM', 'Stub test CIV USCOM OPS (USA)', 'account request')
('06/25/2024', '06/25/2024 5:12 AM', 'Stub test CIV USCOM OPS (USA)', 'nsw socrates printer')
('06/25/2024', '06/25/2024 8:08 AM', 'Account, Test U MS USCOM CSTE (USA)', 'admin account')
('06/25/2024', '06/25/2024 8:10 AM', 'Lau, Peter M CTR USCOM OPS (USA)', 'account-request')
('06/25/2024', '06/25/2024 8:24 AM', 'User, New CTR USCOM TECH (USA)', 'Is this thing on?')
('06/26/2024', '06/26/2024 1:26 PM', 'Test User MRS USCOM OPS (USA)', 'Good afternoon, this is Anchor')


In [None]:
import re

## Import
#infile = open('input.txt', 'r')
#input_data = infile.read()
#infile.close()
## Write out
# `with` closes the file even if get_entries raises part-way through.
# newline='' leaves line endings to the csv writer, as the csv module expects.
with open('output.csv', 'w', newline='') as outfile:
    outfile.write('Date,Time,User,Event\n') #Header
    # QUOTE_ALL keeps every field quoted as before; the writer also doubles
    # any '"' inside a field, so a message containing quotes stays one cell.
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
    for entry in get_entries(test_history):
        writer.writerow(entry)