# Time Agg Entries

This is a small Python script that takes a date range, reads a directory to see which of those dates have journal entries, pulls the matching dates in as Obsidian backlinks, and adds a preset template to the body of a new note.

Ok it got bigger. We included a pretty cool AI analysis tool.


I also want to teach this. Note the steps:

1. All the fundamentals of Python
2. Getting a local LLM up and running
3. Write the local TimeAgg class and show that it works on one set of entries
4. Make a batch script and utilities that help manage the start date (sd) / end date (ed) ranges

In [1]:
import os
import pandas as pd
from langchain_community.llms import Ollama

In [2]:
class TimeAgg:
    """
    Aggregates dated journal entries into a single note with an LLM-written analysis.

    Constructing an instance does all of the work: it lists `dir`, selects the
    'YYYY-MM-DD.md' entries whose date falls between sd and ed (inclusive), prompts an
    Ollama "llama3" model with their text, and writes '<sd>:<ed>.md' into the current
    working directory.
    """

    def __init__(self, sd: str, ed: str, dir: str ='/Users/shrey/Personal/journal/pages/journal/daily_journal_test/'):
        """
        sd: string start date format YYYY-MM-DD
        ed: string end date format YYYY-MM-DD
        dir: directory with text files containing journal entries, with filename convention of dates
        """
        self.sd = sd
        self.ed = ed
        self.dir = dir

        self.files = os.listdir(self.dir)

        self.get_files_in_daterange()
        self.gen_smart_summary()
        self.gen_template()

    def get_files_in_daterange(self):
        """
        works out which journal pages fall inside [sd, ed].

        sets:
            filter_dates_list: every date in the range as 'YYYY-MM-DD' strings
            pages_dates_list: filename stems of the '.md' files found in the directory
            selected_dates_list: sorted dates that are both in the range and on disk
            selected_pages_list: the matching filenames ('<date>.md')
        """
        self.filter_dates_list = list(pd.date_range(self.sd, self.ed).strftime('%Y-%m-%d'))

        # Only '.md' files count as pages. Without this check a stray file such as
        # '2024-01-03.png' would be selected and then opened as '2024-01-03.md',
        # which does not exist.
        self.pages_dates_list = [
            stem for stem, ext in map(os.path.splitext, self.files) if ext == '.md'
        ]

        self.selected_dates_list = sorted(set(self.filter_dates_list) & set(self.pages_dates_list))
        self.selected_pages_list = [date + '.md' for date in self.selected_dates_list]

    def gen_smart_summary(self):
        """
        this is the part where we prompt an LLM to summarize our entries from this time period, give us feedback, and offer questions / writing prompts

        sets subquery_1 / subquery_2 / subquery_3, the combined query, and smart_summary (the model's reply).
        """

        self.llm = Ollama(model="llama3")

        # grab the journal text: every selected page followed by a newline, in date order
        entries = []
        for page in self.selected_pages_list:
            # os.path.join works whether or not self.dir ends with a path separator
            with open(os.path.join(self.dir, page), 'r', encoding='utf-8') as file:
                entries.append(file.read() + "\n")
        j_text = "".join(entries)

        self.subquery_1 = f"Here are my journal entries from {self.sd} to {self.ed}: "
        self.subquery_2 = j_text

        # The backslash continuations keep this a single string literal; the leading
        # spaces of each continued line are part of the prompt text.
        self.subquery_3 = f"**AI Journal Entry Analysis**\
        Analyze my journal entries from {self.sd} to {self.ed}.\
        In your analysis, answer the following questions:\
        What was my focus during this time period--what did I want? Respond to this under the heading **Focus:**.\
        What were my relationships with other people like? Who did I talk to? How did I feel about them? Respond to this under the heading **People:**.\
        What did I work on, achieve, read, or practice (for example: coding, meditation, yoga, my job)? Respond to this under the heading **Practice:**.\
        What did I learn about myself during this time period? Respond to this under the heading **Insights:**.\
        Write the analysis in second person: 'You did...'"

        self.query = self.subquery_1 + self.subquery_2 + self.subquery_3
        self.smart_summary = self.llm(self.query)

    def gen_template(self):
        """
        writes the output note '<sd>:<ed>.md' in the current working directory: a backlink for
        every selected date, the AI analysis, and a prompt for a human reflection.
        """

        self.backlinks_str = 'backlinked dates: ' + ''.join(
            '[[' + date + ']], ' for date in self.selected_dates_list
        )

        with open(self.sd + ':' + self.ed + '.md', 'w', encoding='utf-8') as f:
            f.write("Date: \n\n")
            f.write(self.backlinks_str + "\n\n")
            f.write("backlinks: \n\n")
            f.write(self.smart_summary)
            f.write("\n\n**Human Reflection:**\n\n")
            f.write("What is the AI analysis bringing up for you? What lessons or shifts in focus do you want to take into the future? Reflect: \n\n")

        

In [3]:
def parse_backfill_dates(sd, ed, backfill_window=7):
    """
    splits the range from sd to ed into consecutive windows of 'backfill_window' days.

    sd: overall start date (e.g. 'YYYY-MM-DD', parsed with pd.to_datetime)
    ed: overall end date (same format)
    backfill_window: int, number of days in each window, must be at least 1

    makes tuples of start and end dates, inclusive of the sd value provided, such that each tuple is 'backfill_window' days long,
    until the point where the new start date of a pair would exceed the ed given.
    the last window is not clipped, so its end date can fall after ed.

    returns a list of (start, end) tuples of 'YYYY-MM-DD' strings; the list is empty when sd is after ed
    raises ValueError if backfill_window is smaller than 1
    """
    if backfill_window < 1:
        # with a zero or negative step the start date would never pass ed and the loop would not end
        raise ValueError('backfill_window must be at least 1 day, got ' + str(backfill_window))

    step = pd.Timedelta(backfill_window, unit='D')
    span = pd.Timedelta(backfill_window - 1, unit='D')  # start and end are both inclusive

    window_start = pd.to_datetime(sd)
    last_day = pd.to_datetime(ed)

    sd_ed_tuples = []
    while window_start <= last_day:
        window_end = window_start + span
        sd_ed_tuples.append((window_start.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')))
        window_start = window_start + step

    return sd_ed_tuples

In [71]:
def parse_backfill_dates_by_month(sd, ed):
    """
    builds one (first day, last day) pair for every calendar month touched by sd..ed,
    including the months that sd and ed themselves fall in.
    example: sd 2024-03-05 and ed 2024-05-20 give pairs for 2024-03, 2024-04 and 2024-05

    sd, ed: 'YYYY-MM-DD' strings
    returns a list of ('YYYY-MM-DD', 'YYYY-MM-DD') tuples
    """
    date_fmt = '%Y-%m-%d'

    # snap both ends back to the first day of their month
    first_month = pd.to_datetime(sd, format=date_fmt).to_period('M').to_timestamp()
    last_month = pd.to_datetime(ed, format=date_fmt).to_period('M').to_timestamp()

    month_pairs = []
    for month_start in pd.date_range(first_month, last_month, freq='MS'):
        # MonthEnd(0) rolls a date forward to the last day of its own month
        month_end = month_start + MonthEnd(0)
        month_pairs.append((month_start.strftime(date_fmt), month_end.strftime(date_fmt)))

    return month_pairs

In [73]:
parse_backfill_dates_by_month('2024-01-01', '2024-02-17')

[('2024-01-01', '2024-01-31'), ('2024-02-01', '2024-02-29')]

In [300]:
len(parse_backfill_dates('2024-01-01', '2024-01-17'))

3

In [70]:
p1 = pd.to_datetime('2024-01-03', format='%Y-%m-%d').to_period('M').to_timestamp()
p2 = pd.to_datetime('2024-03-24', format='%Y-%m-%d').to_period('M').to_timestamp()
pd.date_range(p1, p2, freq='MS')

DatetimeIndex(['2024-01-01', '2024-02-01', '2024-03-01'], dtype='datetime64[ns]', freq='MS')

In [299]:
('2024-01-01', '2024-01-17')[0]

'2024-01-01'

In [277]:
pd.date_range('2024-01-01', '2024-03-12', freq='SMS')

DatetimeIndex(['2024-01-01', '2024-01-15', '2024-02-01', '2024-02-15',
               '2024-03-01'],
              dtype='datetime64[ns]', freq='SMS-15')

In [288]:
pd.to_datetime('2024-01-01')+pd.Timedelta(7, unit='D')

Timestamp('2024-01-08 00:00:00')

In [54]:
pd.date_range('2024-01-02', '2024-03-24', freq='MS')

DatetimeIndex(['2024-02-01', '2024-03-01'], dtype='datetime64[ns]', freq='MS')

In [52]:
pd.date_range('2024-01-01', '2024-03-24', freq='M')

DatetimeIndex(['2024-01-31', '2024-02-29'], dtype='datetime64[ns]', freq='M')

In [30]:
from pandas.tseries.offsets import MonthEnd

In [32]:
[el + MonthEnd(0) for el in pd.date_range('2024-01', '2024-03', freq='MS')]

[Timestamp('2024-01-31 00:00:00'),
 Timestamp('2024-02-29 00:00:00'),
 Timestamp('2024-03-31 00:00:00')]

In [22]:
pd.date_range('2024-01-01', '2024-03-30', freq='MS')

DatetimeIndex(['2024-01-01', '2024-02-01', '2024-03-01'], dtype='datetime64[ns]', freq='MS')

In [10]:
pd.date_range('2024-01-01', '2024-03-31', freq='M')

DatetimeIndex(['2024-01-31', '2024-02-29', '2024-03-31'], dtype='datetime64[ns]', freq='M')

In [280]:
ta = TimeAgg('2024-01-01', '2024-01-14')

In [109]:
ta.backlinks_str

'backlinked dates: [[2024-01-07]], [[2024-01-03]], [[2024-01-06]], '

In [218]:
ta.filter_dates_list

['2024-01-01',
 '2024-01-02',
 '2024-01-03',
 '2024-01-04',
 '2024-01-05',
 '2024-01-06',
 '2024-01-07',
 '2024-01-08',
 '2024-01-09',
 '2024-01-10',
 '2024-01-11',
 '2024-01-12',
 '2024-01-13',
 '2024-01-14']

In [222]:
ta.pages_dates_list

['2023-12-22',
 '2024-04-30',
 '2023-10-02',
 '2022-10-02',
 '2023-12-16',
 '2024-04-04',
 '2023-08-28',
 '2022-07-25',
 '2023-05-31',
 '2022-09-26',
 '2022-09-12',
 '2023-09-12',
 '2022-05-15',
 '2022-05-21',
 '2023-01-04',
 '2023-03-10',
 '2023-12-06',
 '2022-10-12',
 '2024-04-20',
 '2023-01-30',
 '2023-11-28',
 '2022-11-28',
 '2023-10-26',
 '2022-10-26',
 '2022-12-12',
 '2023-10-06',
 '2022-10-06',
 '2023-01-10',
 '2022-11-08',
 '2023-12-26',
 '2022-12-26',
 '2023-03-30',
 '2023-01-24',
 '2022-08-18',
 '2022-07-15',
 '2022-05-01',
 '2022-09-16',
 '2023-05-01',
 '2024-02-05',
 '2023-09-22',
 '2022-07-21',
 '2023-07-21',
 '2022-07-31',
 '2023-05-25',
 '2023-05-11',
 '2024-02-15',
 '2023-09-06',
 '2022-05-11',
 '2022-07-05',
 '2023-07-05',
 '2022-08-08',
 '2023-03-20',
 '2024-04-24',
 '2023-10-22',
 '2022-10-22',
 '2023-03-14',
 '2022-10-16',
 '2023-10-16',
 '2022-12-02',
 '2022-12-13',
 '2023-10-07',
 '2022-11-09',
 '2024-04-01',
 '2023-03-05',
 '2023-01-25',
 '2022-08-19',
 '2022-09-

In [223]:
# list.sort() sorts in place and returns None, so assigning its result left x empty;
# sorted() returns the sorted list itself.
x = sorted(ta.pages_dates_list)

In [224]:
x

In [90]:
list(pd.date_range(ta.sd, ta.ed).strftime('%Y-%m-%d'))

['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']

In [49]:
ta.sd

'2024-01-01'

In [50]:
ta.ed

'2024-01-05'

In [51]:
ta.dir

'/Users/shrey/Personal/journal/pages/journal/daily_journal/'

In [58]:
files = os.listdir(ta.dir)

In [64]:
[file.split('.')[0] for file in files]

['2023-12-22',
 '2024-04-30',
 '2023-10-02',
 '2022-10-02',
 '2023-12-16',
 '2024-04-04',
 '2023-08-28',
 '2022-07-25',
 '2023-05-31',
 '2022-09-26',
 '2022-09-12',
 '2023-09-12',
 '2022-05-15',
 '2022-05-21',
 '2023-01-04',
 '2023-03-10',
 '2023-12-06',
 '2022-10-12',
 '2024-04-20',
 '2023-01-30',
 '2023-11-28',
 '2022-11-28',
 '2023-10-26',
 '2022-10-26',
 '2022-12-12',
 '2023-10-06',
 '2022-10-06',
 '2023-01-10',
 '2022-11-08',
 '2023-12-26',
 '2022-12-26',
 '2023-03-30',
 '2023-01-24',
 '2022-08-18',
 '2022-07-15',
 '2022-05-01',
 '2022-09-16',
 '2023-05-01',
 '2024-02-05',
 '2023-09-22',
 '2022-07-21',
 '2023-07-21',
 '2022-07-31',
 '2023-05-25',
 '2023-05-11',
 '2024-02-15',
 '2023-09-06',
 '2022-05-11',
 '2022-07-05',
 '2023-07-05',
 '2022-08-08',
 '2023-03-20',
 '2024-04-24',
 '2023-10-22',
 '2022-10-22',
 '2023-03-14',
 '2022-10-16',
 '2023-10-16',
 '2022-12-02',
 '2022-12-13',
 '2023-10-07',
 '2022-11-09',
 '2024-04-01',
 '2023-03-05',
 '2023-01-25',
 '2022-08-19',
 '2022-09-

In [62]:
files[0].split('.')

['2023-12-22', 'md']

# Script backup

In [None]:
import os
import pandas as pd
from langchain_community.llms import Ollama
import argparse
import time
from pandas.tseries.offsets import MonthEnd

def _parse_bool_flag(value):
    """Turns a command-line string such as 'True', 'false', '1' or 'no' into a bool."""
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected True or False, got ' + repr(value))


def get_args():
    """
    parses the command line and returns (sd, ed, i, o, bfws, bfm).

    sd: start date condition for journal entries we'll analyze (inclusive)
    ed: end date condition for journal entries we'll analyze (inclusive)
    i: input directory where journal entries live with naming convention of date (YYYY-MM-DD)
    o: output directory where we deposit smart summaries
    bfws: if we're not backfilling the default None will use the exact sd and ed provided. If we provide a
        value here then we'll start backfilling, taking chunks each sized the number of days provided, from within the
        sd and ed. for example, if sd=2024-01-01 and ed=2024-03-01, and we have no backfill-wsize, we'll just get a
        AI analysis for those whole two months. but if we set it to 7, we'll get a weekly analysis for every week in
        those two months. there is fencepost behavior. we make tuples of start and end dates, inclusive of the
        sd value provided, such that each tuple is 'backfill_window' days long, until the point where the
        new start date of a pair would exceed the ed given. for example if sd=2024-01-01, ed = 2024-01-08, and
        bfws=7, then our tuples will be 2024-01-01:2024-01-07 and 2024-01-08:2024-01-14
    bfm: for monthly summaries (where there is no fixed backfill window size). providing a bfws will override this!
        accepts '-bfm True' / '-bfm False', or a bare '-bfm' which means True.
    """
    parser = argparse.ArgumentParser(description='Generate AI-assisted summaries of journal entries over a date range')
    parser.add_argument('-sd', type=str, help='start date for query format YYYY-MM-DD', default='2020-01-01')
    parser.add_argument('-ed', type=str, help='end date for query format YYYY-MM-DD', default='2020-06-30')
    parser.add_argument('-i', type=str, help='input directory',
                        default = '/Users/shrey/Personal/journal/pages/journal/daily_journal_test/')
    parser.add_argument('-o', type=str, help='output directory',
                        default = '/Users/shrey/Personal/journal/pages/journal/monthly_journal_test/')
    parser.add_argument('-bfws', type=int, help='backfill window size, num days per chunk for backfilling',
                        default=None)
    # type=bool would call bool() on the raw string, and bool('False') is True because the
    # string is non-empty; parse the text explicitly instead.
    parser.add_argument('-bfm', type=_parse_bool_flag, nargs='?', const=True,
                        help='backfill monthly, can be set to True or False',
                        default=False)

    args = parser.parse_args()
    return args.sd, args.ed, args.i, args.o, args.bfws, args.bfm




class TimeAgg:
    """
    Aggregates the journal entries of one date range into a single note.

    Typical use: build the object, then call run(), which selects the entries in range,
    asks the LLM for an analysis, and writes '<sd>:<ed>.md' into the output directory.
    """

    def __init__(self, sd: str, ed: str, i_dir: str, o_dir: str, llm_obj=None):
        """
        sd: string start date format YYYY-MM-DD
        ed: string end date format YYYY-MM-DD
        i_dir: directory with text files containing journal entries, with filename convention of dates
        o_dir: output directory
        llm_obj: if running smart summary, pass in an llm object (from langchain_community.llms);
            it is called with the prompt string and must return the reply text.
            when left as None the note is written with an empty analysis section.
        """
        self.sd = sd
        self.ed = ed
        self.i_dir = i_dir
        self.o_dir = o_dir
        self.files = os.listdir(self.i_dir)

        self.llm = llm_obj

    def run(self):
        """runs the whole pipeline: select files, build the summary, write the note."""

        self.get_files_in_daterange()
        self.gen_smart_summary()
        self.gen_template()

    def get_files_in_daterange(self):
        """
        works out which journal pages fall inside [sd, ed].

        sets:
            filter_dates_list: every date in the range as 'YYYY-MM-DD' strings
            pages_dates_list: filename stems of the '.md' files found in i_dir
            selected_dates_list: sorted dates that are both in the range and on disk
            selected_pages_list: the matching filenames ('<date>.md')
        """
        self.filter_dates_list = list(pd.date_range(self.sd, self.ed).strftime('%Y-%m-%d'))

        # Only '.md' files count as pages. Without this check a stray file such as
        # '2024-01-03.png' would be selected and then opened as '2024-01-03.md',
        # which does not exist.
        self.pages_dates_list = [
            stem for stem, ext in map(os.path.splitext, self.files) if ext == '.md'
        ]

        self.selected_dates_list = sorted(set(self.filter_dates_list) & set(self.pages_dates_list))
        self.selected_pages_list = [date + '.md' for date in self.selected_dates_list]

    def gen_smart_summary(self):
        """
        this is the part where we prompt an LLM to summarize our entries from this time period, give us feedback, and offer questions / writing prompts

        sets subquery_1 / subquery_2 / subquery_3, the combined query, and smart_summary.
        smart_summary is the LLM reply, or an empty string when no llm_obj was supplied.
        """

        # grab the journal text: every selected page followed by a newline, in date order
        entries = []
        for page in self.selected_pages_list:
            # os.path.join works whether or not i_dir ends with a path separator
            with open(os.path.join(self.i_dir, page), 'r', encoding='utf-8') as file:
                entries.append(file.read() + "\n")
        j_text = "".join(entries)

        self.subquery_1 = f"Here are my journal entries from {self.sd} to {self.ed}: "
        self.subquery_2 = j_text
        # The backslash continuations keep this a single string literal; the leading
        # spaces of each continued line are part of the prompt text.
        self.subquery_3 = f"**AI Journal Entry Analysis**\
        Analyze my journal entries from {self.sd} to {self.ed}.\
        In your analysis, answer the following questions:\
        What was my focus during this time period--what did I want? Respond to this under the heading **Focus:**.\
        What were my relationships with other people like? Who did I talk to? How did I feel about them? Respond to this under the heading **People:**.\
        What did I work on, achieve, read, or practice (for example: coding, meditation, yoga, my job)? Respond to this under the heading **Practice:**.\
        What did I learn about myself during this time period? Respond to this under the heading **Insights:**.\
        Write the analysis in second person: 'You did...'"

        self.query = self.subquery_1 + self.subquery_2 + self.subquery_3

        if self.llm is None:
            # llm_obj is optional; calling None would raise "'NoneType' object is not callable"
            self.smart_summary = ""
        else:
            self.smart_summary = self.llm(self.query)

    def gen_template(self):
        """
        writes the output note '<sd>:<ed>.md' into o_dir: a backlink for every selected date,
        the AI analysis, and a prompt for a human reflection.
        """

        self.backlinks_str = 'backlinked dates: ' + ''.join(
            '[[' + date + ']], ' for date in self.selected_dates_list
        )

        # os.path.join works whether or not o_dir ends with a path separator
        out_path = os.path.join(self.o_dir, self.sd + ':' + self.ed + '.md')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write("Date: \n\n")
            f.write(self.backlinks_str + "\n\n")
            f.write("backlinks: \n\n")
            f.write(self.smart_summary)
            f.write("\n\n**Human Reflection:**\n\n")
            f.write(
                "What is the AI analysis bringing up for you? What lessons or shifts in focus do you want to take into the future? Reflect: \n\n")


def parse_backfill_dates_by_window(sd, ed, backfill_window=7):
    """
    splits the range from sd to ed into consecutive windows of 'backfill_window' days.

    sd: overall start date (e.g. 'YYYY-MM-DD', parsed with pd.to_datetime)
    ed: overall end date (same format)
    backfill_window: int, number of days in each window, must be at least 1

    makes tuples of start and end dates, inclusive of the sd value provided, such that each tuple is 'backfill_window' days long,
    until the point where the new start date of a pair would exceed the ed given.
    the last window is not clipped, so its end date can fall after ed.

    returns a list of (start, end) tuples of 'YYYY-MM-DD' strings; the list is empty when sd is after ed
    raises ValueError if backfill_window is smaller than 1
    """
    if backfill_window < 1:
        # with a zero or negative step the start date would never pass ed and the loop would not end
        raise ValueError('backfill_window must be at least 1 day, got ' + str(backfill_window))

    step = pd.Timedelta(backfill_window, unit='D')
    span = pd.Timedelta(backfill_window - 1, unit='D')  # start and end are both inclusive

    sdt = pd.to_datetime(sd)
    edt = pd.to_datetime(ed)

    sd_ed_tuples = []
    while sdt <= edt:
        sd_ed_tuples.append((sdt.strftime('%Y-%m-%d'), (sdt + span).strftime('%Y-%m-%d')))
        sdt = sdt + step

    return sd_ed_tuples

def parse_backfill_dates_by_month(sd, ed):
    """
    returns one (month start, month end) pair per calendar month from sd's month through ed's month,
    both of those months included.
    e.g. sd 2024-03-05 with ed 2024-05-20 yields pairs for 2024-03, 2024-04 and 2024-05

    sd, ed: 'YYYY-MM-DD' strings
    returns a list of ('YYYY-MM-DD', 'YYYY-MM-DD') tuples
    """
    def month_floor(day):
        # first day of the month that `day` belongs to
        return pd.to_datetime(day, format='%Y-%m-%d').to_period('M').to_timestamp()

    month_starts = pd.date_range(month_floor(sd), month_floor(ed), freq='MS')

    # MonthEnd(0) rolls each month start forward to the last day of that same month
    return [
        (start.strftime('%Y-%m-%d'), (start + MonthEnd(0)).strftime('%Y-%m-%d'))
        for start in month_starts
    ]

if __name__ == '__main__':
    print('started job at ' + time.ctime())
    sd, ed, input_dir, output_dir, backfill_chunk_size, backfill_monthly = get_args()

    # One client is created up front and handed to every TimeAgg below.
    # NOTE(review): an earlier note here speculated that sharing the client lets the model
    # learn from past summarizations. Nothing in this file feeds an earlier summary into a
    # later prompt (TimeAgg.gen_smart_summary builds each query from that chunk's entries
    # only) -- confirm against the Ollama client docs before relying on any carry-over.
    llm_obj = Ollama(model="llama3")

    # Decide how the overall range is chunked: fixed-size windows win over monthly,
    # and with neither option the whole range is a single chunk.
    if backfill_chunk_size:
        date_windows = parse_backfill_dates_by_window(sd, ed, backfill_chunk_size)
    elif backfill_monthly:
        date_windows = parse_backfill_dates_by_month(sd, ed)
    else:
        date_windows = [(sd, ed)]

    for chunk_no, (chunk_sd, chunk_ed) in enumerate(date_windows):
        print("started chunk number " + str(chunk_no) + " at " + time.ctime())
        chunk_agg = TimeAgg(sd=chunk_sd, ed=chunk_ed, i_dir=input_dir, o_dir=output_dir, llm_obj=llm_obj)
        chunk_agg.run()

    print('completed job at ' + time.ctime())




