In [None]:
import pandas as pd
# Load the exported chat log (path is relative to this notebook) and preview it.
messages_csv = "../20230726_Messages.csv"
df = pd.read_csv(messages_csv)
df.head()

In [None]:
# Spot-check one raw message by row label.
df.loc[21, "Message"]

In [None]:
# Parse the timestamps and derive the calendar fields used for grouping.
df["Datetime"] = pd.to_datetime(df["Datetime"])
df["Week"] = df["Datetime"].dt.isocalendar().week
df["Date"] = df["Datetime"].dt.date


def _join_messages(messages):
    """Join one group's messages with the space-newline-space separator.

    Missing values (NaN from empty CSV cells) are skipped and any non-string
    value is converted with ``str`` so that ``join`` cannot raise TypeError.
    """
    return " \n ".join(messages.dropna().astype(str))


# One row per calendar date; "Message" holds that day's whole transcript.
# groupby(...).agg(...) already returns a DataFrame, so no re-wrapping is needed.
daily_df = df.groupby("Date").agg({"Message": _join_messages}).reset_index()

# TODO: weekly grouping on df["Week"] is not implemented yet.
len(daily_df)

In [None]:
# Word count per day = number of whitespace-separated tokens in the transcript.
daily_df["wc"] = [len(transcript.split()) for transcript in daily_df["Message"]]

In [None]:
# Summary statistics of the per-day word counts.
daily_df["wc"].describe()

In [None]:
from datetime import date

# Sample date (row label 42) used to try out human_date below.
d = daily_df["Date"][42]


def human_date(d):
    """Format a date as "<ordinal day> <month name> <year>", e.g. "3rd July 2023"."""
    day = d.day
    last_digit = day % 10
    is_teen = day // 10 % 10 == 1  # 11th, 12th, 13th break the st/nd/rd pattern

    if is_teen or last_digit not in (1, 2, 3):
        suffix = "th"
    else:
        suffix = ("st", "nd", "rd")[last_digit - 1]

    return d.strftime(f"{day}{suffix} %B %Y")


# Quick check of the formatter on the sample date `d`.
print(human_date(d))

## Summarization
Built with LangChain and OpenAI chat models.

In [None]:
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

from functools import lru_cache

# llm = ChatOpenAI(temperature=0)
# Shared splitter used by the make_docs() methods of the classes below.
# NOTE(review): built with library defaults (separator, chunk size) — confirm they
# suit transcripts whose messages are joined with a single newline.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder()

In [None]:
%%time


class SummarizeDay:
    """Summarise one day's chat transcript into a Markdown topic recap.

    The transcript is split into documents on construction; calling
    ``summarize_docs`` sends them through a "stuff" summarisation chain.
    """

    def __init__(self, plain_text):
        """Build the prompt, the chain and the documents for ``plain_text``."""
        self.prompt_template = """This is a chaotic Generative AI Group Chat transcript. Write detailed, exhaustive bullet point recap of topics discussed. Extract COMPLETE URL of web and social links with context. Please organise it into sections, only when needed:

{text}


Use Markdown. Add ## for section titles. TOPICS RECAP:"""
        self.PROMPT = PromptTemplate(
            template=self.prompt_template, input_variables=["text"]
        )
        # Alternative kept for reference: chain_type="map_reduce" with
        # map_prompt=self.PROMPT and combine_prompt=self.PROMPT.
        self.chain = load_summarize_chain(
            ChatOpenAI(temperature=0), chain_type="stuff", prompt=self.PROMPT
        )
        self.docs = self.make_docs(plain_text)

    def make_docs(self, plain_text: str):
        """Split ``plain_text`` with the shared splitter into a list of Documents.

        Not wrapped in ``lru_cache``: on a method the cache key includes
        ``self``, so it never hit across instances and only kept finished
        instances (with their chains and documents) alive.
        """
        return [
            Document(page_content=chunk)
            for chunk in text_splitter.split_text(plain_text)
        ]

    def summarize_docs(self):
        """Run the chain on the stored documents and return its output mapping."""
        return self.chain({"input_documents": self.docs}, return_only_outputs=True)


def summarize(message: str) -> str:
    """Summarise one transcript, echo the summary to stdout, and return it."""
    summary_text = SummarizeDay(message).summarize_docs()["output_text"]
    print(summary_text)
    return summary_text

In [None]:
%%time
# One chain run per day, in date order; each summary is printed as it arrives.
daily_df["Summary"] = [summarize(transcript) for transcript in daily_df["Message"]]

In [None]:
import re

WINDOW = 1  # default distance (in lines) of the context lines taken around a URL

# Compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)


def extract_urls_with_context(text, window=None):
    """Find every URL in ``text`` and pair it with its neighbouring lines.

    Args:
        text: Multi-line string to scan.
        window: Distance, in lines, of the single line taken before and after
            the URL's own line. ``None`` (the default) uses the module-level
            ``WINDOW``.

    Returns:
        A list of ``(url, context)`` tuples in order of appearance. ``context``
        is the line before, the URL's line and the line after, joined by
        newlines and stripped of outer whitespace. A neighbour that falls
        outside the text counts as an empty line.
    """
    if window is None:
        window = WINDOW
    lines = text.split("\n")

    def line_at(position):
        # Out-of-range neighbours are empty; without this check a negative
        # index would wrap to the end of the text and a large one would raise.
        return lines[position] if 0 <= position < len(lines) else ""

    urls_with_context = []
    for idx, line in enumerate(lines):
        for match in _URL_PATTERN.finditer(line):
            context = (
                f"{line_at(idx - window)}\n{line}\n{line_at(idx + window)}".strip()
            )
            urls_with_context.append((match.group(), context))

    return urls_with_context


class LinksContext:
    """Turn one URL plus its surrounding lines into a single descriptive bullet.

    The context text is split into documents on construction; calling
    ``summarize_docs`` sends them through a "stuff" summarisation chain.
    """

    def __init__(self, plain_text):
        """Build the prompt, the chain and the documents for ``plain_text``."""
        # Prompt wording fixed: it used to read "in the same link as link".
        self.prompt_template = """For the given URL, there is some context. Newlines may or may not be related to the link, but the message on the same line as the link is related to the link.
        
{text}
        
Mention URL with context. Single bullet point:"""
        self.PROMPT = PromptTemplate(
            template=self.prompt_template, input_variables=["text"]
        )
        # Alternative kept for reference: model_name="gpt-4" with
        # chain_type="map_reduce", map_prompt=self.PROMPT, combine_prompt=self.PROMPT.
        self.chain = load_summarize_chain(
            ChatOpenAI(temperature=0), chain_type="stuff", prompt=self.PROMPT
        )
        self.docs = self.make_docs(plain_text)

    def make_docs(self, plain_text: str):
        """Split ``plain_text`` with the shared splitter into a list of Documents.

        Not wrapped in ``lru_cache``: on a method the cache key includes
        ``self``, so it never hit across instances and only kept finished
        instances (with their chains and documents) alive.
        """
        return [
            Document(page_content=chunk)
            for chunk in text_splitter.split_text(plain_text)
        ]

    def summarize_docs(self):
        """Run the chain on the stored documents and return its output mapping."""
        return self.chain({"input_documents": self.docs}, return_only_outputs=True)


def end_note_with_links(message: str) -> str:
    """Describe every URL found in ``message``, one chain output per URL.

    Returns the outputs joined by newlines, in order of appearance; the result
    is an empty string when ``message`` contains no URL.
    """
    descriptions = [
        LinksContext(context).summarize_docs()["output_text"]
        for _url, context in extract_urls_with_context(message)
    ]
    return "\n".join(descriptions)


# Single-day check, kept for debugging:
# message = daily_df["Message"][33]
# print(end_note_with_links(message))
# Build the link end-note for every day (one chain run per URL found in the transcript).
daily_df["EndNote"] = daily_df["Message"].apply(end_note_with_links)

In [None]:
class TitleDescription:
    """Ask the model for a short title and description of a discussion as JSON.

    The text is split into documents on construction; calling
    ``summarize_docs`` sends them through a map-reduce summarisation chain
    that uses the same prompt for the map and the combine step.
    """

    def __init__(self, plain_text):
        """Build the prompt, the chain and the documents for ``plain_text``."""
        # The requested key is "description" (it used to be "description:"),
        # matching the key that the page-header code reads.
        self.prompt_template = """For the given discussion, write a short title and description, separate both by \n\n

{text}
        
Return a single valid JSON with 
```
"title":
"description":
```:"""
        self.PROMPT = PromptTemplate(
            template=self.prompt_template, input_variables=["text"]
        )
        # Alternative kept for reference: default model with
        # chain_type="stuff", prompt=self.PROMPT.
        self.chain = load_summarize_chain(
            ChatOpenAI(temperature=0, model_name="gpt-4"),
            chain_type="map_reduce",
            return_intermediate_steps=True,
            map_prompt=self.PROMPT,
            combine_prompt=self.PROMPT,
        )
        self.docs = self.make_docs(plain_text)

    def make_docs(self, plain_text: str):
        """Split ``plain_text`` with the shared splitter into a list of Documents.

        Not wrapped in ``lru_cache``: on a method the cache key includes
        ``self``, so it never hit across instances and only kept finished
        instances (with their chains and documents) alive.
        """
        return [
            Document(page_content=chunk)
            for chunk in text_splitter.split_text(plain_text)
        ]

    def summarize_docs(self):
        """Run the chain on the stored documents and return its output mapping."""
        return self.chain({"input_documents": self.docs}, return_only_outputs=True)


import json


def _parse_json_object(raw_text):
    """Parse ``raw_text`` as JSON, tolerating extra text around the outermost {...}.

    Model replies are sometimes wrapped in code fences or prose; when the plain
    parse fails, the span from the first "{" to the last "}" is tried instead.
    """
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        start, end = raw_text.find("{"), raw_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(raw_text[start : end + 1])


def get_title_description(summary_text: str) -> dict[str, str]:
    """Generate a title and description for ``summary_text``.

    The chain output is parsed as JSON. If it cannot be parsed, the raw output
    is printed and the chain is run once more.

    Returns:
        The parsed fields. Keys are stripped of surrounding whitespace and of a
        trailing colon, so a reply keyed "description:" is still readable as
        "description".

    Raises:
        json.JSONDecodeError: if the second attempt is not valid JSON either.
    """
    td = TitleDescription(summary_text)
    max_attempts = 2  # the first run plus one retry
    for attempt in range(1, max_attempts + 1):
        output_text = td.summarize_docs()["output_text"]
        try:
            fields = _parse_json_object(output_text)
            break
        except json.JSONDecodeError:
            # Show the unparseable reply so the failure can be diagnosed.
            print(output_text)
            if attempt == max_attempts:
                raise

    if isinstance(fields, dict):
        fields = {
            key.strip().rstrip(":").strip(): value for key, value in fields.items()
        }
    return fields

In [None]:
# Inspect the first day's row to check the columns built so far.
daily_df.iloc[0]

In [None]:
# The next cell needs pytz; if it is missing, run: %pip install pytz

In [None]:
%%time
import pytz
from datetime import datetime, time


def get_page_header_date(date_object):
    """Return midnight of ``date_object`` in Asia/Kolkata as a timestamp string.

    The format is ``YYYY-MM-DDTHH:MM:SS`` followed by the UTC offset written
    with a colon, e.g. ``2023-07-26T00:00:00+05:30``.
    """
    kolkata = pytz.timezone("Asia/Kolkata")
    midnight = kolkata.localize(datetime.combine(date_object, time()))

    # %z renders the offset without a colon (e.g. +0530), so one is inserted
    # before its last two digits.
    stamp = midnight.strftime("%Y-%m-%dT%H:%M:%S%z")
    return f"{stamp[:-2]}:{stamp[-2:]}"


def make_page_header(row):
    """Build the ``+++``-delimited front-matter header for one day's page.

    Args:
        row: Row of the daily frame; its "Date" and "Summary" entries are read.

    Returns:
        The header text, ending with a newline after the closing ``+++``.
    """
    page_date, summary_text = row["Date"], row["Summary"]
    dt = get_page_header_date(page_date)
    fields = get_title_description(summary_text)

    # The title and description are model-generated text. json.dumps emits a
    # double-quoted string with quotes, backslashes and control characters
    # escaped, so such text cannot terminate the quoted value early.
    # ensure_ascii=False keeps non-ASCII characters as they are.
    summary_title = json.dumps(str(fields["title"]), ensure_ascii=False)
    summary_description = json.dumps(str(fields["description"]), ensure_ascii=False)

    page_header = f"""+++
title =  {summary_title}
date = {dt}
tags = ["daily_summary"]
featured_image = ""
description = {summary_description}
toc = true
+++
"""
    return page_header


# Generate one header per day, printing each as soon as it is ready.
page_headers = []
row_count = daily_df.shape[0]
for position in range(row_count):
    header = make_page_header(daily_df.iloc[position])
    page_headers.append(header)
    print(header)

In [None]:
# Attach the generated headers to the frame and dump the whole frame to JSON
# (presumably a backup of the chain outputs — the file name suggests so).
daily_df["page_headers"] = page_headers
daily_df.to_json("daily_backup.json")
daily_df.head()

In [None]:
def make_page(row):
    """Assemble the Markdown page for one day and choose its file name.

    Args:
        row: Row of the daily frame; its "page_headers", "Summary", "EndNote"
            and "Date" entries are read.

    Returns:
        ``(page, file_name)``: the header, the summary and a "Links" section
        concatenated into one string, and a name of the form
        "<human-readable date>.md".
    """
    links_heading = "\n## Links\nThe description and link can be mismatched because of extraction errors.\n\n"
    sections = [
        row["page_headers"],
        "\n",
        row["Summary"],
        "\n",
        links_heading,
        row["EndNote"],
    ]
    page = "".join(sections)
    return page, f"{human_date(row['Date'])}.md"


from pathlib import Path

# Target folder for the generated pages, resolved relative to the working directory.
write_dir = Path("../../content/ai/").resolve()

for idx in range(len(daily_df)):
    page, file_name = make_page(daily_df.iloc[idx])
    file_path = write_dir / file_name
    # Explicit UTF-8: the summaries may contain non-ASCII characters, and the
    # platform's default text encoding is not guaranteed to be UTF-8.
    file_path.write_text(page, encoding="utf-8")