In [3]:
import json
import os

In [24]:
def extract_text(json_data):
    """Lazily yield the value of every ``"text"`` key found in a JSON-like structure.

    The structure is walked depth-first in iteration order:

    * A ``str`` argument is parsed with ``json.loads``; if it is not valid
      JSON it is left as-is and therefore contributes nothing.
    * For a ``dict``, the value stored under ``"text"`` is yielded unchanged
      (it is not searched further); any other value is descended into only
      when it is a ``dict`` or ``list``.
    * For a ``list``, every element is processed recursively, so string
      elements get the JSON-parsing treatment described above.
    * Anything else yields nothing.

    Args:
        json_data: A decoded JSON value, or a string that may contain JSON.

    Yields:
        The values found under ``"text"`` keys.
    """
    if isinstance(json_data, str):
        try:
            json_data = json.loads(json_data)
        except json.decoder.JSONDecodeError:
            # Not JSON: keep the plain string, which falls through below.
            pass

    if isinstance(json_data, list):
        for element in json_data:
            yield from extract_text(element)
        return

    if not isinstance(json_data, dict):
        return

    for name, payload in json_data.items():
        if name == "text":
            yield payload
        elif isinstance(payload, (dict, list)):
            yield from extract_text(payload)

In [5]:
def load_json_data(filepath, filename):
    """Load and parse a single JSON file.

    Args:
        filepath: Directory that contains the file.
        filename: Name of the JSON file inside ``filepath``.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # default, which is not UTF-8 everywhere and would then fail on (or
    # garble) non-ASCII message text.
    with open(os.path.join(filepath, filename), 'r', encoding='utf-8') as f:
        return json.load(f)

In [32]:
# Data locations, expressed relative to the notebook's working directory.
PARENT_DIR = '..'
DATA_DIR = os.path.join(PARENT_DIR, 'data')
SLACK_DATA_DIR = os.path.join(
    DATA_DIR,
    'NYC AI Builders Slack export Aug 9 2022 - Jan 16 2023',
)

In [10]:
# os.listdir() returns entries in arbitrary order and includes plain files.
# Keep only sub-directories (each entry is later passed to os.listdir itself)
# and sort them so that slices such as slack_channels[:1] are reproducible.
slack_channels = sorted(
    entry
    for entry in os.listdir(SLACK_DATA_DIR)
    if os.path.isdir(os.path.join(SLACK_DATA_DIR, entry))
)

In [25]:
# Collect the parsed contents of every file in the first listed channel only
# (note the [:1] slice). Each file's parsed JSON is fed to list.extend, so it
# is consumed as an iterable (presumably a list of messages — confirm).
slack_data = []
for subdir in slack_channels[:1]:
    channel_dir = os.path.join(SLACK_DATA_DIR, subdir)
    for file in os.listdir(channel_dir):
        slack_data.extend(load_json_data(channel_dir, file))

In [28]:
# extract_text is a generator function, so this is a lazy, single-use iterator:
# nothing is extracted until it is consumed (e.g. with list(...)), and a second
# pass over the same object yields nothing.
slack_data_text = extract_text(slack_data)

In [33]:
# list(slack_data_text)

# CSV Extract Processing
Andy preprocessed the JSON export into a CSV with the following columns:

| user | user_name | text | client_message_id | timestamp | channel |
|------|-----------|------|-------------------|-----------|---------|

In [34]:
import pandas as pd

In [35]:
# NOTE(review): the frame displayed below carries an "Unnamed: 0" column, which
# is presumably the index written out when the CSV was saved — confirm, and
# consider passing index_col=0 to read_csv if that column is not needed.
CSV_FILENAME = 'slack_logs.csv'
df = pd.read_csv(os.path.join(DATA_DIR, CSV_FILENAME))

In [36]:
df.head()

Unnamed: 0,user,user_name,text,client_message_id,timestamp,channel
0,Kevin Merlini,kmerlini,<https://twitter.com/levelsio/status/157708679...,bfaca810-131c-455b-b274-cf1dbbc38c8d,1664981000.0,intel
1,Andy,andy,<https://www.youtube.com/watch?v=_f5GmFngZQE>\...,2bdbe62c-f440-43ac-bbca-54c636de8e8c,1664157000.0,intel
2,Andy,andy,its adults playing not kids,53c453bc-9c40-47f2-801b-c3a729c83e46,1664157000.0,intel
3,Kevin Merlini,kmerlini,<https://twitter.com/tanyaagoyal/status/157481...,9062ac3a-4bb0-440c-a8ca-375cfc6bf176,1664371000.0,intel
4,Kevin Merlini,kmerlini,<https://spolu.notion.site/spolu/Dust-1dcfe01a...,84895c8d-4516-4bad-ba90-3fb41d994255,1664371000.0,intel


In [41]:
from typing import Union
from pandas import DataFrame

In [40]:
from datetime import datetime, timezone

def timestamp_to_utc(timestamp):
    """Convert a POSIX timestamp to a naive datetime expressed in UTC.

    Args:
        timestamp: Seconds since the epoch, as an ``int`` or ``float``.

    Returns:
        A naive ``datetime`` (``tzinfo`` is ``None``) whose fields are in UTC.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime instead and strip tzinfo so callers still receive
    # the same naive value as before.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)

def utc_to_timestamp(utc_time):
    """Convert a UTC datetime to an integer POSIX timestamp.

    Args:
        utc_time: A ``datetime``. Naive values are interpreted as UTC; aware
            values are converted using their own UTC offset.

    Returns:
        Seconds since the epoch as an ``int`` (fractional part truncated by
        ``int()``).
    """
    if utc_time.utcoffset() is None:
        # datetime.timestamp() treats a naive datetime as *local* time, which
        # would shift the result by the machine's UTC offset. Pin it to UTC.
        utc_time = utc_time.replace(tzinfo=timezone.utc)
    return int(utc_time.timestamp())

In [45]:
def _to_epoch_seconds(bound):
    """Return ``bound`` as epoch seconds; naive datetimes are treated as UTC."""
    if isinstance(bound, datetime):
        if bound.utcoffset() is None:
            bound = bound.replace(tzinfo=timezone.utc)
        return bound.timestamp()
    return bound


def _matches(column, wanted):
    """Boolean mask: equality for a single ``str``, membership for a collection."""
    if isinstance(wanted, str):
        return column == wanted
    return column.isin(list(wanted))


def filter_dataframe(
        df: DataFrame,
        user: Union[str, list, tuple, set, None]=None,
        time_range: Union[tuple[Union[datetime, int, float], Union[datetime, int, float]], None]=None,
        channel: Union[str, list, tuple, set, None]=None
):
    """Filter a message DataFrame by user, time window and/or channel.

    Every filter is optional; a falsy value (``None``, ``""``, an empty
    collection) disables that filter. Supplied filters are combined with AND.

    Args:
        df: Frame with ``"user"``, ``"timestamp"`` and ``"channel"`` columns
            (only the columns of the filters actually used are read).
        user: A single value to match exactly, or a collection of accepted
            values, compared against the ``"user"`` column.
        time_range: ``(start, end)`` bounds compared against the
            ``"timestamp"`` column. Both bounds are exclusive. Each bound may
            be a number (used as-is) or a ``datetime``, which is converted to
            epoch seconds with naive values interpreted as UTC.
        channel: A single value or a collection of accepted values, compared
            against the ``"channel"`` column.

    Returns:
        The rows of ``df`` that satisfy every supplied filter (``df`` itself
        when no filter is supplied).

    Raises:
        TypeError: If ``user`` or ``channel`` is truthy but is neither a
            ``str`` nor iterable.
    """
    if user:
        df = df[_matches(df["user"], user)]

    if time_range:
        start = _to_epoch_seconds(time_range[0])
        end = _to_epoch_seconds(time_range[1])
        # Strict comparisons: rows exactly on either bound are excluded.
        df = df[(df["timestamp"] > start) & (df["timestamp"] < end)]

    if channel:
        df = df[_matches(df["channel"], channel)]

    return df

In [43]:
import os
from langchain import OpenAI, Prompt
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from dotenv import load_dotenv

In [56]:
# load_dotenv()

In [76]:
# langchain summarize taken from:
# https://colab.research.google.com/drive/1B2su1QESO_YfdW93UITosAzqVXvdMMz7#scrollTo=HV95yfK2XG17
# Found through:
# https://twitter.com/hwchase17/status/1587458155021099008

def get_summary(input_text):
    """Summarize ``input_text`` with a LangChain map-reduce chain.

    The chain is built from an ``OpenAI`` LLM at ``temperature=0``, a
    "concise summary" prompt with a single ``text`` variable, and a
    default-constructed ``CharacterTextSplitter``.

    NOTE(review): presumably requires OpenAI credentials to be configured in
    the environment — confirm.

    Args:
        input_text: The text to summarize.

    Returns:
        Whatever ``MapReduceChain.run`` returns for ``input_text``.
    """
    # The template's indentation is part of the string literal, so the leading
    # spaces before "{text}" and "CONCISE SUMMARY:" are included in the prompt.
    summary_template = """Write a concise summary of the following:

    {text}

    CONCISE SUMMARY:
    """

    chain = MapReduceChain.from_params(
        llm=OpenAI(temperature=0),
        prompt=Prompt(template=summary_template, input_variables=["text"]),
        text_splitter=CharacterTextSplitter(),
    )
    return chain.run(input_text)

In [69]:
def join_text_from_df(df:DataFrame):
    """Concatenate the non-missing ``"text"`` values of ``df``.

    Args:
        df: Frame with a ``"text"`` column of strings (missing values allowed).

    Returns:
        The non-NaN ``"text"`` values in row order, separated by a blank line
        (``"\\n\\n"``); an empty string when there are none.
    """
    messages = df["text"].dropna()
    return '\n\n'.join(messages.tolist())

In [55]:
# df["channel"].unique()

In [58]:
# Preliminary slice: "chat-gpt" channel messages whose timestamp lies strictly
# between the two dates (filter_dataframe compares with exclusive > and <).
prelim_data = filter_dataframe(
    df,
    time_range=(
        utc_to_timestamp(datetime(2023, 1, 8)),
        utc_to_timestamp(datetime(2023, 1, 16))
    ),
    channel="chat-gpt"
)

In [68]:
prelim_data

Unnamed: 0,user,user_name,text,client_message_id,timestamp,channel
654,,,<https://www.theinformation.com/articles/micro...,8a4e3839-61ac-4b1c-817e-b30954450e63,1673457000.0,chat-gpt
655,Andy,andy,• Bill Gates didn’t like Microsoft’s first inv...,cdb34b9d-8087-4682-b843-1dfee40ee8dc,1673458000.0,chat-gpt
656,Andy,andy,<https://www.theinformation.com/articles/the-e...,d931d32e-914d-45e0-b8bb-4690e785a90a,1673458000.0,chat-gpt
657,Andy,andy,OpenAI recorded $13M in expenses in 2020: <htt...,c5163b4f-2d2a-4c20-848a-676b6c0c3380,1673480000.0,chat-gpt
658,Andy,andy,I’m thinking the accounting structures their G...,3a759487-ed9c-4cfa-97db-a05865bdd90f,1673480000.0,chat-gpt


In [70]:
# join_text_from_df drops rows whose "text" is NaN before joining, so
# prelim_data does not need to be filtered separately first.
text = join_text_from_df(prelim_data)

In [77]:
# Summarize the joined channel text; get_summary runs a LangChain map-reduce
# chain over an OpenAI LLM.
summary = get_summary(text)

In [78]:
summary

"\nMicrosoft has invested $1 billion in OpenAI, with OpenAI committing to using Azure, Microsoft's cloud service. OpenAI is looking for sources that outline experts’ inner monologue and thought processes, and is considering a paid version of ChatGPT. @U0493F8JDFU and @U03TPREG1QQ are discussing the progress of AI, the possibility of adding the ability to summarize conversations to their chatbot, Bob, and a hackathon for tomorrow. They are also discussing OpenAI's chatgpt detector and the early founder breakup of Adept, an OpenAI rival."

In [79]:
text

