In [153]:
import pandas as pd
from datetime import datetime
from langchain_ollama import OllamaEmbeddings
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np


In [154]:
df = pd.read_csv("output.csv",sep="`")

df["date"] = pd.to_datetime(df["date"])
df["start_time"] = pd.to_datetime(df["start_time"])
df["end_time"] = pd.to_datetime(df["end_time"])

In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3880 entries, 0 to 3879
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               3880 non-null   int64         
 1   user_email       3880 non-null   object        
 2   first_name       3880 non-null   object        
 3   date             3880 non-null   datetime64[ns]
 4   time_slot        3880 non-null   object        
 5   meeting_subject  3664 non-null   object        
 6   start_time       3880 non-null   datetime64[ns]
 7   end_time         3880 non-null   datetime64[ns]
 8   load_percentage  3880 non-null   float64       
 9   content          3880 non-null   object        
dtypes: datetime64[ns](3), float64(1), int64(1), object(5)
memory usage: 303.3+ KB


In [156]:
df["meeting_subject"].nunique()

92

In [157]:
df["meeting_subject"].fillna("Unspecified Activity", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["meeting_subject"].fillna("Unspecified Activity", inplace=True)


In [158]:
df["duration_minutes"] = (
    (df["start_time"] - df["end_time"])
    .dt.total_seconds()
    .fillna(0) / 60
)

In [159]:
df["weekday"] = df["date"].dt.day_name()
df["week_number"] = df["date"].dt.isocalendar().week
df["month"] = df["date"].dt.month_name()
df["year"] = df["date"].dt.year

In [None]:
def time_of_day(hour):
    if hour < 12:
        return "Morning"
    elif 12 <= hour < 17:
        return "Afternoon"
    else:
        return "Evening"

In [None]:
df["duration_minutes"] = (df["end_time"] - df["start_time"]).dt.total_seconds() / 60

In [162]:
df["time_of_day"] = df["start_time"].dt.hour.apply(time_of_day)

In [163]:
def load_band(x):
    if x < 0.3:
        return "Low"
    elif x < 0.7:
        return "Medium"
    return "High"

df["load_band"] = df["load_percentage"].apply(load_band)


In [164]:
df["meeting_subject"].nunique()

93

In [170]:
def record_to_sentence(row):
    name = row["first_name"]
    subject = row["meeting_subject"]
    date = row["date"]                     
    weekday = row["weekday"]
    time_slot = row["time_slot"][:5]      
    start = row["start_time"]
    end = row["end_time"]
    duration = int(float(row["duration_minutes"]))
    load = float(row["load_percentage"]) * 100
    load_band = row["load_band"]

    return (
        f"On {weekday}, {date}, {name} logged an activity titled '{subject}' "
        f"during the {time_slot} slot from {start} to {end}, "
        f"lasting {duration} minutes. "
        f"It accounted for {load:.0f}% of the day's workload ({load_band})."
    )


In [171]:
df["summary_sentence"] = df.apply(record_to_sentence, axis=1)


In [172]:
df.head()

Unnamed: 0,id,user_email,first_name,date,time_slot,meeting_subject,start_time,end_time,load_percentage,content,duration_minutes,weekday,week_number,month,year,time_of_day,load_band,summary_sentence
0,1,andrew@slipstreamdata.co.za,Andrew,2025-11-11,10:30:00,Investec internal scoping feedback,10:30,11:00,0.72,"Investec internal scoping feedback, 10:30 - 11:00",30.0,Tuesday,46,November,2025,Morning,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
1,2,andrew@slipstreamdata.co.za,Andrew,2025-11-11,11:00:00,Internal Harmony Gold RFP presentation prep,11:00,12:00,0.72,"Internal Harmony Gold RFP presentation prep, 1...",60.0,Tuesday,46,November,2025,Morning,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
2,3,andrew@slipstreamdata.co.za,Andrew,2025-11-11,11:30:00,Internal Harmony Gold RFP presentation prep,11:00,12:00,0.72,"Internal Harmony Gold RFP presentation prep, 1...",60.0,Tuesday,46,November,2025,Morning,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
3,4,andrew@slipstreamdata.co.za,Andrew,2025-11-11,13:00:00,Harmony presentation prep,13:00,16:00,0.72,"Harmony presentation prep, 13:00 - 16:00",180.0,Tuesday,46,November,2025,Afternoon,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
4,5,andrew@slipstreamdata.co.za,Andrew,2025-11-11,13:30:00,Harmony presentation prep,13:00,16:00,0.72,"Harmony presentation prep, 13:00 - 16:00",180.0,Tuesday,46,November,2025,Afternoon,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."


In [175]:
cols_to_drop = ["content", "week_number", "year", "time_slot"]

df_cleaned = df.drop(columns=cols_to_drop)

In [176]:
df_cleaned.head()

Unnamed: 0,id,user_email,first_name,date,meeting_subject,start_time,end_time,load_percentage,duration_minutes,weekday,month,time_of_day,load_band,summary_sentence
0,1,andrew@slipstreamdata.co.za,Andrew,2025-11-11,Investec internal scoping feedback,10:30,11:00,0.72,30.0,Tuesday,November,Morning,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
1,2,andrew@slipstreamdata.co.za,Andrew,2025-11-11,Internal Harmony Gold RFP presentation prep,11:00,12:00,0.72,60.0,Tuesday,November,Morning,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
2,3,andrew@slipstreamdata.co.za,Andrew,2025-11-11,Internal Harmony Gold RFP presentation prep,11:00,12:00,0.72,60.0,Tuesday,November,Morning,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
3,4,andrew@slipstreamdata.co.za,Andrew,2025-11-11,Harmony presentation prep,13:00,16:00,0.72,180.0,Tuesday,November,Afternoon,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."
4,5,andrew@slipstreamdata.co.za,Andrew,2025-11-11,Harmony presentation prep,13:00,16:00,0.72,180.0,Tuesday,November,Afternoon,High,"On Tuesday, 2025-11-11, Andrew logged an activ..."


In [177]:
df_cleaned.to_csv("data_transformed.csv", sep="`", index=False,encoding="utf-8")