In [1]:
import pandas as pd
import json
import pymongo
import matplotlib.dates as mdates
from datetime import datetime, date
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random

In [2]:
def data_import(include_mums=False):
    """Connects to database and creates dataframe containing all columns. Drops unneeded columns and sets timestamp
     datatype. Correct any incorrect time values, sets data times and sorts"""

    collection_list=['Times']
    
    if include_mums:
        collection_list.append('Mumsnet_Times')
    all_records = []

    # Connects to db and gets collection
    db = get_db()

    for collection in collection_list:
        records =list(db[collection].find())
        all_records.append(records)
    
    #Flattens list
    all_records = [val for sublist in all_records for val in sublist]
    
    df=pd.DataFrame(all_records)
    
    # Makes column to indicate which database times are from
    non_mums = ['Harvey Williams', 'Sazzle', 'Leah', 'Tom', 'Joe', 'George Sheen', 'Oliver Folkard'] 
    df['mum'] = np.where(df['user'].isin(non_mums), False, True)
    
    return df

In [3]:
def get_db(write=False):
    if write:
        connection_string = "admin_connection_string"

    else:
        connection_string = "connection_string"

    try:
        with open("local/pass.json") as file:
            file = json.loads(file.read())
            connection_string = file.get(connection_string)
            client = pymongo.MongoClient(
                connection_string)
            db = client["PlusWord"]
            return db
    except Exception as e:
        print(e)

In [4]:
def format_for_streamlit(df):
    """Makes df more readable, converts times into plottable numbers and sets index"""

    df = df[['load_ts', 'time', 'user']]
    df['time'] = df['time'].str.replace(r'(^\d\d:\d\d$)', r'00:\1', regex=True)
    df['load_ts'] = pd.to_datetime(df['load_ts'], format='%Y-%m-%d %H:%M:%S.%f')
    # df['user'] = df['user'].astype('category')
    df = df.sort_values(by=['load_ts'])
    df = df.rename(columns={'load_ts': 'timestamp'})
    df['time_delta'] = pd.to_timedelta(df['time'].astype('timedelta64[ns]'))
    df['time_delta_as_num'] = time_delta_to_num(pd.to_timedelta(df['time'].astype('string')))
    df['sub_time_delta_as_num'] = time_delta_to_num(pd.to_timedelta(df['timestamp'].dt.time.astype('string')))


    df = df.set_index('timestamp')
    df = df.sort_index(ascending=False)

    return df

In [5]:
def format_for_streamlit(df):
    """Makes df more readable, converts times into plottable numbers and sets index"""

    df = df[['load_ts', 'time', 'user', 'mum']]
    df['time'] = df['time'].str.replace(r'(^\d\d:\d\d$)', r'00:\1', regex=True)
    df['load_ts'] = pd.to_datetime(df['load_ts'], format='%Y-%m-%d %H:%M:%S.%f')
    # df['user'] = df['user'].astype('category')
    df = df.sort_values(by=['load_ts'])
    df = df.rename(columns={'load_ts': 'timestamp'})
    df['time_delta'] = pd.to_timedelta(df['time'].astype('timedelta64[ns]'))
    df['time_delta_as_num'] = time_delta_to_num(pd.to_timedelta(df['time'].astype('string')))
    df['sub_time_delta_as_num'] = time_delta_to_num(pd.to_timedelta(df['timestamp'].dt.time.astype('string')))


    df = df.set_index('timestamp')
    df = df.sort_index(ascending=False)

    return df


In [6]:
def time_delta_to_num(time_delta):
    """ Takes in time delta and converts it into a number for plotting"""

    # specify a date to use for the times

    zero_date = datetime(2022, 6, 20)

    zero_num = mdates.date2num(zero_date)

    # adds zero_data to timedelta to convert

    time_delta_plus_date = [zero_date + time_unit for time_unit in time_delta]

    # convert datetimes to numbers

    time_delta_as_num = [mins - zero_num for mins in mdates.date2num(time_delta_plus_date)]

    return time_delta_as_num

In [7]:
def spline_smooth(df, poly_value):
    """Smooths lines via interpolation and splines. Purely cosmetic"""

    df_spline = df.copy()

    df_spline['date_as_num'] = mdates.date2num(df_spline['timestamp'])

    x_smooth = np.linspace(df_spline['date_as_num'].min(), df_spline['date_as_num'].max(), poly_value)

    bspline = interpolate.make_interp_spline(df_spline['date_as_num'], df_spline['time_delta_as_num'])

    y_smooth = bspline(x_smooth)

    return x_smooth, y_smooth

In [8]:
def savgol_smooth(df, poly_value):
    """Smooths lines using a Savitzky–Golay filter"""

    df_savgol = df.copy()

    df_savgol['date_as_num'] = mdates.date2num(df_savgol['timestamp'])

    max_window = len(df_savgol)

    x_smooth = signal.savgol_filter(df_savgol['date_as_num'], max_window, poly_value)

    y_smooth = signal.savgol_filter(df_savgol['time_delta_as_num'], max_window, poly_value)

    return x_smooth, y_smooth

In [9]:
df=data_import(include_mums=True)
df = format_for_streamlit(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = df['time'].str.replace(r'(^\d\d:\d\d$)', r'00:\1', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['load_ts'] = pd.to_datetime(df['load_ts'], format='%Y-%m-%d %H:%M:%S.%f')


In [10]:
df

Unnamed: 0_level_0,time,user,mum,time_delta,time_delta_as_num,sub_time_delta_as_num
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-07-06 12:16:13.031,00:01:27,George Sheen,False,0 days 00:01:27,0.001007,0.511262
2023-07-06 11:48:54.226,00:01:20,Oliver Folkard,False,0 days 00:01:20,0.000926,0.492294
2023-07-06 10:21:51.428,00:11:28,Harrison Sharpe,True,0 days 00:11:28,0.007963,0.431845
2023-07-06 10:07:42.592,00:00:50,Joe,False,0 days 00:00:50,0.000579,0.422021
2023-07-06 07:09:00.000,00:01:12,Drywhitefruitycidergin,True,0 days 00:01:12,0.000833,0.297917
...,...,...,...,...,...,...
2022-06-04 09:45:00.000,00:36:59,Joe,False,0 days 00:36:59,0.025683,0.406250
2022-06-04 00:24:00.000,00:13:16,Harvey Williams,False,0 days 00:13:16,0.009213,0.016667
2022-06-03 13:29:00.000,00:14:48,Harvey Williams,False,0 days 00:14:48,0.010278,0.561806
2022-06-03 06:57:00.000,00:03:30,Joe,False,0 days 00:03:30,0.002431,0.289583


In [11]:
# Creates df

df_mean_time = df.groupby(["User", df.index])["time_delta_as_num"].mean()

df_mean_time = df_mean_time.reset_index()

# Generates 25 mins for y-axis

y_axis_time = y_axis_generator(25, 'm')

# Displays every 2 mins

y_axis_time_num_2_mins = y_axis_time[::2]

fig, ax = plt.subplots(figsize=(15, 7))

# Smooths lines out for each user and plots them

if smooth:

    df_smooth = pd.DataFrame()

    for User in df_mean_time['User'].unique():

        df_mean_time_rough = df_mean_time[df_mean_time['User'] == User]

        try:
            if time_period == 'M':
                x_smooth, y_smooth = spline_smooth(df_mean_time_rough, poly_value)

            if time_period == 'W':
                x_smooth, y_smooth = savgol_smooth(df_mean_time_rough, poly_value)

        except Exception:

            # If smoothing function errors just plot original values
            x_smooth = mdates.date2num(df_mean_time_rough['timestamp']).tolist()

            y_smooth = df_mean_time_rough['time_delta_as_num'].tolist()

        # converts x_smooth, y_smooth into a dataframe with user value associated with them

        user_list = [User] * len(x_smooth)

        x_smooth = pd.Series(x_smooth, name='date_as_num')

        y_smooth = pd.Series(y_smooth, name='time_delta_as_num')

        users = pd.Series(user_list, name='User')

        df = pd.concat([users, x_smooth, y_smooth], axis=1)

        # Concats dfs together to make one big one

        df_smooth = pd.concat([df_smooth, df])

    df = df_smooth.copy()

else:
    df = df_mean_time.copy()

    df['date_as_num'] = mdates.date2num(df['timestamp'])

AttributeError: 'DatetimeIndex' object has no attribute 'datetime'

In [None]:
df

In [None]:


# Plotting

fig = sns.lineplot(data=df,
                   x='date_as_num',
                   y='time_delta_as_num',
                   hue='User'
                   ).set(
    xlabel='Date',
    ylabel='Mean time /min')

ax.yaxis_date()

ax.set_yticks(y_axis_time_num_2_mins)

ax.set_yticklabels(y_axis_time_num_2_mins)

ax.yaxis.set_major_formatter(mdates.DateFormatter("%M:%S"))

ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

# Formats df

df_mean_time = time_delta_as_num_to_time(df_mean_time)

df_mean_time['Date'] = df_mean_time['timestamp'].datetime.strftime('%d %B %Y')

df_mean_time = df_mean_time[['User', 'Date', 'Time']]

df_mean_time = df_mean_time.rename(columns={'Time': 'Mean Time'})