In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Load the prepared dataset; the 'date' column is converted to datetimes while reading.
df = pd.read_csv('./data/final_data.csv', parse_dates=['date'])

In [None]:
# Derive the hours column used by every plot below by dividing the minutes column by 60.
# NOTE(review): only 'drive_time_min' feeds this "total" - confirm no other time component is expected.
df['total_time_hour'] = df['drive_time_min'].div(60)

In [None]:
# Order rows chronologically. Reassigning the sorted result (instead of inplace=True)
# keeps the step chainable and leaves no partially mutated frame behind if the cell fails.
df = df.sort_values(by='date')

In [None]:
# Structural overview of the loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the columns pandas includes by default in describe().
df.describe()

In [None]:
# Kernel density estimate of total_time_hour, with the mean (red) and median (green) marked.
ax = df['total_time_hour'].plot.density(figsize=(15, 6))
for marker_value, marker_color in (
    (df['total_time_hour'].mean(), 'red'),
    (df['total_time_hour'].median(), 'green'),
):
    ax.axvline(marker_value, color=marker_color)

# Tick every half hour, but only print a label on whole hours to keep the axis readable.
ticks = np.arange(4, 14, 0.5)
detailed_ticks = [tick if tick % 1 == 0 else '' for tick in ticks]
ax.set_xticks(ticks)
ax.set_xticklabels(detailed_ticks)

# Restrict the visible range to 4-13 on the x-axis.
ax.set_xlim(4, 13)

plt.show()


In [None]:
# Scatter of total_time_hour against date.
# (matplotlib.dates is imported as `mdates` in the notebook's import cell.)
fig, ax = plt.subplots(figsize=(16, 6))
ax.scatter(df['date'], df['total_time_hour'])

# One major tick per calendar month, labelled as year-month (e.g. 2023-04).
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Rotate the x-axis tick labels so neighbouring months do not overlap.
plt.xticks(rotation=45)

# Label the figure so it can be read on its own; the y-label matches the other charts.
ax.set_xlabel('Date')
ax.set_ylabel('Total Time (hours)')
ax.set_title('Total Time per Day')

plt.show()

In [None]:
# Histogram of total_time_hour in 60 bins.
ax = df['total_time_hour'].plot.hist(bins=60, figsize=(15, 6), fontsize=14)
ax.set_xlabel('Working Hours')
ax.set_ylabel('Days')

# Half-hour ticks from 4 up to (not including) 13; the same values double as the labels.
ticks = detailed_ticks = np.arange(4, 13, 0.5)
ax.set_xticks(ticks)
ax.set_xticklabels(detailed_ticks)

plt.show()

In [None]:
# Box plot of total_time_hour grouped by calendar year (spread and central tendency per year).
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['date'].dt.year, y=df['total_time_hour'], ax=box_ax)
box_ax.set_xlabel('Year')
box_ax.set_ylabel('Total Time (hours)')
box_ax.set_title('Distribution of Total Time by Year')
box_ax.grid(True)
plt.show()

In [None]:
# Mean total_time_hour per calendar month; rows from every year are pooled into the same month.
df['month'] = df['date'].dt.month
monthly_avg_total_time = df.groupby('month')['total_time_hour'].mean()

month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Line chart of the monthly means, with a marker on every month that has data.
trend_fig, trend_ax = plt.subplots(figsize=(15, 6))
trend_ax.plot(
    monthly_avg_total_time.index,
    monthly_avg_total_time.values,
    marker='o',
    linestyle='-',
)
trend_ax.set_xlabel('Month')
trend_ax.set_ylabel('Average Total Time (hours)')
trend_ax.set_title('Trend of Total Time over the Months')
# Always show all twelve months on the axis, even if some have no data.
trend_ax.set_xticks(list(range(1, 13)))
trend_ax.set_xticklabels(month_names)
trend_ax.grid(True)
plt.show()

In [None]:
# Total hours per (year, month), reshaped into a year x month table.
# pivot_result (rows = years, columns = months) is reused by the heatmap cell below.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
sum_working_hours_per_month = df.groupby(['year', 'month'])['total_time_hour'].sum()
df_result = sum_working_hours_per_month.reset_index()
# Each (year, month) pair occurs once after the groupby, so pivot_table only reshapes;
# year/month combinations without data become 0.
pivot_result = df_result.pivot_table(index='year', columns='month', values='total_time_hour', fill_value=0)

# DataFrame.plot(kind='bar') puts the index (years) on the x-axis and draws one
# stacked segment per column (months), so the x-label is 'Year' and the legend lists months.
pivot_result.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Total Working Hours per Month (Year-wise)')
plt.xlabel('Year')
plt.ylabel('Working Hours')
# Anchor the legend just outside the top-right corner of the axes.
plt.legend(title='Month', loc='upper left', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)
plt.show()

In [None]:
# Annotated heatmap of the year x month table: one row per year, one column per month.
heat_fig, heat_ax = plt.subplots(figsize=(15, 4))
sns.heatmap(
    pivot_result,
    annot=True,
    fmt=".1f",
    cmap='YlGnBu',
    linewidths=0.5,
    ax=heat_ax,
)
heat_ax.set_title('Working Hours Heatmap (Year-wise)')
heat_ax.set_xlabel('Month')
heat_ax.set_ylabel('Year')
plt.show()

In [None]:
grouped_df = df.groupby(df['date'].dt.year)
# For each year, draw a calendar-style heatmap: one row per month, one column per day of month.
for year, year_data in grouped_df:
    # Every calendar day between the first and the last recorded date of this year
    # (the range follows the data; it is not forced to 1 Jan - 31 Dec).
    min_date = year_data['date'].min()
    max_date = year_data['date'].max()
    all_dates = pd.date_range(start=min_date, end=max_date)

    calendar_df = pd.DataFrame({'date': all_dates})

    # Left merge keeps every calendar day; days without a record get NaN hours.
    calendar_df = pd.merge(calendar_df, year_data[['date', 'total_time_hour']], on='date', how='left')

    # Treat days without a record as 0 hours. Assign the result back to the column:
    # calling fillna(..., inplace=True) on a column selection is chained assignment,
    # which warns in pandas 2.x and does not modify the frame under Copy-on-Write.
    calendar_df['total_time_hour'] = calendar_df['total_time_hour'].fillna(0)

    # Month x day-of-month grid; fill_value=0 also fills month/day cells that have
    # no row at all (e.g. day 31 of a 30-day month, or days outside the date range).
    calendar_pivot = calendar_df.pivot_table(index=calendar_df['date'].dt.month,
                                            columns=calendar_df['date'].dt.day,
                                            values='total_time_hour',
                                            fill_value=0)

    # Set up the figure and plot the heatmap
    plt.figure(figsize=(15, 5))
    sns.heatmap(calendar_pivot, cmap='coolwarm', annot=True, fmt=".1f", linewidths=0.5, cbar=False)

    # Set the labels and title
    plt.xlabel('Day')
    plt.ylabel('Month')
    plt.title(f'Visual Calendar - Working Hours ({year})')

    plt.show()