In [1]:
# to parse the HTML file and extract the information from it
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
import re
from dateutil import parser

# to create and display the plot
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px

Get data

In [5]:
# Read the HTML file into memory, then hand the markup to the parser
with open('watch_history.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'html.parser')


In [6]:
def _text_or_none(element):
    """Return the text of ``element``, or None when the lookup found nothing."""
    return element.text if element else None


# Find every element that belongs to a YouTube view and record, for each one,
# the text of its first link and the text of its first content cell.
data = [
    {
        'Title': _text_or_none(entry.find('a', href=True)),
        'Date': _text_or_none(
            entry.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
        ),
    }
    for entry in soup.find_all('div', class_='outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp')
]

In [157]:
# Arrange the scraped records into a DataFrame. The columns are named explicitly so
# that 'Title' and 'Date' exist even when no entries were found, instead of the
# later df['Date'] lookups failing with a KeyError on a column-less frame.
df = pd.DataFrame(data, columns=['Title', 'Date'])


In [158]:
# Mapping of Hungarian month abbreviations to their two-digit month numbers
month_map = {
    'jan': '01',
    'febr': '02',
    'márc': '03',
    'ápr': '04',
    'máj': '05',
    'jún': '06',
    'júl': '07',
    'aug': '08',
    'szept': '09',
    'okt': '10',
    'nov': '11',
    'dec': '12'
}


def convert_month(text):
    """Replace Hungarian month abbreviations in ``text`` with month numbers.

    Every occurrence of a ``month_map`` key is replaced by its two-digit number,
    wherever it appears in the string.

    Args:
        text: The string to convert. Non-string values (for example ``None`` for
            an entry whose date cell was missing) are returned unchanged instead
            of raising a ``TypeError``.

    Returns:
        The converted string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # The keys are plain literals, so a simple substring replace is sufficient.
    for month_name, month_num in month_map.items():
        text = text.replace(month_name, month_num)
    return text

# Convert the month abbreviations in the raw date text to numbers
df['Date'] = df['Date'].apply(convert_month)

# Pattern that keeps only the timestamp out of the surrounding text.
# Both zone labels are accepted: matching only 'CEST' (summer time) would turn
# every 'CET' (winter time) timestamp into a missing value.
date_pattern = re.compile(r'\d{4}\. \d{2}\. \d{1,2}\. \d{1,2}:\d{2}:\d{2} CES?T')


def extract_date(text):
    """Return the first timestamp found in ``text``, including its zone label.

    Args:
        text: The text to search. Non-string values (e.g. ``None``) yield ``None``.

    Returns:
        The matched substring such as ``'2023. 09. 5. 14:23:11 CEST'``, or
        ``None`` when ``text`` contains no timestamp in the expected layout.
    """
    if not isinstance(text, str):
        return None
    match = date_pattern.search(text)
    return match.group(0) if match else None


def _strip_zone_label(stamp):
    """Drop the trailing ' CET'/' CEST' label so all timestamps share one format."""
    return stamp.rsplit(' ', 1)[0] if isinstance(stamp, str) else None


df['Date'] = df['Date'].apply(extract_date)

# Convert to datetime. The zone label is removed first, so the result is the
# naive wall-clock time as written in the export, for both CET and CEST rows.
df['Date'] = pd.to_datetime(
    df['Date'].apply(_strip_zone_label),
    format='%Y. %m. %d. %H:%M:%S',
    errors='coerce',
)


Plot in Dash

In [2]:
# Load your data
# NOTE(review): nothing in the visible notebook writes 'history.xlsx'; presumably it
# was exported from the DataFrame built in the previous section — confirm.
df = pd.read_excel('history.xlsx')

# Convert the 'Date' column to datetime, drop rows without a usable date, then derive
# the calendar parts used for filtering. Dropping the missing dates first keeps
# Year/Month/Day/Hour as integers; a single NaT would otherwise turn them into floats
# (e.g. 2024.0) and add a 'nan' entry to the year dropdown.
df = (
    df.assign(Date=pd.to_datetime(df['Date']))
    .dropna(subset=['Date'])
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
        Day=lambda frame: frame['Date'].dt.day,
        Hour=lambda frame: frame['Date'].dt.hour,
    )
)

# Aggregate data by year, month, day, and hour: one row per combination with the
# number of entries that fall into it
agg_df = df.groupby(['Year', 'Month', 'Day', 'Hour']).size().reset_index(name='Count')

# Initialize Dash app
app = dash.Dash(__name__)

# Dropdown defaults. The default month is the latest month *within the latest year*:
# taking the maximum month over all years could pair e.g. December with a year whose
# data stops in March, so the app would open on an empty heatmap.
# Values are converted to plain ints (and missing values skipped) so labels read
# '2024' rather than '2024.0' even if the Year column was loaded as float.
available_years = sorted(int(year) for year in df['Year'].dropna().unique())
default_year = available_years[-1] if available_years else None
months_in_default_year = df.loc[df['Year'] == default_year, 'Month'].dropna()
default_month = int(months_in_default_year.max()) if not months_in_default_year.empty else None

app.layout = html.Div([
    html.H1("YouTube Viewing History Heatmap"),
    dcc.Dropdown(
        id='year-dropdown',
        options=[{'label': str(year), 'value': year} for year in available_years],
        value=default_year,
        clearable=False
    ),
    dcc.Dropdown(
        id='month-dropdown',
        options=[{'label': str(month), 'value': month} for month in range(1, 13)],
        value=default_month,
        clearable=False
    ),
    dcc.Graph(id='heatmap')
])

@app.callback(
    Output('heatmap', 'figure'),
    Input('year-dropdown', 'value'),
    Input('month-dropdown', 'value')
)
def update_heatmap(selected_year, selected_month):
    """Build the hour-by-day heatmap for the selected year and month.

    Args:
        selected_year: Value of the year dropdown.
        selected_month: Value of the month dropdown (1-12).

    Returns:
        A figure whose rows are the hours 0-23 and whose columns are every day of
        the selected month; cells without entries are 0. When either dropdown has
        no value, ``dash.no_update`` is returned so the current figure is kept.
    """
    if selected_year is None or selected_month is None:
        return dash.no_update

    year, month = int(selected_year), int(selected_month)
    in_selection = (agg_df['Year'] == year) & (agg_df['Month'] == month)
    filtered_df = agg_df[in_selection]

    # Reindex to the complete grid: the pivot alone only contains the days and hours
    # that have data, which gives ragged axes and an empty frame for an empty month.
    days_in_month = pd.Timestamp(year=year, month=month, day=1).days_in_month
    heatmap_data = (
        filtered_df.pivot(index='Hour', columns='Day', values='Count')
        .reindex(index=range(24), columns=range(1, days_in_month + 1))
        .fillna(0)
        .astype(int)
    )

    fig = px.imshow(
        heatmap_data,
        labels=dict(x="Day of Month", y="Hour of Day", color="Video Count"),
        x=heatmap_data.columns,
        y=heatmap_data.index,
        color_continuous_scale='Viridis'
    )
    fig.update_layout(title=f'Videos Watched in {month}/{year}', xaxis_nticks=31)

    return fig

if __name__ == '__main__':
    # Newer Dash releases replace `run_server` with `run`; use `run` when the
    # installed version provides it and fall back to `run_server` otherwise.
    start_server = app.run if hasattr(app, 'run') else app.run_server
    start_server(debug=True)