In [35]:
import json
import requests

import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio


In [None]:
def parse_and_get_dataframe(url, filename, city, date, timeout=30):
    """Download one station board for one date and return per-train delays.

    The request is sent to ``url + date``. The ``['TABLO']['TABLO_TRAIN']``
    part of the JSON reply is written to ``filename`` and then loaded into
    pandas from that file.

    Args:
        url: Endpoint prefix; ``date`` is appended to it verbatim.
        filename: Path the raw train list is saved to (overwritten on each call).
        city: Value stored in the ``station_name`` column of every row.
        date: Date string appended to ``url`` and stored in the ``date`` column.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        DataFrame with the columns ``dep_delay`` and ``arr_delay`` (actual
        minus planned time), ``station_name``, ``on_time`` and ``date``.
        ``on_time`` is True only when actual and planned departure are equal;
        rows with a missing departure time compare as False.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        KeyError: If the reply lacks the expected keys or columns.
    """
    # * const
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    datetime_columns = [
        'DATETIME_DEPARTURE_PLAN_MSK',
        'DATETIME_DEPARTURE_ACTUAL_MSK',
        'DATETIME_ARRIVAL_PLAN_MSK',
        'DATETIME_ARRIVAL_ACTUAL_MSK',
    ]
    columns = ['TRAIN_NAME'] + datetime_columns

    # * parse
    # A timeout keeps one unresponsive endpoint from hanging the whole scrape,
    # and the status check surfaces HTTP errors before JSON decoding is tried.
    response = requests.get(url + date, headers=headers, timeout=timeout)
    response.raise_for_status()
    trains = response.json()['TABLO']['TABLO_TRAIN']

    # save the response
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(trains, file, ensure_ascii=False, indent=4)

    # open the response
    # .copy() makes the column subset an independent frame, so the datetime
    # conversion below does not assign into a slice of the loaded frame.
    df_temp = pd.read_json(filename)[columns].copy()
    df_temp[datetime_columns] = df_temp[datetime_columns].apply(pd.to_datetime)

    df = pd.DataFrame()
    df['dep_delay'] = df_temp['DATETIME_DEPARTURE_ACTUAL_MSK'] - df_temp['DATETIME_DEPARTURE_PLAN_MSK']
    df['arr_delay'] = df_temp['DATETIME_ARRIVAL_ACTUAL_MSK'] - df_temp['DATETIME_ARRIVAL_PLAN_MSK']
    df['station_name'] = city
    df['on_time'] = df_temp['DATETIME_DEPARTURE_ACTUAL_MSK'] == df_temp['DATETIME_DEPARTURE_PLAN_MSK']
    df['date'] = date
    return df

In [None]:
# Station list: each entry provides "STATION_NAME" and "URL".
# The encoding is explicit so the file is not decoded with the platform's
# locale default.
with open('urls.json', encoding='utf-8') as file:
    urls = json.load(file)

dates = ['08.07.2024', '09.07.2024', '10.07.2024', '11.07.2024', '12.07.2024','13.07.2024','14.07.2024','15.07.2024', '16.07.2024']

# Fetch every (date, station) pair first and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previously
# collected rows on every iteration.
station_frames = [
    parse_and_get_dataframe(station["URL"], 'test_1.json', station["STATION_NAME"], date)
    for date in dates
    for station in urls
]
df = pd.concat(station_frames, ignore_index=True)

# Drop trains with neither delay known; where only one is missing, treat it
# as a zero delay. Filling with a Timedelta (not the integer 0) keeps both
# delay columns timedelta-typed.
delay_columns = ['dep_delay', 'arr_delay']
df = df.dropna(subset=delay_columns, how='all')
df = df.fillna({column: pd.Timedelta(0) for column in delay_columns})


In [None]:
# Persist the collected delays as a list of row records; force_ascii=False
# keeps non-ASCII station names readable in the file.
df.to_json(
    'stations.json',
    orient='records',
    force_ascii=False,
    indent=4,
)

In [None]:
# Show the (rows, columns) size of the collected frame.
df.shape

In [18]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,dep_delay,arr_delay,station_name,on_time,date,dep_delay_min,arr_delay_min,total_delay_min
0,300000,240000,Белорусский вокзал,False,08.07.2024,5.0,4.0,9.0
1,0,120000,Белорусский вокзал,False,08.07.2024,0.0,2.0,2.0
2,0,0,Белорусский вокзал,False,08.07.2024,0.0,0.0,0.0
3,180000,0,Белорусский вокзал,False,08.07.2024,3.0,0.0,3.0
4,0,0,Белорусский вокзал,True,08.07.2024,0.0,0.0,0.0


In [20]:
# Reload the exported records. The file is written with force_ascii=False, so
# it holds raw non-ASCII text; reading it as UTF-8 explicitly avoids relying on
# the platform's default encoding.
with open('stations.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# dep_delay / arr_delay are converted from milliseconds to minutes here.
# One chain: drop rows with no delay information, derive the minute columns,
# then replace the remaining gaps with 0.
df = (
    pd.DataFrame(data)
    .dropna(subset=['dep_delay', 'arr_delay'], how='all')
    .assign(
        dep_delay_min=lambda frame: frame['dep_delay'] / 60000,
        arr_delay_min=lambda frame: frame['arr_delay'] / 60000,
        total_delay_min=lambda frame: frame['dep_delay_min'] + frame['arr_delay_min'],
    )
    .fillna(0)
)

# Default theme for every figure created below.
pio.templates.default = "plotly_dark"

In [29]:
# dep_delay / arr_delay are in milliseconds (the notebook divides them by
# 60000 to obtain minutes), while every label in this figure says minutes, so
# the station means are converted before plotting.
MS_PER_MINUTE = 60000

df_grouped = df.groupby('station_name')[['dep_delay', 'arr_delay']].mean().reset_index()

on_time_counts = df.groupby(['station_name', 'on_time']).size().reset_index(name='counts')

# Create the subplots: one row per chart.
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=(
        'Средняя задержка отправления по станциям',
        'Средняя задержка прибытия по станциям',
        'Количество поездов вовремя и с задержкой по станциям'
    )
)

# Row 1: mean departure delay per station, in minutes.
fig.add_trace(
    go.Bar(
        x=df_grouped['station_name'],
        y=df_grouped['dep_delay'] / MS_PER_MINUTE,
        name='Задержка отправления (мин)',
        marker_color='#1f77b4'
    ),
    row=1,
    col=1
)

# Row 2: mean arrival delay per station, in minutes.
fig.add_trace(
    go.Bar(
        x=df_grouped['station_name'],
        y=df_grouped['arr_delay'] / MS_PER_MINUTE,
        name='Задержка прибытия (мин)',
        marker_color='#ff7f0e'
    ),
    row=2,
    col=1
)

# Row 3: one bar series per on_time value (green = on time, red = delayed).
for on_time_value in on_time_counts['on_time'].unique():
    df_filtered = on_time_counts[on_time_counts['on_time'] == on_time_value]
    fig.add_trace(
        go.Bar(
            x=df_filtered['station_name'],
            y=df_filtered['counts'],
            name='Вовремя' if on_time_value else 'С задержкой',
            marker_color='#2ca02c' if on_time_value else '#d62728'
        ),
        row=3,
        col=1
    )

fig.update_layout(
    height=900,
    showlegend=False,
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)'
)

fig.update_xaxes(title_text='Название станции', row=3, col=1)
fig.update_yaxes(title_text='Средняя задержка (мин)', row=1, col=1)
fig.update_yaxes(title_text='Средняя задержка (мин)', row=2, col=1)
fig.update_yaxes(title_text='Количество поездов', row=3, col=1)

fig.show()


In [22]:
# Count rows per on_time value as a two-column frame: on_time, count.
on_time_counts = (
    df['on_time']
    .value_counts()
    .rename_axis('on_time')
    .reset_index(name='count')
)

color_discrete_map = {True: '#1f77b4', False: '#ff7f0e'}

# Bar chart: one bar per on_time value, labelled with its count.
fig_on_time = px.bar(
    on_time_counts,
    x='on_time',
    y='count',
    color='on_time',
    text='count',
    title='Поезда, прибывшие вовремя vs с задержкой',
    labels={'on_time': 'Вовремя', 'count': 'Количество'},
    color_discrete_map=color_discrete_map,
)

# Appearance: keep the count labels horizontal, above the bars and unclipped.
fig_on_time.update_traces(
    textposition="outside",
    textangle=0,
    textfont_size=12,
    cliponaxis=False,
)

fig_on_time.update_layout(
    width=1000,
    bargap=0.2,
    showlegend=False,
    title_font_size=20,
    margin={'l': 50, 'r': 50, 't': 50, 'b': 50},
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    font={'family': "Arial, sans-serif", 'size': 14, 'color': "white"},
    xaxis={
        'title': '',
        'tickvals': [True, False],
        'ticktext': ['Вовремя', 'С задержкой'],
    },
    yaxis={'title': 'Количество', 'gridcolor': 'gray'},
)

fig_on_time.show()


In [23]:
# Parse the day.month.year strings so the dates group and plot chronologically.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

# Mean departure and arrival delay (minutes) per calendar day.
average_delays_by_date = (
    df.groupby('date')[['dep_delay_min', 'arr_delay_min']]
    .mean()
    .reset_index()
)

# One line per delay kind, each point labelled with its rounded value.
fig_avg_delays = go.Figure()
for delay_column, trace_name, trace_color in (
    ('dep_delay_min', 'Средняя задержка отправления (мин)', '#1f77b4'),
    ('arr_delay_min', 'Средняя задержка прибытия (мин)', '#ff7f0e'),
):
    fig_avg_delays.add_trace(
        go.Scatter(
            x=average_delays_by_date['date'],
            y=average_delays_by_date[delay_column],
            mode='lines+markers+text',
            name=trace_name,
            line={'color': trace_color},
            marker={'color': trace_color, 'size': 8},
            text=average_delays_by_date[delay_column].round(2),
            textposition='top center',
        )
    )

fig_avg_delays.update_layout(
    title_text='Средние задержки по датам',
    height=600,
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    font={'size': 12},
    margin={'l': 40, 'r': 40, 't': 40, 'b': 80},
    showlegend=True,
    # Horizontal legend centred above the plotting area.
    legend={
        'orientation': "h",
        'yanchor': "top",
        'y': 1.15,
        'xanchor': "center",
        'x': 0.5,
        'font': {'size': 12},
    },
)

fig_avg_delays.update_xaxes(title_text="Дата", tickangle=-45)
fig_avg_delays.update_yaxes(title_text="Средняя задержка (минуты)")

fig_avg_delays.show()


In [24]:
# Descriptive statistics of both delay columns (not displayed by this cell).
delay_summary = df[['dep_delay_min', 'arr_delay_min']].describe()

# Count rows per on_time value, then swap the booleans for display labels.
on_time_counts = (
    df['on_time']
    .value_counts()
    .rename_axis('on_time')
    .reset_index(name='count')
)
status_labels = {True: 'Без задержки', False: 'Задержка'}
on_time_counts['on_time'] = on_time_counts['on_time'].replace(status_labels)

# Slice colours are taken in order of the rows of on_time_counts.
pie_trace = go.Pie(
    labels=on_time_counts['on_time'],
    values=on_time_counts['count'],
    name='Поезда вовремя vs с задержкой',
    textinfo='label+percent',
    hoverinfo='label+percent+value',
    textfont_size=14,
    marker={
        'colors': ['#AEC6CF', '#FFB347'],
        'line': {'color': '#000000', 'width': 2},
    },
)

fig_on_time = go.Figure()
fig_on_time.add_trace(pie_trace)

fig_on_time.update_layout(
    title_text='Поезда вовремя vs с задержкой',
    height=500,
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    font={'size': 12},
    showlegend=True,
    # Horizontal legend centred below the pie.
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': -0.2,
        'xanchor': "center",
        'x': 0.5,
        'font': {'size': 12},
    },
)

fig_on_time.show()


In [25]:
# Delay columns in hours (3 600 000 ms per hour) plus their per-row sum.
df['dep_delay_hours'] = df['dep_delay'].div(3600000)
df['arr_delay_hours'] = df['arr_delay'].div(3600000)
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df['total_delay_hours'] = df['dep_delay_hours'].add(df['arr_delay_hours'])

# Mean delay per station (not plotted in this cell).
average_delay_by_station = (
    df.groupby('station_name')[['dep_delay_hours', 'arr_delay_hours', 'total_delay_hours']]
    .mean()
    .reset_index()
)

# Total delay hours per day, with a ready-made text label per point.
daily_total_delays = df.groupby('date')[['total_delay_hours']].sum().reset_index()
daily_total_delays['text'] = daily_total_delays['total_delay_hours'].round(2).astype(str) + ' ч.'

fig_daily_total_delays = px.line(
    daily_total_delays,
    x='date',
    y='total_delay_hours',
    markers=True,
    title='Ежедневные общие задержки по времени',
    labels={'total_delay_hours': 'Общая задержка (часы)', 'date': 'Дата'},
)

fig_daily_total_delays.update_traces(
    text=daily_total_delays['text'],
    textposition='top center',
    line={'color': 'cyan', 'width': 4},
    marker={'color': 'cyan', 'size': 8},
)

grid_style = {'showgrid': True, 'gridwidth': 1, 'gridcolor': 'gray'}
fig_daily_total_delays.update_layout(
    title='Ежедневные общие задержки по времени',
    xaxis_title='Дата',
    yaxis_title='Общая задержка (часы)',
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    font={'size': 12},
    title_font={'size': 20, 'color': 'white'},
    xaxis=dict(grid_style),
    yaxis=dict(grid_style),
    margin={'l': 50, 'r': 50, 't': 50, 'b': 50},
)

# Mark the day with the largest total delay (first one if several tie).
max_delay = daily_total_delays['total_delay_hours'].max()
is_peak_day = daily_total_delays['total_delay_hours'] == max_delay
max_delay_date = daily_total_delays.loc[is_peak_day, 'date'].values[0]
fig_daily_total_delays.add_annotation(
    x=max_delay_date,
    y=max_delay + 7,
    text=f'Пик: {max_delay:.2f} часа',
    showarrow=True,
    arrowhead=5,
    ax=0,
    ay=-40,
    bgcolor="black",
    font={'color': "white"},
)

# Put each day's value label 10 hours above its point on the y axis.
for day in daily_total_delays.itertuples(index=False):
    fig_daily_total_delays.add_annotation(
        x=day.date,
        y=day.total_delay_hours + 10,
        text=day.text,
        showarrow=False,
        font={'color': 'lightyellow'},
    )

fig_daily_total_delays.show()


In [32]:
# Mean and sum of each delay column per (date, station) pair, with flat
# column names (not plotted in this cell).
delay_metrics = (
    df.groupby(['date', 'station_name'])
    .agg(
        avg_dep_delay=('dep_delay_min', 'mean'),
        total_dep_delay=('dep_delay_min', 'sum'),
        avg_arr_delay=('arr_delay_min', 'mean'),
        total_arr_delay=('arr_delay_min', 'sum'),
        avg_total_delay=('total_delay_min', 'mean'),
        total_total_delay=('total_delay_min', 'sum'),
    )
    .reset_index()
)

# Summed delay minutes per station: the quantity the pie splits up.
total_delays_by_station = (
    df.groupby('station_name')[['total_delay_min']].sum().reset_index()
)

fig_pie_delays_station = px.pie(
    total_delays_by_station,
    values='total_delay_min',
    names='station_name',
    title='Доля общих задержек по станциям',
    color_discrete_sequence=px.colors.qualitative.Bold,
)

fig_pie_delays_station.update_traces(
    textinfo='percent+label',
    hoverinfo='label+percent+value',
    textfont_size=14,
    marker={'line': {'color': '#000000', 'width': 2}},
)

fig_pie_delays_station.update_layout(
    title_text='Доля общих задержек по станциям',
    title_x=0.5,  # centre the title
    title_font={'size': 20, 'color': 'white'},
    legend_title_text='Станция',
    font={'size': 12},
    template='plotly_dark',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
)

fig_pie_delays_station.show()


In [34]:
# A train counts as late when its combined departure + arrival delay is positive.
is_late = df['total_delay_min'] > 0
df_late = df[is_late]

# Group the late flag by day: the sum is the number of late trains, the mean
# is their share. The count previously came from the unfiltered frame, so it
# reported all trains per day under a "number of delays" label. Counting via
# the flag also keeps days with zero late trains on the chart.
late_flag_by_date = is_late.groupby(df['date'])
daily_delay_counts = late_flag_by_date.sum().reset_index(name='delay_count')
late_proportion_by_date = late_flag_by_date.mean().reset_index(name='late_proportion')

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    subplot_titles=('Количество задержек по дням', 'Доля опоздавших поездов по дням')
)

# Row 1: number of late trains per day.
fig.add_trace(
    go.Scatter(
        x=daily_delay_counts['date'],
        y=daily_delay_counts['delay_count'],
        mode='lines+markers',
        name='Количество задержек',
        line=dict(color='#1f77b4'),
        marker=dict(color='#1f77b4', size=8)
    ),
    row=1,
    col=1
)

# Row 2: share of late trains per day (0..1).
fig.add_trace(
    go.Scatter(
        x=late_proportion_by_date['date'],
        y=late_proportion_by_date['late_proportion'],
        mode='lines+markers',
        name='Доля опоздавших поездов',
        line=dict(color='#ff7f0e'),
        marker=dict(color='#ff7f0e', size=8)
    ),
    row=2,
    col=1
)

fig.update_layout(
    height=800,
    title_text='Анализ задержек поездов',
    title_x=0.5,
    template='plotly_dark',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.3,
        xanchor="center",
        x=0.5,
        font=dict(size=12)
    ),
    font=dict(size=12)
)

fig.update_xaxes(title_text="Дата", row=2, col=1)
fig.update_yaxes(title_text="Количество задержек", row=1, col=1)
fig.update_yaxes(title_text="Доля опоздавших поездов", row=2, col=1)

fig.show();






In [33]:
# Classify every row by which of its two delays is positive. The four masks
# keep the exact comparisons (> 0 and <= 0) used for each category.
dep_late = df['dep_delay_min'] > 0
dep_not_late = df['dep_delay_min'] <= 0
arr_late = df['arr_delay_min'] > 0
arr_not_late = df['arr_delay_min'] <= 0

df['delay_type'] = 'Без задержек'
for row_mask, type_label in (
    (dep_late & arr_late, 'Задержка отправления и прибытия'),
    (dep_late & arr_not_late, 'Задержка отправления'),
    (dep_not_late & arr_late, 'Задержка прибытия'),
):
    df.loc[row_mask, 'delay_type'] = type_label

# Pie of row counts per delay type.
fig_pie_delay = px.pie(
    df,
    names='delay_type',
    title='Процент поездов с задержками',
    color_discrete_sequence=px.colors.qualitative.Bold,
)

fig_pie_delay.update_traces(
    textinfo='percent+label',
    hoverinfo='label+percent+value',
    textfont_size=14,
    marker={'line': {'color': '#000000', 'width': 2}},
)

fig_pie_delay.update_layout(
    title_text='Процент поездов с задержками',
    title_x=0.5,
    title_font={'size': 20, 'color': 'white'},
    legend_title_text='Тип задержки',
    font={'size': 12},
    template='plotly_dark',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
)

fig_pie_delay.show()
