In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
"""
Provides screening days analysis of the data in the CSV file. The format of the CSV file is as follows:
```
cinema,date,title_excerpt,url
Метрополь,01.09.1953,"АРЕНА СМЕЛЫХ",https://electro.nekrasovka.ru/books/6173751/pages/4
Метрополь,01.09.1953,"ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)",https://electro.nekrasovka.ru/books/6173751/pages/4
Метрополь,01.09.1953,"ЧУК И ГЕК",https://electro.nekrasovka.ru/books/6173751/pages/4
Ударник,01.09.1953,"ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)",https://electro.nekrasovka.ru/books/6173751/pages/4
Ударник,01.09.1953,"ПРАЗДНИК СВ. ИОРГЕНА",https://electro.nekrasovka.ru/books/6173751/pages/4
Орион,01.09.1953,"ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)",https://electro.nekrasovka.ru/books/6173751/pages/4
```
"""

'\nProvides screening days analysis of the data in the CSV file. The format of the CSV file is as follows:\n```\ncinema,date,title_excerpt,url\nМетрополь,01.09.1953,"АРЕНА СМЕЛЫХ",https://electro.nekrasovka.ru/books/6173751/pages/4\nМетрополь,01.09.1953,"ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)",https://electro.nekrasovka.ru/books/6173751/pages/4\nМетрополь,01.09.1953,"ЧУК И ГЕК",https://electro.nekrasovka.ru/books/6173751/pages/4\nУдарник,01.09.1953,"ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)",https://electro.nekrasovka.ru/books/6173751/pages/4\nУдарник,01.09.1953,"ПРАЗДНИК СВ. ИОРГЕНА",https://electro.nekrasovka.ru/books/6173751/pages/4\nОрион,01.09.1953,"ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)",https://electro.nekrasovka.ru/books/6173751/pages/4\n```\n'

In [3]:
# When True, the screening days analysis prints one log line for every screening run it records
verbose_logging: bool = True
# Path of the CSV file that receives the screening days analysis (written by the last code block,
# which refuses to overwrite a file that already exists at this path)
save_filepath: str = './results/screening.csv'

In [4]:
def get_data(csv_filepath: str) -> pd.DataFrame:
    """
    Load the CSV file found at ``csv_filepath`` into a pandas DataFrame.

    Args:
        csv_filepath (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The contents of the file, parsed with the default
        settings of ``pd.read_csv``.
    """
    return pd.read_csv(csv_filepath)

In [5]:
# Load the schedule, keep only the columns the analysis uses and expose the
# title excerpt under the shorter column name `title`
filepath = './results/schedule.csv'
data = (
    get_data(filepath)
    .loc[:, ['cinema', 'date', 'title_excerpt']]
    .rename(columns={'title_excerpt': 'title'})
)
data.head()

Unnamed: 0,cinema,date,title
0,Метрополь,01.09.1953,АРЕНА СМЕЛЫХ
1,Метрополь,01.09.1953,ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)
2,Метрополь,01.09.1953,ЧУК И ГЕК
3,Ударник,01.09.1953,ПАРМСКАЯ ОБИТЕЛЬ (1-я серия)
4,Ударник,01.09.1953,ПРАЗДНИК СВ. ИОРГЕНА


In [6]:
# Per film title, the number of schedule rows with a non-null `cinema` / `date` value
titles = data.groupby(by='title').count()
titles

Unnamed: 0_level_0,cinema,date
title,Unnamed: 1_level_1,Unnamed: 2_level_1
NE NE cara?,1,1
АННА ПРОЛЕТАРКА,3,3
АРЕНА СМЕЛЫХ,15,15
ЗВЕЗДА,21,21
ЛЕВ ТОЛСТОЙ,26,26
МЕЧТЫ НА ДОРОГАХ,1,1
НАД НЕМАНОМ РАССВЕТ,15,15
ПАРМСКАЯ ОБИТЕЛЬ (1-я серия),30,30
ПАРМСКАЯ ОБИТЕЛЬ (2-я серия),29,29
ПРАЗДНИК СВ. ИОРГЕНА,3,3


In [7]:
# Collects one [cinema, first day, number of days, title] record per uninterrupted screening run,
# i.e. per stretch of consecutive calendar days on which a cinema showed a given title
screening_info = []

# Parse the dates once, on a copy of the schedule: `data` keeps its original string dates, so this
# cell can be re-run safely and no groupby slice has to be modified in place
schedule = data.assign(date=pd.to_datetime(data['date'], format='%d.%m.%Y'))

for (cinema, title), group in schedule.groupby(['cinema', 'title']):
    # A row without a date cannot be placed in any run; fail loudly rather than miscount
    assert group['date'].notna().all(), f"Missing date in the schedule of [{cinema}\t{title}]"

    # Distinct screening days in chronological order. A title listed more than once for the same
    # cinema and day still accounts for a single screening day.
    days = group['date'].drop_duplicates().sort_values()

    # A new run starts wherever the gap to the previous screening day is more than one day.
    # The first day has no predecessor (NaN gap), which compares as False and therefore opens run 0.
    run_ids = (days.diff().dt.days > 1).cumsum()

    for _, run in days.groupby(run_ids):
        first_day, last_date_included = run.iloc[0], run.iloc[-1]
        screening_days = len(run)

        # Sanity check: a run of consecutive days must span exactly as many days as it contains
        expected_days = 1 + (last_date_included - first_day).days
        assert screening_days == expected_days,\
                f"Expected {expected_days} days, but got {screening_days} days"

        if verbose_logging:
            print(f"[{cinema}\t{title}\t{first_day}]:\tSkip dates up to (incl)\t{last_date_included}\t({screening_days} days)")

        screening_info.append([cinema, first_day.strftime('%d/%m/%Y'), screening_days, title])

[Метрополь	АННА ПРОЛЕТАРКА	1953-09-16 00:00:00]:	Skip dates up to (incl)	1953-09-16 00:00:00	(1 days)
[Метрополь	АРЕНА СМЕЛЫХ	1953-09-01 00:00:00]:	Skip dates up to (incl)	1953-09-04 00:00:00	(4 days)
[Метрополь	АРЕНА СМЕЛЫХ	1953-09-07 00:00:00]:	Skip dates up to (incl)	1953-09-12 00:00:00	(6 days)
[Метрополь	АРЕНА СМЕЛЫХ	1953-09-16 00:00:00]:	Skip dates up to (incl)	1953-09-16 00:00:00	(1 days)
[Метрополь	АРЕНА СМЕЛЫХ	1953-09-18 00:00:00]:	Skip dates up to (incl)	1953-09-19 00:00:00	(2 days)
[Метрополь	АРЕНА СМЕЛЫХ	1953-09-21 00:00:00]:	Skip dates up to (incl)	1953-09-22 00:00:00	(2 days)
[Метрополь	ЗВЕЗДА	1953-09-21 00:00:00]:	Skip dates up to (incl)	1953-09-26 00:00:00	(6 days)
[Метрополь	ЗВЕЗДА	1953-09-28 00:00:00]:	Skip dates up to (incl)	1953-09-30 00:00:00	(3 days)
[Метрополь	ЛЕВ ТОЛСТОЙ	1953-09-07 00:00:00]:	Skip dates up to (incl)	1953-09-12 00:00:00	(6 days)
[Метрополь	ЛЕВ ТОЛСТОЙ	1953-09-16 00:00:00]:	Skip dates up to (incl)	1953-09-16 00:00:00	(1 days)
[Метрополь	НАД НЕМАНО

In [8]:
# Tabulate the detected screening runs: one row per run
screening_df = pd.DataFrame(screening_info, columns=['Cinema', 'First day of screening', 'Screening Days', 'Title'])

if os.path.exists(save_filepath):
    # Never overwrite an earlier result; the user has to remove it or pick another path
    print(f"The save filepath '{save_filepath}' already exists. Either delete it or specify a different save location (See `save_filepath` variable above).")
else:
    print(f"Saving the screening days analysis to '{save_filepath}'")
    # Create the directory that `save_filepath` actually points into, so that changing the save
    # location (as the message above suggests) also works for directories other than ./results.
    # A bare file name has no directory part and needs nothing created.
    save_dir = os.path.dirname(save_filepath)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    screening_df.to_csv(save_filepath, index=False)

Saving the screening days analysis to './results/screening.csv'
