The newspaper was first published on 15 May 1946. We have news data from May 1946 to December 2023, taken from https://github.com/prnake/CialloCorpus or https://huggingface.co/datasets/Papersnake/people_daily_news. January 1970 is missing from that source, so that month is taken from https://github.com/caspiankexin/people-daily-crawler-date instead. (Around half of the data between 1991 and 1992 is missing from this second source, so we gave up using it as the main source.)

In [74]:
import json
import os
import csv
from collections import defaultdict
import statistics
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, ScalarFormatter


def process_news_data(file_path):
    """Compute summary statistics for one year's JSONL news file.

    The year is read from the file name (the part before the first dot,
    e.g. ``1946.jsonl`` -> 1946). Every non-blank line is parsed as a JSON
    object; a record is counted only when both its ``date`` and ``text``
    values are non-empty. The month is the second dot-separated field of
    ``date``, and an article's length is ``len(text)``.

    A message is printed when any of the months 1-12 has no counted record.

    Args:
        file_path: Path to a file named ``<year>.jsonl``.

    Returns:
        ``None`` when the file contains no counted record; otherwise a dict
        with the keys ``year``, ``max_month``, ``max_count``, ``min_month``,
        ``min_count``, ``avg_count``, ``total_count``, ``longest``,
        ``shortest``, ``avg_length`` and ``total_length``.

    Raises:
        ValueError: If the file name does not start with an integer year, a
            non-blank line is not valid JSON, or the month field of a date
            is not an integer.
        IndexError: If a non-empty ``date`` contains no dot.
    """
    # basename() copes with whatever separator the platform's os.path.join
    # produced, unlike a hard-coded split on '/'.
    year = int(os.path.basename(file_path).split('.')[0])
    monthly_news_counts = defaultdict(int)
    news_lengths = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not records; json.loads('') would raise on them.
                continue
            news_item = json.loads(line)
            date = news_item.get('date')
            text = news_item.get('text', '')
            if date and text:
                month = int(date.split('.')[1])
                monthly_news_counts[month] += 1
                news_lengths.append(len(text))

    if not monthly_news_counts:
        return None

    # A membership test on a defaultdict does not insert the key.
    missing_month = [m for m in range(1, 13) if m not in monthly_news_counts]
    if missing_month:
        print('Months', str(missing_month), 'are missing for year', str(year))

    max_news_month = max(monthly_news_counts, key=monthly_news_counts.get)
    min_news_month = min(monthly_news_counts, key=monthly_news_counts.get)
    total_news_count = sum(monthly_news_counts.values())
    # NOTE: the average is always taken over 12 months, even for years in
    # which some months are missing from the data.
    avg_news_per_month = total_news_count / 12

    # news_lengths is non-empty here: it grows in lock-step with
    # monthly_news_counts, which was just checked to be non-empty.
    return {
        'year': year,
        'max_month': max_news_month,
        'max_count': monthly_news_counts[max_news_month],
        'min_month': min_news_month,
        'min_count': monthly_news_counts[min_news_month],
        'avg_count': avg_news_per_month,
        'total_count': total_news_count,
        'longest': max(news_lengths),
        'shortest': min(news_lengths),
        'avg_length': statistics.mean(news_lengths),
        'total_length': sum(news_lengths)
    }


def process_all_years(data_directory):
    """Gather yearly statistics for every ``.jsonl`` file in a directory.

    Each matching file is passed to ``process_news_data``; files for which
    that function returns a falsy result (no usable records) are left out.

    Args:
        data_directory: Directory to scan (not recursive).

    Returns:
        A list of the per-year statistics dicts, ordered by ascending
        ``year``.
    """
    yearly_stats = []
    # os.listdir() yields entries in arbitrary order; iterate in sorted order
    # so any messages printed while processing appear deterministically.
    for filename in sorted(os.listdir(data_directory)):
        if not filename.endswith('.jsonl'):
            continue
        stats = process_news_data(os.path.join(data_directory, filename))
        if stats:
            yearly_stats.append(stats)
    # Consumers that write rows or draw a line through the points need the
    # years in chronological order, independent of file naming.
    yearly_stats.sort(key=lambda stats: stats['year'])
    return yearly_stats


def write_stats_to_csv(yearly_stats, output_file):
    """Dump the per-year statistics to a CSV file.

    A header row is written first, followed by one row per entry of
    ``yearly_stats`` in the order given.

    Args:
        yearly_stats: Iterable of dicts keyed by the column names below.
        output_file: Destination path; the file is created or overwritten
            and encoded as UTF-8.
    """
    columns = [
        'year',
        'max_month', 'max_count',
        'min_month', 'min_count',
        'avg_count', 'total_count',
        'longest', 'shortest',
        'avg_length', 'total_length',
    ]
    # newline='' leaves line-ending handling to the csv module.
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(yearly_stats)


def plot_stats(yearly_stats, output_file='raw_stats.png'):
    """Plot total article count and total text length per year.

    Draws two side-by-side line charts (count on the left, length on the
    right), saves the figure to ``output_file`` and closes it.

    Args:
        yearly_stats: Iterable of dicts with at least the keys ``year``,
            ``total_count`` and ``total_length``.
        output_file: Path of the image to write. Defaults to
            ``'raw_stats.png'``.
    """
    # A line plot joins points in the order given, so sort by year first;
    # otherwise unordered input produces a zig-zag line.
    ordered = sorted(yearly_stats, key=lambda stat: int(stat['year']))
    years = [int(stat['year']) for stat in ordered]
    total_counts = [stat['total_count'] for stat in ordered]
    total_lengths = [stat['total_length'] for stat in ordered]

    fig, (count_ax, length_ax) = plt.subplots(1, 2, figsize=(12, 4))

    count_ax.plot(years, total_counts, marker='o')
    count_ax.set_title('Total News Count by Year')
    count_ax.set_xlabel('year')
    count_ax.set_ylabel('Total News Count (1e3)')
    # Show tick values in thousands, e.g. 20000 -> "20k".
    count_ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{int(x/1e3)}k'))

    length_ax.plot(years, total_lengths, marker='o', color='r')
    length_ax.set_title('Total News Length by Year')
    length_ax.set_xlabel('year')
    length_ax.set_ylabel('Total News Length (1e6)')
    # Show tick values in millions, e.g. 3000000 -> "3m".
    length_ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{int(x/1e6)}m'))

    fig.tight_layout()
    fig.savefig(output_file)
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

In [75]:
# Directory containing the raw per-year .jsonl corpus files (relative path).
rawdir = '../../corpus/raw/'
# Compute statistics for every year; a message is printed for each year
# that has months with no data.
yearly_stats = process_all_years(rawdir)

Months [1, 2, 3, 4] are missing for year 1946
Months [1] are missing for year 1971


In [76]:
# Save the per-year statistics as a CSV file in the current working directory.
csv_file = 'raw_stats.csv'
write_stats_to_csv(yearly_stats, csv_file)

In [77]:
# Plot total news count and total news length per year; the figure is saved
# as raw_stats.png.
plot_stats(yearly_stats)