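The year 1971 is handled separately: its January data, stored as individual text files in a separate directory, is added to the output before the rest of that year's data.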

In [17]:
import csv
import json
import os

from ltp import StnSplit

# Folder scanned for per-year ``<year>.jsonl`` input files.
rawdir = '../../corpus/raw/'
# Folder that receives one ``<year>.txt`` file of split sentences per input file.
sentdir = '../../corpus/sent'
# Folder walked recursively for extra text files when the year is '1971'.
jan1971dir = '../../corpus/jan-1971'
# Destination of the per-year sentence-length statistics.
csv_file = 'sent_stats.csv'

# Make sure the output folder exists before any sentence file is written.
os.makedirs(sentdir, exist_ok=True)

# Shared sentence splitter from the ltp package; its ``split`` method is
# called on every piece of text below.
stn_split = StnSplit()

def _sorted_file_paths(top):
    """Return the path of every file under *top* in a deterministic order."""
    paths = []
    for root, dirs, files in os.walk(top):
        # Sorting ``dirs`` in place makes os.walk descend in sorted order too,
        # so the result no longer depends on filesystem enumeration order.
        dirs.sort()
        paths.extend(os.path.join(root, name) for name in sorted(files))
    return paths


def _write_and_tally(sentences, outfile, tally):
    """Write each non-blank sentence on its own line and update *tally*."""
    for sentence in sentences:
        sent = sentence.strip()
        if not sent:
            continue
        length = len(sent)
        tally['min_length'] = min(tally['min_length'], length)
        tally['max_length'] = max(tally['max_length'], length)
        tally['total_length'] += length
        tally['total_sentences'] += 1
        outfile.write(sent + '\n')


def split_and_write_sentences(indir, outdir, year, *, splitter=None,
                              january_dir=None):
    """Split one year's news text into sentences and collect length statistics.

    Each line of the JSONL file *indir* is parsed as a JSON object and the
    value of its ``'text'`` key, when non-empty, is split into sentences.
    When *year* is ``'1971'``, every line of every text file found under
    *january_dir* is split and written first, ahead of the JSONL content.
    Sentences are stripped, blank ones are dropped, and the rest are written
    one per line.

    NOTE: the output file is opened in append mode, so calling this twice
    with the same *outdir* appends the sentences a second time.

    Args:
        indir: Path of the input ``.jsonl`` file (a file, despite the name).
        outdir: Path of the output text file (a file, despite the name).
        year: Year label as a string; only ``'1971'`` triggers special
            handling.
        splitter: Object with a ``split(text)`` method returning an iterable
            of sentence strings. Defaults to the module-level ``stn_split``.
        january_dir: Folder holding the extra text files for 1971. Defaults
            to the module-level ``jan1971dir``.

    Returns:
        A dict with ``'min_length'``, ``'max_length'``, ``'avg_length'`` and
        ``'total_sentences'``; lengths are in characters, and all values are
        0 when no sentence was written.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    if splitter is None:
        splitter = stn_split

    tally = {
        'min_length': float('inf'),
        'max_length': 0,
        'total_length': 0,
        'total_sentences': 0,
    }

    # Open the output once for the whole run instead of once per source file.
    with open(outdir, 'a', encoding='utf-8') as outfile:
        if year == '1971':
            if january_dir is None:
                january_dir = jan1971dir
            # Paths stay ``str``; encoding them to bytes is unnecessary and
            # would break on filesystems that are not UTF-8.
            for path in _sorted_file_paths(january_dir):
                with open(path, 'r', encoding='utf-8') as txt_file:
                    for line in txt_file:
                        _write_and_tally(
                            splitter.split(line.strip()), outfile, tally)

        with open(indir, 'r', encoding='utf-8') as infile:
            for line in infile:
                record = line.strip()
                if not record:
                    # Blank lines (e.g. a trailing newline) are not JSON;
                    # skip them rather than crash in json.loads.
                    continue
                text = json.loads(record).get('text', '')
                if text:
                    _write_and_tally(splitter.split(text), outfile, tally)

    count = tally['total_sentences']
    return {
        'min_length': tally['min_length'] if count else 0,
        'max_length': tally['max_length'],
        'avg_length': tally['total_length'] / count if count else 0,
        'total_sentences': count,
    }


def process_all_years_and_write(input_folder, output_folder, stats_file):
    """Split every ``.jsonl`` file in a folder and write summary statistics.

    Each ``<year>.jsonl`` file directly inside *input_folder* is passed to
    ``split_and_write_sentences``, which writes ``<year>.txt`` into
    *output_folder*. The statistics returned for each file are then written
    to *stats_file* as CSV, one row per file.

    Args:
        input_folder: Folder containing the ``.jsonl`` input files.
        output_folder: Folder for the sentence files; created if missing.
        stats_file: Path of the CSV file to write (overwritten if present).

    Returns:
        None.
    """
    # Do not rely on the caller having created the output folder already.
    os.makedirs(output_folder, exist_ok=True)

    all_stats = []
    # os.listdir returns names in arbitrary order; sort them so the CSV rows
    # come out in the same order on every run.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.jsonl'):
            continue
        # The year label is everything before the first dot in the file name.
        year = filename.split('.')[0]
        input_file = os.path.join(input_folder, filename)
        output_file = os.path.join(output_folder, f'{year}.txt')

        stats = split_and_write_sentences(input_file, output_file, year)
        stats['year'] = year
        all_stats.append(stats)

    # Write statistics to a CSV file
    fieldnames = ['year', 'min_length', 'max_length', 'avg_length', 'total_sentences']
    # newline='' lets the csv module control line endings itself.
    with open(stats_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_stats)

In [18]:
# Process every .jsonl file in rawdir, write the sentence files into sentdir
# and the per-year statistics into csv_file.
process_all_years_and_write(rawdir, sentdir, csv_file)