# Script to group C-CLAMP's corpus files by author
This script must be located in the same folder as:
1) `C-CLAMP_metadata.txt`
2) the folder `corpus_final_ids`

In [3]:
# Load the necessary packages
import pandas as pd
import re
import os
import shutil

In [5]:
# Read the tab-separated metadata table (header=None: columns get integer labels)
metadata_df = pd.read_csv(
    "C-CLAMP_metadata.txt",
    sep="\t",
    encoding="utf-8",
    header=None,
)
print(metadata_df)

                  0     1                                                  2  \
0        GEM_1925_1  1925                                                NaN   
1        GEM_1925_2  1925                       Gedachten over architectuur.   
2        GEM_1925_3  1925  Sint Maartensavond Bij een linoleumsnede van J...   
3        GEM_1925_4  1925                                                NaN   
4        GEM_1925_5  1925  Is de nieuwe meerstemmige kerkmuziek in Nederl...   
...             ...   ...                                                ...   
52926   STR_1947_96  1947  Economische kroniek Groot-Brittannie's betalin...   
52927   STR_1947_97  1947  Internationale kroniek Frankrijks roeping door...   
52928   STR_1947_98  1947         Kroniek der tijdschriften door G. de Vocht   
52929   STR_1947_99  1947                                     Boekbespreking   
52930  STR_1947_100  1947                                                NaN   

                                       

In [7]:
# Replace the positional column labels (0-8) with descriptive names
metadata_df = metadata_df.rename(
    columns=dict(enumerate([
        'textID', 'year', 'title', 'author', 'birthDate',
        'birthPlace', 'deathDate', 'deathPlace', 'link',
    ]))
)
metadata_df.head(5)

Unnamed: 0,textID,year,title,author,birthDate,birthPlace,deathDate,deathPlace,link
0,GEM_1925_1,1925,,,,,,,
1,GEM_1925_2,1925,Gedachten over architectuur.,Jan Engelman; Willem Maas,7 juni 1900; 28 april 1897,Utrecht; Utrecht,20 maart 1972; 6 mei 1950,Amsterdam; Utrecht,https://www.dbnl.org/auteurs/auteur.php?id=eng...
2,GEM_1925_3,1925,Sint Maartensavond Bij een linoleumsnede van J...,Joep Nicolas,6 oktober 1897,,25 juli 1972,Steyl,https://www.dbnl.org/auteurs/auteur.php?id=nic...
3,GEM_1925_4,1925,,,,,,,
4,GEM_1925_5,1925,Is de nieuwe meerstemmige kerkmuziek in Nederl...,Albert Helman,7 november 1903,Paramaribo,10 juli 1996,Amsterdam,https://www.dbnl.org/auteurs/auteur.php?id=hel...


In [9]:
# Keep only texts attributed to exactly one author:
# first discard rows without an author, then rows whose author field
# lists several names separated by ';'
metadata_filtered_df = (
    metadata_df
    .dropna(subset=['author'])
    .loc[lambda frame: ~frame['author'].str.contains(';', na=False)]
)

In [11]:
# Extract the identifiers
# The identifier is the run of word characters following the last '=' in the
# link (the regex's leading '.*' is greedy), e.g. '...auteur.php?id=nico008'
# yields 'nico008'. It is lowercased so differently-cased links group together.
# A single vectorised assignment is enough; `assign` returns a new DataFrame,
# so no value is written into a slice of `metadata_df`.
metadata_filtered_df = metadata_filtered_df.assign(
    identifier=metadata_filtered_df['link']
    .str.extract(r'.*=(\w*)', expand=False)
    .str.lower()
)

metadata_filtered_df.head(5)

Unnamed: 0,textID,year,title,author,birthDate,birthPlace,deathDate,deathPlace,link,identifier
2,GEM_1925_3,1925,Sint Maartensavond Bij een linoleumsnede van J...,Joep Nicolas,6 oktober 1897,,25 juli 1972,Steyl,https://www.dbnl.org/auteurs/auteur.php?id=nic...,nico008
4,GEM_1925_5,1925,Is de nieuwe meerstemmige kerkmuziek in Nederl...,Albert Helman,7 november 1903,Paramaribo,10 juli 1996,Amsterdam,https://www.dbnl.org/auteurs/auteur.php?id=hel...,helm003
5,GEM_1925_6,1925,Dagelijksch brood en dagelijksch leven.,Willem Nieuwenhuis,1886,,1935,,https://www.dbnl.org/auteurs/auteur.php?id=nie...,nieu047
6,GEM_1925_7,1925,In Memoriam Fratris,Karel van den Oever,19 november 1879,Antwerpen,6 oktober 1926,Antwerpen,https://www.dbnl.org/auteurs/auteur.php?id=oev...,oeve003
8,GEM_1925_9,1925,Koraal.,Wies Moens,28 januari 1898,Sint-Gillis-Bij-Dendermonde,5 februari 1982,Geleen,https://www.dbnl.org/auteurs/auteur.php?id=moe...,moen007


In [13]:
## Group the corpus files by author, i.e., identifier
# Define the source and destination directories
source = 'corpus_final_ids'  # Directory with original files
destination = 'cclamp_author/corpus'  # New directory to store selected files

# If needed, create the destination directory
os.makedirs(destination, exist_ok=True)

# List the source directory once. A set gives constant-time membership checks,
# instead of re-reading and scanning the directory for every single textID.
available_files = set(os.listdir(source))

# Group the textID values by identifier: {identifier: [textID, ...]}
grouped = metadata_filtered_df.groupby('identifier')['textID'].apply(list).to_dict()

# Loop through the identifiers in the created dictionary grouped
for identifier, textIDs in grouped.items():
    author_files_content = []

    # Loop through the textIDs and extract the files' content
    for textID in textIDs:
        filename = f"{textID}.txt"
        # Texts listed in the metadata but absent from the corpus folder are skipped
        if filename not in available_files:
            continue
        source_file = os.path.join(source, filename)
        with open(source_file, 'r', encoding='utf-8') as in_file:
            author_files_content.append(in_file.read())

    # Once the loop completed for a single author, write all content to a new .txt file
    # (authors for whom no corpus file was found get no output file)
    if author_files_content:
        destination_file = os.path.join(destination, f"{identifier}.txt")
        with open(destination_file, 'w', encoding='utf-8') as out_file:
            # Individual texts are separated by one blank line
            out_file.write("\n\n".join(author_files_content))