In [8]:
import pandas as pd
from pathlib import Path

# Work out the project root from the working directory. A kernel started inside
# `notebooks/` sits one level below the root; in every other case the working
# directory itself is used, with a warning when it holds no `data/` folder.
current_dir = Path.cwd()
started_in_notebooks_dir = current_dir.name == 'notebooks'

if not started_in_notebooks_dir and not (current_dir / 'data').is_dir():
    print("Warning: Could not reliably determine project root. Adjust paths if needed.")

project_root = current_dir.parent if started_in_notebooks_dir else current_dir

# Everything read by this cell lives under <project_root>/data/raw.
raw_data_dir = project_root.joinpath('data', 'raw')
metadata_file_path = raw_data_dir.joinpath('metadata.csv')

for path_report in (
    f"Project root determined as: {project_root}",
    f"Raw data directory: {raw_data_dir}",
    f"Metadata file path: {metadata_file_path}",
):
    print(path_report)

print("\n--- Step 1: Metadata load ---")
# Load the metadata CSV into `metadata_df` and print a short overview of it.
# The frame starts out empty so it is defined whichever way loading fails:
# Steps 2 and 3 test `metadata_df.empty`, and an unassigned name there would
# raise NameError on a fresh kernel (or pick up a frame left over from an
# earlier run of this cell).
metadata_df = pd.DataFrame()
try:
    metadata_df = pd.read_csv(metadata_file_path, sep=',')
    print("Metadata loaded successfully. First 5 fows:")
    display(metadata_df.head())
    print("\nDataframe file info (metadata_df.info()):")
    metadata_df.info()

    print(f"\nAll entires in metadatafile: {len(metadata_df)}")
    if 'author' in metadata_df.columns:
        print(f"Number of unique authors in metadata: {metadata_df['author'].nunique()}")
    else:
        print("Error: Column 'author' has not been found in the file. Unique authors cannot be counted.")

except FileNotFoundError:
    # `metadata_df` keeps the empty frame assigned above.
    print(f"Error: metadata file has not been found under: {metadata_file_path}")
    print("Please ensure that the file exists and the path is correct")
except Exception as e:
    # Deliberately broad so the later steps still run and report that they were
    # skipped; the error is printed rather than swallowed.
    print(f"Unexpected error has occurred during metadata file load: {e}")

print("\n--- Step 2: txt files verification ---")
# Compare the file names listed in the metadata 'filename' column with the .txt
# files present in the raw data directory, and report mismatches in both
# directions.
if not metadata_df.empty:
    if 'filename' not in metadata_df.columns:
        print("Error: Column 'filename' has not been found in the file. Txt files cannot be verified.")
    else:
        # A missing filename is read by pandas as NaN; left in the set it would
        # be listed as a missing file called "nan", so such rows are counted
        # and reported separately instead.
        filename_column = metadata_df['filename']
        rows_without_filename = int(filename_column.isna().sum())
        if rows_without_filename:
            print(f"\nWarning: {rows_without_filename} metadata row(s) have no filename and have been skipped.")
        expected_filenames_from_meta = set(filename_column.dropna().astype(str))

        if not raw_data_dir.is_dir():
            print(f"Error: raw data directory has not been found under: {raw_data_dir}")
            actual_txt_files_on_disk = set()  # Empty set if the directory does not exist.
        else:
            # We are looking only for the .txt files.
            all_files_in_raw_dir = [entry.name for entry in raw_data_dir.iterdir() if entry.is_file()]
            actual_txt_files_on_disk = {name for name in all_files_in_raw_dir if name.endswith('.txt')}
            print(f"Verification found {len(actual_txt_files_on_disk)} .txt files under directory {raw_data_dir}.")

            # Metadata file verification (if all files exist)
            missing_on_disk = expected_filenames_from_meta - actual_txt_files_on_disk
            if missing_on_disk:
                print("\nWarning: following files have been mentioned in the metadata file BUT, they have not been found in directory:")
                # Sets have no stable order; sort so repeated runs print the same listing.
                for f in sorted(missing_on_disk):
                    print(f"- {f}")
            else:
                print("\nAll files mentioned in metadata file have been found in directory.")

            # Raw data verification (if in metadata)
            extra_on_disk = actual_txt_files_on_disk - expected_filenames_from_meta
            if extra_on_disk:
                print("\nWarning: following files have been found in the directory BUT, they have not been mentioned in the metadata file:")
                for f in sorted(extra_on_disk):
                    print(f"- {f}")
            else:
                print("\nAll .txt files in directory are present in the metadata file.")
else:
    print("Txt files verification has been skipped as metadata has not been loaded.")

print("\n--- Step 3: Load and Display piece of text from raw data ---")
# Print the beginning of the first text listed in the metadata as a quick check
# that the raw files can be read.
# Only rows that carry a filename can be opened: a missing value (NaN) cannot
# be joined onto a path, so the first row that has one is used.
if 'filename' in metadata_df.columns:
    rows_with_filename = metadata_df.dropna(subset=['filename'])
else:
    rows_with_filename = metadata_df.iloc[0:0]

if not rows_with_filename.empty:
    sample_entry = rows_with_filename.iloc[0]
    sample_filename = str(sample_entry['filename'])
    sample_author = sample_entry.get('author', 'N/A')
    sample_title = sample_entry.get('title', 'N/A')

    sample_text_path = raw_data_dir / sample_filename

    if sample_text_path.is_file():
        try:
            # Important: Encoding specification
            with open(sample_text_path, 'r', encoding='utf-8') as f:
                text_content = f.read()

            print(f"\n--- Sample text: {sample_title} by {sample_author} from ({sample_filename}) ---")
            # Only the first 500 characters are displayed to not overload the
            # output; an ellipsis marks a truncated text.
            preview_limit = 500
            if len(text_content) > preview_limit:
                print(text_content[:preview_limit] + "...")
            else:
                print(text_content)
            print("--------------------------------------------------------------------------------")
        except UnicodeDecodeError:
            # Reading the file failed, i.e. its bytes are not valid utf-8.
            print(f"Error: File '{sample_text_path}' cannot be decoded as utf-8. Please verify file encoding and try again.")
        except Exception as e:
            print(f"Unexpected error has occurred during sample text load '{sample_text_path}': {e}")
    else:
        print(f"Error: Sample text file '{sample_filename}' has not been found under: {sample_text_path}")
else:
    print("No metadata or files to display.")

Project root determined as: C:\Users\tomas\PycharmProjects\KoineML_Authorship_Classifier
Raw data directory: C:\Users\tomas\PycharmProjects\KoineML_Authorship_Classifier\data\raw
Metadata file path: C:\Users\tomas\PycharmProjects\KoineML_Authorship_Classifier\data\raw\metadata.csv

--- Step 1: Metadata load ---
Metadata loaded successfully. First 5 fows:


Unnamed: 0,Text_ID,filename,author,title,period,genre,notes,authorship
0,LUKEGOS1,luke_gospel.txt,Luke,Gospel According to Luke,NT Koine,Gospel,,certain
1,PAULROM1,paul_romans.txt,Paul,Letter to Romans,NT Koine,Letter,,certain
2,PAULCOR1,paul_corinthians.txt,Paul,1st Letter to Corithians,NT Koine,Letter,,certain
3,NNHEB1,hebrew.txt,Not known,Letter to Hebrew,NT Koine,Letter,Goal text to determine the authorship,in doubt



Dataframe file info (metadata_df.info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Text_ID     4 non-null      object
 1   filename    4 non-null      object
 2   author      4 non-null      object
 3   title       4 non-null      object
 4   period      4 non-null      object
 5   genre       4 non-null      object
 6   notes       4 non-null      object
 7   authorship  4 non-null      object
dtypes: object(8)
memory usage: 384.0+ bytes

All entires in metadatafile: 4
Number of unique authors in metadata: 3

--- Step 2: txt files verification ---
Verification found 4 .txt files under directory C:\Users\tomas\PycharmProjects\KoineML_Authorship_Classifier\data\raw.

All files mentioned in metadata file have been found in directory.

All .txt files in directory are present in the metadata file.

--- Step 3: Load and Display piece of text from r