Load required packages

In [14]:
import pandas as pd

Get filepaths for parliament reports

In [15]:
# (path, session date) pairs for each parliament report to load.
# Forward slashes are used so the relative paths resolve on Windows, Linux and
# macOS alike (a backslash is only a path separator on Windows).
file_paths = [
    ('../data/raw/scottish_parliament_report_07_01_25.txt', '07/01/25'),
    ('../data/raw/scottish_parliament_report_08_10_24.txt', '08/10/24'),
    ('../data/raw/scottish_parliament_report_10_09_24.txt', '10/09/24'),
    ('../data/raw/scottish_parliament_report_26_06_24.txt', '26/06/24')
]

Define methods for data processing

In [16]:
# TODO: move these helper functions into a service class

def load_file(filename : str) -> list:
    """
    Read a UTF-8 text file and return its lines.

    Parameters:
    filename (str): Path of the file to read.

    Returns:
    list: Every line of the file, in order, each keeping its trailing
    newline character (the final line has one only if the file does).
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        # Iterating a text file yields one line at a time, newline included.
        return list(handle)

def _parse_block(block : list) -> tuple:
    """
    Convert one non-empty block of consecutive lines into a (name, speech, time) record.

    Parameters:
    block (list): The lines of a single block, in file order. Must not be empty.

    Returns:
    tuple: (name, speech, time) with surrounding whitespace stripped and every
    colon removed from the name.
    """
    if len(block) > 2:
        # Three or more lines: time stamp, speaker name, speech (extra lines are ignored).
        time, name, speech = block[0], block[1], block[2]
    else:
        # One or two lines: no time stamp; a lone line is a name with an empty speech.
        time = ""
        name = block[0]
        speech = block[1] if len(block) > 1 else ""
    return name.replace(":", "").strip(), speech.strip(), time.strip()


def split_lines(lines : list) -> tuple:
    """
    Split the lines of a parliament report into three parallel lists: names, speeches and times.

    The input is treated as a sequence of blocks separated by blank lines (a line
    consisting of a single newline character). The number of lines in a block
    determines its layout:
      - more than two lines: time stamp, speaker name, speech;
      - otherwise: speaker name, speech (the time is recorded as an empty string).

    Empty blocks (a leading blank line or several blank lines in a row) are skipped,
    and a final block is kept even when the input does not end with a blank line.

    Parameters:
    lines (list): A list of strings from a loaded file.

    Returns:
    tuple: A tuple of three lists of equal length: names, speeches and times.
    """
    names = []
    speeches = []
    times = []

    def flush(block):
        # Ignore empty blocks so repeated or leading separators create no records.
        if not block:
            return
        name, speech, time = _parse_block(block)
        names.append(name)
        speeches.append(speech)
        times.append(time)

    block = []
    for line in lines:
        if line != '\n':
            block.append(line)
        else:
            flush(block)
            block = []
    # Emit the last block when the input lacks a terminating blank line.
    flush(block)

    return names, speeches, times

def process_file(file_path : str, date : str) -> pd.DataFrame:
    """
    Build a DataFrame of speeches from a single parliament report file.

    The file is read with load_file, split into names, speeches and times with
    split_lines, and every resulting row is tagged with the session date.

    Parameters:
    file_path (str): Path of the report file to read.
    date (str): Date of the parliament session, stored unchanged in every row.

    Returns:
    pd.DataFrame: One row per record, with columns 'Name', 'Speech', 'Time' and 'Date'.
    """
    names, speeches, times = split_lines(load_file(file_path))
    report = pd.DataFrame({'Name': names, 'Speech': speeches, 'Time': times})
    # The same session date applies to every row of the report.
    return report.assign(Date=date)


Run data processing

In [20]:
# Load and process every report listed in file_paths
dfs = [process_file(file_path, date) for file_path, date in file_paths]
# Combine the per-report DataFrames into a single one with a fresh index
combined_df = pd.concat(dfs, ignore_index=True)
# Save the combined data to CSV. Forward slashes keep the relative path valid on
# Windows, Linux and macOS (on POSIX a backslash is an ordinary filename character,
# so a backslash path would create an oddly named file in the working directory).
combined_df.to_csv('../data/combined_parliament_reports.csv', index=False)
combined_df.head()

Unnamed: 0,Name,Speech,Time,Date
0,The Convener,The next item on our agenda is to take evidenc...,,07/01/25
1,Stephen Boyle (Auditor General for Scotland),"Many thanks, convener, and happy new year to t...",,07/01/25
2,Stephen Boyle (Auditor General for Scotland),The report found that the Scottish Government ...,,07/01/25
3,Stephen Boyle (Auditor General for Scotland),Although my report was published prior to the ...,,07/01/25
4,Stephen Boyle (Auditor General for Scotland),The Scottish Government continues to face many...,,07/01/25
