In [1]:
from google.colab import drive
import pandas as pd
import glob

def assign_president_party(df, president_terms):
    """Add a ``president_party`` column to *df* based on ``action_date``.

    ``df['action_date']` is converted to datetime in place. Every term in
    *president_terms* is then applied in ascending ``Start_date`` order with
    both bounds inclusive, so a date shared by two consecutive terms ends up
    with the party of the later-starting term. Rows whose date falls inside
    no term (including missing dates) keep ``None``.

    A term with a missing ``End_date`` is treated as open-ended: it matches
    every date on or after its ``Start_date``. Without this, comparing
    against a missing end date is always false and such a term would never
    label any row.

    Args:
        df: DataFrame with an ``action_date`` column; modified in place.
        president_terms: DataFrame with ``Start_date``, ``End_date`` and
            ``Party`` columns. It is not modified.

    Returns:
        The same *df* object, with ``action_date`` as datetime and the new
        ``president_party`` column.
    """
    # Ensure action_date is in datetime format
    df['action_date'] = pd.to_datetime(df['action_date'], format='mixed')
    df['president_party'] = None

    # Coerce the term bounds before sorting so the order is chronological
    # even if the spreadsheet delivered the dates as text. ``assign`` works
    # on a copy, leaving the caller's frame untouched.
    terms = president_terms.assign(
        Start_date=pd.to_datetime(president_terms['Start_date']),
        End_date=pd.to_datetime(president_terms['End_date']),
    ).sort_values('Start_date')

    action_dates = df['action_date']
    for start, end, party in zip(
        terms['Start_date'], terms['End_date'], terms['Party']
    ):
        if pd.isna(end):
            # Open-ended term: no upper bound to test against.
            in_term = action_dates >= start
        else:
            in_term = action_dates.between(start, end)
        # Later terms overwrite earlier ones on shared boundary dates.
        df.loc[in_term, 'president_party'] = party

    return df


def processed_file_name(file_path):
    """Return the output file name for an input Parquet path.

    The directory part (split on ``/``) is dropped and only the trailing
    ``.parquet`` extension is replaced by ``_processed.parquet``; any other
    occurrence of ``.parquet`` inside the name is left alone.
    """
    suffix = '.parquet'
    base_name = file_path.split('/')[-1]
    stem = base_name[:-len(suffix)] if base_name.endswith(suffix) else base_name
    return f'{stem}_processed{suffix}'


# A notebook cell (or a script run directly) executes as ``__main__``, so the
# driver below still runs there; the guard only keeps the Drive I/O from firing
# when the helpers above are imported. Names are deliberately left at module
# level so they remain available to later cells.
if __name__ == '__main__':
    # Mount Google Drive
    drive.mount('/content/drive')

    # Set the path to the shared folder on your Google Drive
    data_folder = '/content/drive/My Drive/0812Fullmerged/'

    # Step 1: Load the president term data from Excel
    president_terms_file = '/content/drive/My Drive/Data/president_terms.xlsx'
    president_terms = pd.read_excel(president_terms_file)

    # Sort president terms by start date for correct behavior
    president_terms = president_terms.sort_values('Start_date')

    # Get all Parquet files in the shared folder; sorted because glob returns
    # them in arbitrary order and a fixed order makes runs reproducible.
    all_files = sorted(glob.glob(data_folder + "*.parquet"))

    # Step 2: Loop through each Parquet file, process it, and save the results
    for file_path in all_files:
        # Load the Parquet file and label every row with the president's party
        df = pd.read_parquet(file_path)
        df = assign_president_party(df, president_terms)

        # Get the file name from the file path (to use in output)
        file_name = processed_file_name(file_path)

        # Save the processed data with a new name
        output_file = f'/content/drive/My Drive/Data/{file_name}'
        df.to_parquet(output_file, index=False)

        print(f'Processed and saved file: {output_file}')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed and saved file: /content/drive/My Drive/Data/0812_merged_chunk_1_processed.parquet


KeyboardInterrupt: 