In [1]:
import pandas as pd
import os
import glob
import re

# Set the working directory - corrected path
data_dir = r'C:\Users\clint\Desktop\Lifecycle_RA\Data\webplotdigitizer'
os.chdir(data_dir)

# Function to extract age from filename
def extract_age(filename):
    match = re.search(r'(\d+)YO', filename)
    if match:
        return f"{match.group(1)}YO"
    return None

# Function to load and process a CSV file
def process_csv(file_path):
    try:
        # Read CSV, assuming no header, first column is date, second column is price
        df = pd.read_csv(file_path, header=None, names=['Date', 'Price'])
        
        # Convert date strings to datetime objects
        df['Date'] = pd.to_datetime(df['Date'], format='%Y/%m/%d')
        
        # Extract age from filename
        age = extract_age(os.path.basename(file_path))
        
        if age:
            return df, age
        else:
            print(f"Could not extract age from {file_path}")
            return None, None
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None

# Get all CSV files in the directory
csv_files = glob.glob(os.path.join(data_dir, "*.csv"))

# Print the list of CSV files found for debugging
print(f"Found {len(csv_files)} CSV files:")
for file in csv_files[:5]:  # Show first 5 files to avoid lengthy output
    print(f"  - {os.path.basename(file)}")
if len(csv_files) > 5:
    print(f"  ... and {len(csv_files) - 5} more files")

# Dictionary to store DataFrames by age
age_dataframes = {}

# Process each CSV file
for file_path in csv_files:
    df, age = process_csv(file_path)
    if df is not None and age is not None:
        if age in age_dataframes:
            age_dataframes[age].append(df)
        else:
            age_dataframes[age] = [df]

# For each age, combine DataFrames and average prices for the same dates
combined_data = {}
for age, dfs in age_dataframes.items():
    if dfs:
        # Concatenate all DataFrames for this age
        combined_df = pd.concat(dfs, ignore_index=True)
        
        # Group by date and calculate mean price
        combined_df = combined_df.groupby('Date')['Price'].mean().reset_index()
        
        combined_data[age] = combined_df

# Create final DataFrame with all ages
# Start with any age as the base
if combined_data:
    ages = list(combined_data.keys())
    final_df = combined_data[ages[0]].rename(columns={'Price': ages[0]})
    
    # Merge with other ages
    for age in ages[1:]:
        temp_df = combined_data[age].rename(columns={'Price': age})
        final_df = pd.merge(final_df, temp_df, on='Date', how='outer')
    
    # Sort by date
    final_df = final_df.sort_values('Date')
    
    # Display the first few rows of the final DataFrame
    print("First few rows of the combined data:")
    print(final_df.head())
    
    # Save to a new CSV file
    output_file = os.path.join(data_dir, "combined_digitizer_data.csv")
    final_df.to_csv(output_file, index=False)
    print(f"\nCombined data saved to {output_file}")
else:
    print("No data was processed successfully.")

# Summary of ages found
print("\nAges found:")
for age in combined_data.keys():
    print(f"- {age}")

Found 14 CSV files:
  - 2YO.csv
  - 3YO (2).csv
  - 3YO (3).csv
  - 3YO (5).csv
  - 3YO.csv
  ... and 9 more files
Error processing C:\Users\clint\Desktop\Lifecycle_RA\Data\webplotdigitizer\combined_digitizer_data.csv: time data "4YO" doesn't match format "%Y/%m/%d", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
First few rows of the combined data:
        Date  2YO           3YO           4YO           5YO
0 2015-01-16  NaN  92092.885611           NaN           NaN
1 2015-01-17  NaN           NaN  77306.593205  68710.037515
2 2015-01-20  NaN  91139.688310           NaN           NaN
3 2015-01-21  NaN           NaN  76394.885565           NaN
4 2015-01-22  NaN           NaN    

In [2]:
data = pd.read_csv(r'C:\Users\clint\Desktop\Lifecycle_RA\Data\webplotdigitizer\combined_digitizer_data.csv', parse_dates=['Date'])
data

Unnamed: 0,Date,2YO,3YO,4YO,5YO
0,2015-01-16,,92092.885611,,
1,2015-01-17,,,77306.593205,68710.037515
2,2015-01-20,,91139.688310,,
3,2015-01-21,,,76394.885565,
4,2015-01-22,,,,68174.740697
...,...,...,...,...,...
1957,2024-12-04,,,78460.054008,56157.326702
1958,2024-12-13,,94439.402692,,
1959,2024-12-15,,,77975.831321,55730.071389
1960,2024-12-23,,93938.089792,,


## Debugging Notes

Common issues that might be causing errors:

1. Path inconsistency: Make sure both the data source path and the output path exist
2. File not found: Ensure the CSV files are in the expected directory
3. File access: Check permissions for reading/writing files
4. File format: Ensure CSV files have the expected structure (date in first column, price in second)

In [3]:
import pandas as pd
import numpy as np

# First, let's convert the date column to datetime format
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

# Create a quarter column
data['Quarter'] = data['Date'].dt.to_period('Q')

# Define functions for statistics without extremes
def mean_no_extremes(x):
    if len(x) <= 2:
        return np.nan
    return x[(x != x.min()) & (x != x.max())].mean()

def median_no_extremes(x):
    if len(x) <= 2:
        return np.nan
    return x[(x != x.min()) & (x != x.max())].median()

# Define function to remove 2 standard deviation outliers
def remove_outliers(x):
    if len(x) < 2:
        return x
    mean = x.mean()
    std_dev = x.std()
    return x[(x >= mean - 2 * std_dev) & (x <= mean + 2 * std_dev)]

# Add no_outliers columns for each age group
for column in ['2YO', '3YO', '4YO', '5YO']:
    data[f'{column}_no_outliers'] = data.groupby('Quarter')[column].transform(lambda x: remove_outliers(x))

# Create statistics for all columns
results = {}

# Updated to include all requested columns
for column in ['2YO', '3YO', '4YO', '5YO']:
    # Calculate statistics for the current column
    quarterly_stats = data.groupby('Quarter')[column].agg([
        ('mean', 'mean'),
        ('median', 'median'),
        ('min', 'min'),
        ('max', 'max'),
        ('mean_no_extremes', mean_no_extremes),
        ('median_no_extremes', median_no_extremes),
        ('mean_no_outliers', lambda x: remove_outliers(x).mean()),
        ('median_no_outliers', lambda x: remove_outliers(x).median())
    ])
    
    # Rename columns to include the original column name
    quarterly_stats.columns = [f'{column}_{stat}' for stat in quarterly_stats.columns]
    
    # Store in dictionary
    results[column] = quarterly_stats

# Combine all results into a single dataframe
combined_stats = pd.concat(results.values(), axis=1)

# Reset index to make Quarter a column
combined_stats = combined_stats.reset_index()

# Convert Quarter period to string for better display
combined_stats['Quarter'] = combined_stats['Quarter'].astype(str)

# Display the resulting dataframe
combined_stats

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Unnamed: 0,Quarter,2YO_mean,2YO_median,2YO_min,2YO_max,2YO_mean_no_extremes,2YO_median_no_extremes,2YO_mean_no_outliers,2YO_median_no_outliers,3YO_mean,...,4YO_mean_no_outliers,4YO_median_no_outliers,5YO_mean,5YO_median,5YO_min,5YO_max,5YO_mean_no_extremes,5YO_median_no_extremes,5YO_mean_no_outliers,5YO_median_no_outliers
0,2015Q1,,,,,,,,,85439.935491,...,72386.581372,72031.303713,63983.307411,63180.581862,61399.666675,68710.037515,63830.229599,63180.581862,63983.307411,63180.581862
1,2015Q2,,,,,,,,,78351.575737,...,72136.576152,71669.583229,62883.816897,62474.017045,61243.480053,64862.357314,62859.659499,62474.017045,62883.816897,62474.017045
2,2015Q3,,,,,,,,,78051.486007,...,68889.526332,69038.410965,59029.743362,59007.864518,57376.476526,61010.34475,59006.362323,59007.864518,59029.743362,59007.864518
3,2015Q4,,,,,,,,,74165.322882,...,62600.422269,62558.216943,55578.482057,55489.366568,53597.53567,58014.515493,55537.110507,55489.366568,55578.482057,55489.366568
4,2016Q1,103330.240196,103194.344889,94182.306084,112679.862188,103325.068712,103194.344889,103330.240196,103194.344889,82002.393135,...,97835.752936,100803.562886,53108.598702,53895.581788,46121.276339,57429.64644,53197.474523,53895.581788,53108.598702,53895.581788
5,2016Q2,92995.250059,92932.845109,88887.091308,95331.666036,93075.783822,92932.845109,93173.865657,92951.913323,77317.451312,...,79803.625884,84397.886977,46436.168594,46404.35889,42517.653192,50530.357804,46431.288765,46404.35889,46436.168594,46404.35889
6,2016Q3,86839.834613,87339.205671,80577.533855,90325.947772,86966.024958,87339.205671,87112.108559,87370.189204,70968.074079,...,75223.401338,81845.179896,45953.304287,45965.284402,43716.908838,47770.445424,45968.832225,45965.284402,45953.304287,45965.284402
7,2016Q4,82476.32684,81555.671648,79970.934385,97714.816065,81415.235442,81555.671648,81304.135361,81554.439348,67459.283265,...,70037.632693,74493.204828,41784.166564,41251.588602,40047.200605,52283.820146,41346.032183,41251.588602,41284.183061,41135.013061
8,2017Q1,99125.16975,99239.388113,97318.145316,99992.105693,99068.470937,99087.568804,99231.465305,99341.451514,77410.027439,...,80990.43081,79616.20617,46279.295609,44863.025105,44279.841594,51953.666587,46171.209816,44863.025105,45948.14547,44831.417961
9,2017Q2,94886.849617,95175.457394,90321.37066,99338.439226,94890.917093,95175.457394,94886.849617,95175.457394,76274.916432,...,82762.485511,92834.909721,43535.434037,43862.456756,41558.794402,45258.166994,43543.624575,43862.456756,43535.434037,43862.456756


In [4]:
# Save the combined statistics with directory verification
output_path = r'C:\Users\clint\Desktop\Lifecycle_RA\Data\Processed\Combined_Csvs\webplot_extracted.csv'

# Ensure the output directory exists
output_dir = os.path.dirname(output_path)
if not os.path.exists(output_dir):
    print(f"Creating directory: {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

# Save the file
try:
    combined_stats.to_csv(output_path, index=False)
    print(f"Successfully saved to: {output_path}")
except Exception as e:
    print(f"Error saving file: {e}")

Successfully saved to: C:\Users\clint\Desktop\Lifecycle_RA\Data\Processed\Combined_Csvs\webplot_extracted.csv


In [5]:
# Parse data monthly instead of quarterly
data['Month'] = data['Date'].dt.to_period('M')

# Create statistics for all columns on a monthly basis
monthly_results = {}

for column in ['2YO', '3YO', '4YO', '5YO']:
    # Calculate statistics for the current column
    monthly_stats = data.groupby('Month')[column].agg([
        ('mean', 'mean'),
        ('median', 'median'),
        ('min', 'min'),
        ('max', 'max'),
        ('mean_no_extremes', mean_no_extremes),
        ('median_no_extremes', median_no_extremes),
        ('mean_no_outliers', lambda x: remove_outliers(x).mean()),
        ('median_no_outliers', lambda x: remove_outliers(x).median())
    ])
    
    # Rename columns to include the original column name
    monthly_stats.columns = [f'{column}_{stat}' for stat in monthly_stats.columns]
    
    # Store in dictionary
    monthly_results[column] = monthly_stats

# Combine all monthly results into a single dataframe
combined_monthly_stats = pd.concat(monthly_results.values(), axis=1)

# Reset index to make Month a column
combined_monthly_stats = combined_monthly_stats.reset_index()

# Convert Month period to string for better display
combined_monthly_stats['Month'] = combined_monthly_stats['Month'].astype(str)

# Display the resulting dataframe
combined_monthly_stats

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Unnamed: 0,Month,2YO_mean,2YO_median,2YO_min,2YO_max,2YO_mean_no_extremes,2YO_median_no_extremes,2YO_mean_no_outliers,2YO_median_no_outliers,3YO_mean,...,4YO_mean_no_outliers,4YO_median_no_outliers,5YO_mean,5YO_median,5YO_min,5YO_max,5YO_mean_no_extremes,5YO_median_no_extremes,5YO_mean_no_outliers,5YO_median_no_outliers
0,2015-01,,,,,,,,,89886.722411,...,75773.451957,75821.771234,67645.954327,67716.138018,66441.503756,68710.037515,67716.138018,67716.138018,67645.954327,67716.138018
1,2015-02,,,,,,,,,84091.056129,...,71430.510402,71035.162771,63718.979255,63626.038134,62308.970664,65381.080462,63655.956101,63626.038134,63718.979255,63626.038134
2,2015-03,,,,,,,,,82611.803806,...,71617.158240,71174.069171,61805.870956,61552.957332,61399.666675,62930.517111,61626.260488,61552.957332,61805.870956,61552.957332
3,2015-04,,,,,,,,,80832.386505,...,73911.138181,73930.043039,64308.367934,64366.104785,63838.836472,64862.357314,64280.215295,64366.104785,64308.367934,64366.104785
4,2015-05,,,,,,,,,75965.206351,...,71095.643548,70930.373732,62641.401771,62474.017045,62263.677775,63341.411077,62560.830443,62474.017045,62641.401771,62474.017045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2024-08,,,,,,,,,91078.327568,...,59053.168263,58920.244388,49549.111203,49549.111203,49264.274329,49833.948078,,,49549.111203,49549.111203
116,2024-09,,,,,,,,,89053.612115,...,58217.646764,58322.086951,46174.040848,46063.743625,43893.064687,48675.611454,46063.743625,46063.743625,46174.040848,46063.743625
117,2024-10,,,,,,,,,84415.043607,...,58654.396639,58663.891201,42363.626305,42462.369755,42057.901393,42570.607768,42462.369755,42462.369755,42363.626305,42462.369755
118,2024-11,,,,,,,,,94724.239567,...,,,55701.587702,55701.587702,55701.587702,55701.587702,,,,


In [6]:
# Save the monthly statistics with directory verification
monthly_output_path = r'C:\Users\clint\Desktop\Lifecycle_RA\Data\Processed\Combined_Csvs\webplot_monthly_extracted.csv'

# Ensure the output directory exists
monthly_output_dir = os.path.dirname(monthly_output_path)
if not os.path.exists(monthly_output_dir):
    print(f"Creating directory: {monthly_output_dir}")
    os.makedirs(monthly_output_dir, exist_ok=True)

# Save the file
try:
    combined_monthly_stats.to_csv(monthly_output_path, index=False)
    print(f"Successfully saved to: {monthly_output_path}")
except Exception as e:
    print(f"Error saving file: {e}")

Successfully saved to: C:\Users\clint\Desktop\Lifecycle_RA\Data\Processed\Combined_Csvs\webplot_monthly_extracted.csv
