In [18]:
import pandas as pd
import csv
import re

def read_and_parse_csv(file_path):
    rows = []
    
    with open(file_path, 'r') as file:
        reader = csv.reader(file, delimiter=',', quotechar='"')
        for row in reader:
            # Manually split each row using regex to handle quoted commas
            fixed_row = []
            for cell in row:
                # Check for quoted string with comma
                if '"' in cell:
                    fixed_row.extend(re.split(r'(?<!"),(?!")', cell))
                else:
                    fixed_row.append(cell)
            rows.append(fixed_row)
    
    # Convert the list of rows into a DataFrame
    data = pd.DataFrame(rows[1:], columns=rows[0])
    
    # Return the DataFrame
    return data

# Path to your CSV file
file_path = 'RAW_us_deaths_sample.csv'

# Read and parse the CSV file
data = read_and_parse_csv(file_path)

# Check the shape of the DataFrame
print(data.shape)

# Display the first few rows to verify
print(data.head())

# Display the first few entries of a sample row
print(data.iloc[0, :20].tolist())


(9, 1155)
  Province_State   Admin2       UID iso2 iso3 code3    FIPS Country_Region  \
0        Alabama  Autauga  84001001   US  USA   840  1001.0             US   
1        Alabama  Baldwin  84001003   US  USA   840  1003.0             US   
2        Alabama  Barbour  84001005   US  USA   840  1005.0             US   
3        Alabama     Bibb  84001007   US  USA   840  1007.0             US   
4        Alabama   Blount  84001009   US  USA   840  1009.0             US   

           Lat                     Long_  ... 2/28/23 3/1/23 3/2/23 3/3/23  \
0  32.53952745     -86.64408227,"Autauga  ...     230    232    232    232   
1  30.72774991     -87.72207058,"Baldwin  ...     724    726    726    726   
2    31.868263      -85.3871286,"Barbour  ...     103    103    103    103   
3  32.99642064  -87.12511459999996,"Bibb  ...     109    109    109    109   
4  33.98210918      -86.56790593,"Blount  ...     261    261    261    261   

  3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23  
0    2

In [21]:
# Function to aggregate data by year
def aggregate_by_year(data):
    # Convert columns to datetime where possible
    date_columns = data.columns[11:]  # Skipping non-date columns
    data[date_columns] = data[date_columns].apply(pd.to_numeric, errors='coerce')
    data[date_columns] = data[date_columns].fillna(0)

    # Extract year from date columns and group by year
    yearly_data = data[date_columns].groupby(by=[col[-2:] for col in date_columns], axis=1).sum()

    # Merge non-date columns with the yearly aggregated data
    result = pd.concat([data.iloc[:, :11], yearly_data], axis=1)

    return result

# Aggregate the data by year
yearly_data = aggregate_by_year(data)

# Check the shape of the DataFrame
print(yearly_data.shape)

# Display the first few rows to verify
print(yearly_data.head())


(9, 16)
  Province_State   Admin2       UID iso2 iso3 code3    FIPS Country_Region  \
0        Alabama  Autauga  84001001   US  USA   840  1001.0             US   
1        Alabama  Baldwin  84001003   US  USA   840  1003.0             US   
2        Alabama  Barbour  84001005   US  USA   840  1005.0             US   
3        Alabama     Bibb  84001007   US  USA   840  1007.0             US   
4        Alabama   Blount  84001009   US  USA   840  1009.0             US   

           Lat                     Long_ Combined_Key       20        21  \
0  32.53952745     -86.64408227,"Autauga      Alabama   5589.0   41785.0   
1  30.72774991     -87.72207058,"Baldwin      Alabama  12271.0  136367.0   
2    31.868263      -85.3871286,"Barbour      Alabama   2035.0   22337.0   
3  32.99642064  -87.12511459999996,"Bibb      Alabama   2632.0   25347.0   
4  33.98210918      -86.56790593,"Blount      Alabama   3855.0   52469.0   

         22       23   on  
0   77553.0  15658.0  0.0  
1  248554.

In [22]:
# Function to aggregate data by month
def aggregate_by_month(data):
    # Convert columns to datetime where possible
    date_columns = data.columns[11:]  # Skipping non-date columns
    data[date_columns] = data[date_columns].apply(pd.to_numeric, errors='coerce')
    data[date_columns] = data[date_columns].fillna(0)
    
    # Extract month-year and group by it
    month_year_columns = pd.to_datetime(date_columns).to_series().dt.to_period('M')
    monthly_data = data.groupby(month_year_columns, axis=1).sum()
    
    # Convert PeriodIndex to string
    monthly_data.columns = monthly_data.columns.astype(str)
    
    # Merge non-date columns with the monthly aggregated data
    result = pd.concat([data.iloc[:, :11], monthly_data], axis=1)
    
    return result

# Aggregate the data by month
monthly_data = aggregate_by_month(data)

# Check the shape of the DataFrame
print(monthly_data.shape)

# Display the first few rows to verify
print(monthly_data.head())


ParserError: Unknown string format: Population