In [1]:
import pandas as pd
from tqdm import tqdm
import os
from pathlib import Path

In [2]:
def spending_with_districts(project_name="WeThePeopleAudit"):
    """Locate the project root directory and return it as a ``Path``.

    The lookup proceeds in three stages and stops at the first hit:

    1. the current working directory itself,
    2. each ancestor of the current working directory, nearest first,
    3. a recursive walk of the user's home directory.

    Args:
        project_name: Name of the directory to look for.

    Returns:
        Path to the first directory found whose name equals ``project_name``.

    Raises:
        FileNotFoundError: If no directory with that name is found.
    """
    start = Path.cwd()

    # Path.parents does not include the path itself, so the working
    # directory is put in front of its ancestors for a single scan.
    for candidate in (start, *start.parents):
        if candidate.name == project_name:
            return candidate

    # Fallback: search everything under the home directory.
    for folder, subfolders, _files in os.walk(Path.home()):
        if project_name in subfolders:
            return Path(folder) / project_name

    # Nothing matched anywhere.
    raise FileNotFoundError(f"Can't find the '{project_name}' folder anywhere")

# Load the district-annotated spending CSV, preview it, and print a spending
# breakdown for every district column that is present in the file.
try:
    project_folder = spending_with_districts()
    print(f"Found project at: {project_folder}")

    # Load the data
    data_file = project_folder / "data" / "updated_spending_with_districts.csv"
    print(f"Loading: {data_file}")
    df = pd.read_csv(data_file, low_memory=False)

    print(f"\nLoaded {len(df)} rows with {len(df.columns)} columns")
    print("\nColumns:")
    for col in df.columns:
        print(f"- {col}")

    print("\nFirst few rows:")
    print(df.head())

    print("\nLast few rows:")
    print(df.tail())

    # errors='coerce' turns unparseable amounts into NaN, which the sums and
    # means below skip.
    df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
    total_cash = df['amount'].sum()
    total_rows = len(df)

    # (column name, section heading) for each breakdown; one loop replaces
    # three copies of the same report code.
    district_sections = (
        ('Congressional_District', "Congressional District Breakdown"),
        ('Senate_District', "Senate District Breakdown"),
        ('House_District', "House District Breakdown"),
    )

    for district_col, heading in district_sections:
        if district_col not in df.columns:
            continue

        print(f"\n{heading}:")

        # One grouped pass yields row count, total and mean per district.
        # 'size' counts every row of the group (including rows whose amount
        # is NaN), matching what value_counts() on the district column gives.
        stats = (
            df.groupby(district_col)['amount']
            .agg(records='size', spending='sum', average='mean')
            .sort_values('spending', ascending=False)
        )

        # Districts are listed from highest to lowest total spending.
        for row in stats.itertuples():
            records_pct = row.records / total_rows * 100
            money_pct = row.spending / total_cash * 100

            print(f"- {row.Index}: {row.records} records ({records_pct:.2f}%) | ${row.spending:,.2f} ({money_pct:.2f}%) | ${row.average:,.2f} avg")

except FileNotFoundError as e:
    # Raised by the project-root lookup or by read_csv when the file is absent.
    print(f"Error: {e}")
    print("We're currently in:", Path.cwd())
    print("Check your project folder structure or give me the full path to the CSV.")

Found project root: /Users/phatngo/Desktop/WeThePeopleAudit
Reading data from: /Users/phatngo/Desktop/WeThePeopleAudit/data/updated_spending_with_districts.csv

DataFrame loaded successfully with 4754026 rows and 28 columns.

Column names:
- base_id
- budget_fiscal_year
- fiscal_period
- date
- cabinet_secretariat
- department
- appropriation_type
- appropriation_name
- object_class
- object_code
- encumbrance_id
- zip_code
- amount
- fund
- fund_code
- appropriation_code
- object
- department_code
- vendor
- vendor_id
- payment_id
- payment_method
- state
- city
- create_date
- Congressional_District
- Senate_District
- House_District

First 5 rows:
                base_id  budget_fiscal_year  fiscal_period  \
0  B0000000000360065574                2022              7   
1  B0000000000360065575                2022              7   
2  B0000000000360067383                2022              7   
3  B0000000000360072140                2022              7   
4  B0000000000360072141        

In [44]:
# Load the district-annotated spending CSV into `df`.
# NOTE(review): depends on spending_with_districts() having been defined by
# another cell; on a fresh kernel that cell must run first.
project_root = spending_with_districts()  
file_path = project_root / "data" / "updated_spending_with_districts.csv"  
# low_memory=False makes pandas read the file in one pass instead of in
# internal chunks, which avoids mixed-dtype column inference.
df = pd.read_csv(file_path, low_memory=False)

In [49]:
# Export a five-row sample (with header) of the rows whose House_District is
# not 'Other state'. The file is written to the current working directory.
# NOTE(review): rows with a missing House_District compare unequal to
# 'Other state' and are therefore kept by this mask; confirm that is intended.
mask = df['House_District'] != 'Other state'
filtered_df = df.loc[mask]

filtered_df.head(5).to_csv("sample_with_header.csv", index=False, header=True)

In [18]:
# Project-root lookup used by the chunked row count of combined_spending.csv.
def spending_data_root(project_name="WeThePeopleAudit"):
    """Return the path of the directory named ``project_name``.

    Checks the current working directory, then its ancestors (nearest
    first), and finally walks the user's home directory.

    Args:
        project_name: Directory name to search for.

    Returns:
        ``Path`` of the first matching directory.

    Raises:
        FileNotFoundError: If no directory with that name is found.
    """
    cwd = Path.cwd()

    # Working directory first, then every ancestor up to the root.
    lineage = [cwd, *cwd.parents]
    match = next((folder for folder in lineage if folder.name == project_name), None)
    if match is not None:
        return match

    # Not on the current path: scan the home directory tree.
    for base, child_dirs, _ in os.walk(Path.home()):
        if project_name in child_dirs:
            return Path(base) / project_name

    # Every strategy failed.
    raise FileNotFoundError(f"Can't find the '{project_name}' folder anywhere")

# Count the rows of combined_spending.csv by streaming it in chunks of
# 100,000 rows instead of loading the whole file at once.
try:
    project = spending_data_root()
    print(f"Found project at: {project}")
    data_file = project / "data" / "combined_spending.csv"
    print(f"Loading from: {data_file}")

    rows = 0

    print("Reading file in chunks...")
    # Using the chunk reader as a context manager closes the underlying file
    # handle even when a chunk fails to parse part-way through the file.
    with pd.read_csv(data_file, chunksize=100000, low_memory=False) as reader:
        for chunk in reader:
            rows += len(chunk)
            print(f"Read {rows} rows...")

    print(f"\nTotal rows: {rows}")

except FileNotFoundError as e:
    # Raised by the project-root lookup or by read_csv when the file is absent.
    print(f"Error: {e}")
    print("We're in:", Path.cwd())
    print("Check your project structure or give me the full path.")
except Exception as e:
    # Best-effort cell: any other failure is reported rather than re-raised.
    print(f"Something went wrong: {e}")

Found project root: /Users/phatngo/Desktop/WeThePeopleAudit
Reading data from: /Users/phatngo/Desktop/WeThePeopleAudit/data/combined_spending.csv
Processing file in chunks...
Processed 100000 rows so far...
Processed 200000 rows so far...
Processed 300000 rows so far...
Processed 400000 rows so far...
Processed 500000 rows so far...
Processed 600000 rows so far...
Processed 700000 rows so far...
Processed 800000 rows so far...
Processed 900000 rows so far...
Processed 1000000 rows so far...
Processed 1100000 rows so far...
Processed 1200000 rows so far...
Processed 1300000 rows so far...
Processed 1400000 rows so far...
Processed 1500000 rows so far...
Processed 1600000 rows so far...
Processed 1700000 rows so far...
Processed 1800000 rows so far...
Processed 1900000 rows so far...
Processed 2000000 rows so far...
Processed 2100000 rows so far...
Processed 2200000 rows so far...
Processed 2300000 rows so far...
Processed 2400000 rows so far...
Processed 2500000 rows so far...
Processed

In [3]:
def spending_politicians(project_name="WeThePeopleAudit"):
    """Find the directory called ``project_name`` and return its path.

    The search climbs from the current working directory up to the
    filesystem root; if that fails, the user's home directory is walked
    recursively.

    Args:
        project_name: Name of the directory to find.

    Returns:
        ``Path`` of the first directory whose name matches.

    Raises:
        FileNotFoundError: If no matching directory is found.
    """
    # Climb from the working directory towards the root.
    current = Path.cwd()
    while True:
        if current.name == project_name:
            return current
        parent = current.parent
        if parent == current:
            # The root is its own parent: nothing further up to inspect.
            break
        current = parent

    # Nothing on the way up, so look through the home directory tree.
    for top, dirnames, _ in os.walk(Path.home()):
        if project_name in dirnames:
            return Path(top) / project_name

    # All lookups exhausted.
    raise FileNotFoundError(f"Can't find the '{project_name}' folder anywhere")

# Load the spending data merged with politician/party columns, preview it,
# and print a spending breakdown for every district column that is present.
try:
    project = spending_politicians()
    print(f"Found project at: {project}")
    data_file = project / "data" / "merged_spending_politicians_with_party.csv"
    print(f"Reading: {data_file}")

    df = pd.read_csv(data_file, low_memory=False)

    print(f"\nLoaded {len(df)} rows with {len(df.columns)} columns")
    print("\nColumns:")
    for col in df.columns:
        print(f"- {col}")

    print("\nFirst few rows:")
    print(df.head())

    print("\nLast few rows:")
    print(df.tail())

    # errors='coerce' turns unparseable amounts into NaN, which the sums and
    # means below skip.
    df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
    total = df['amount'].sum()
    total_rows = len(df)

    # (column name, section heading) for each breakdown; one loop replaces
    # three copies of the same report code.
    district_sections = (
        ('Congressional_District', "Congressional Districts"),
        ('Senate_District', "Senate Districts"),
        ('House_District', "House Districts"),
    )

    for district_col, heading in district_sections:
        if district_col not in df.columns:
            continue

        print(f"\n{heading}:")

        # One grouped pass yields row count, total and mean per district.
        # 'size' counts every row of the group (including rows whose amount
        # is NaN), matching what value_counts() on the district column gives.
        stats = (
            df.groupby(district_col)['amount']
            .agg(records='size', cash='sum', average='mean')
            .sort_values('cash', ascending=False)
        )

        # Districts are listed from highest to lowest total spending.
        for row in stats.itertuples():
            pct_rows = row.records / total_rows * 100
            pct_cash = row.cash / total * 100

            print(f"- {row.Index}: {row.records} records ({pct_rows:.2f}%) | ${row.cash:,.2f} ({pct_cash:.2f}%) | ${row.average:,.2f} avg")

except FileNotFoundError as e:
    # Raised by the project-root lookup or by read_csv when the file is absent.
    print(f"Error: {e}")
    print("We're in:", Path.cwd())
    print("Check your project structure or use a full path to the file.")

Found project root: /Users/phatngo/Desktop/WeThePeopleAudit
Reading data from: /Users/phatngo/Desktop/WeThePeopleAudit/data/merged_spending_politicians_with_party.csv

DataFrame loaded successfully with 13790184 rows and 33 columns.

Column names:
- base_id
- budget_fiscal_year
- fiscal_period
- date
- cabinet_secretariat
- department
- appropriation_type
- appropriation_name
- object_class
- object_code
- encumbrance_id
- zip_code
- amount
- fund
- fund_code
- appropriation_code
- object
- department_code
- vendor
- vendor_id
- payment_id
- payment_method
- state
- city
- create_date
- Congressional_District
- Senate_District
- House_District
- people_id
- name
- party
- role
- political_party

First 5 rows:
                base_id  budget_fiscal_year  fiscal_period  \
0  B0000000000360065574                2022              7   
1  B0000000000360065575                2022              7   
2  B0000000000360067383                2022              7   
3  B0000000000360072140          

In [None]:
# Export a five-row sample (with header) of the rows whose House_District is
# not 'Other state'. The file is written to the current working directory.
# NOTE(review): `df` is whichever frame the most recently executed load cell
# assigned, and another cell in this notebook writes the same file name, so
# the sample on disk depends on execution order; confirm which frame is meant.
# NOTE(review): rows with a missing House_District compare unequal to
# 'Other state' and are therefore kept by this mask; confirm that is intended.
mask = df['House_District'] != 'Other state'
filtered_df = df.loc[mask]

filtered_df.head(5).to_csv("sample_with_header.csv", index=False, header=True)