In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

###  Load Data


In [26]:
dataset_2004_2017 = pd.read_csv('C:/Users/vaish/OneDrive/Desktop/python projects/Drug-addiction-analysis/Data/Multiple Cause of Death, 2004-2017.csv')
dataset_2018_2023 = pd.read_csv('C:/Users/vaish/OneDrive/Desktop/python projects/Drug-addiction-analysis/Data/Multiple Cause of Death, 2018-2023.csv')

### Standardize column names

In [27]:
dataset_2004_2017 = dataset_2004_2017.rename(columns={
    'Race': 'Single Race 6',
    'Race Code': 'Single Race 6 Code'
})

### Combine datasets

In [29]:
opioid_deaths = pd.concat([dataset_2005_2017, dataset_2018_2023], 
                          ignore_index=True)


#Sort by State, Year, Race, Age Group
opioid_deaths = opioid_deaths.sort_values([
    'State', 'Year', 'Single Race 6', 'Ten-Year Age Groups'
]).reset_index(drop=True)

In [31]:
# To check year range
print(f"Year range: {opioid_deaths['Year'].min()} - {opioid_deaths['Year'].max()}")

# To check for duplicate rows
duplicates = opioid_deaths.groupby([
    'State', 'Year', 'Single Race 6', 'Ten-Year Age Groups'
]).size().reset_index(name='count')
duplicates = duplicates[duplicates['count'] > 1]
print(f"Duplicate rows found: {len(duplicates)}")


# To verify record counts
year_counts = opioid_deaths['Year'].value_counts().sort_index()
print("Records per year:")
print(year_counts)


Year range: 2004.0 - 2023.0
Duplicate rows found: 1950
Records per year:
Year
2004.0    334
2005.0    343
2006.0    369
2007.0    387
2008.0    382
2009.0    389
2010.0    382
2011.0    399
2012.0    418
2013.0    424
2014.0    435
2015.0    467
2016.0    486
2017.0    515
2018.0    613
2019.0    651
2020.0    745
2021.0    812
2022.0    854
2023.0    881
Name: count, dtype: int64


In [25]:
# 1. Race categories - verify same races appear throughout timespan
race_by_year = opioid_deaths.groupby('Year')['Single Race 6'].nunique()
print("Unique races per year:")
print(race_by_year)

# 2. Age groups - confirm consistent age groupings  
age_by_year = opioid_deaths.groupby('Year')['Ten-Year Age Groups'].nunique()
print("Unique age groups per year:")
print(age_by_year)

# 3. Geographic coverage - check for missing states/years
state_year_coverage = opioid_deaths.groupby('Year')['State'].nunique()
print("States reporting per year:")
print(state_year_coverage)

# 4. "Unreliable" patterns - document suppression trends
unreliable_pattern = opioid_deaths.groupby('Year').apply(
    lambda x: (x['Crude Rate'] == 'Unreliable').sum()
)
print("Unreliable entries per year:")
print(unreliable_pattern)

# Find which race appears in newer data but not older
races_old = set(dataset_2004_2017['Single Race 6'].unique())
races_new = set(dataset_2018_2023['Single Race 6'].unique())

print("Races in 2004-2017:", races_old)
print("Races in 2018-2023:", races_new)
print("Missing from older data:", races_new - races_old)
print("Missing from newer data:", races_old - races_new)

Unique races per year:
Year
2004.0    4
2005.0    4
2006.0    4
2007.0    4
2008.0    4
2009.0    4
2010.0    4
2011.0    4
2012.0    4
2013.0    4
2014.0    4
2015.0    4
2016.0    4
2017.0    4
2018.0    5
2019.0    4
2020.0    5
2021.0    5
2022.0    5
2023.0    5
Name: Single Race 6, dtype: int64
Unique age groups per year:
Year
2004.0     8
2005.0     8
2006.0     8
2007.0     8
2008.0     8
2009.0     8
2010.0     8
2011.0     8
2012.0     8
2013.0     8
2014.0     8
2015.0     8
2016.0     8
2017.0     8
2018.0     8
2019.0     8
2020.0     9
2021.0     9
2022.0    10
2023.0    10
Name: Ten-Year Age Groups, dtype: int64
States reporting per year:
Year
2004.0    50
2005.0    48
2006.0    50
2007.0    50
2008.0    51
2009.0    51
2010.0    50
2011.0    50
2012.0    50
2013.0    50
2014.0    51
2015.0    51
2016.0    51
2017.0    51
2018.0    51
2019.0    51
2020.0    51
2021.0    51
2022.0    51
2023.0    51
Name: State, dtype: int64
Unreliable entries per year:
Year
2004.0     92

  unreliable_pattern = opioid_deaths.groupby('Year').apply(


In [None]:
# Collect Population Data for All Years (1999-2023)
# This will get population data to match your death data

import pandas as pd
import requests
import time

# PUT YOUR API KEY HERE (same as before)
MY_API_KEY = "a2fa499d96110671df7e4e2b0ac6db3e224e81da"  # Replace with your actual key

def collect_population_all_years():
    """Get population data for 1999-2023 to match death data"""
    
    print("📊 Collecting population data for all years 1999-2023...")
    print("This might take a few minutes...")
    
    all_data = []
    
    # Different Census datasets for different year ranges
    year_ranges = [
        # ACS 5-year estimates (2009-2023)
        {"years": list(range(2009, 2024)), "dataset": "acs/acs5", "name": "ACS 5-year"},
        
        # ACS 1-year estimates for some years (2005-2023, but not all years available)
        {"years": [2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005], 
         "dataset": "acs/acs1", "name": "ACS 1-year"},
        
        # Population estimates (available for most years)
        {"years": list(range(2010, 2024)), "dataset": "pep/population", "name": "Population Estimates"}
    ]
    
    # Try ACS 5-year first (most reliable)
    print("\n🔍 Trying ACS 5-year estimates (2009-2023)...")
    
    for year in range(2009, 2024):
        print(f"  📅 Getting {year}...", end=" ")
        
        try:
            url = f"https://api.census.gov/data/{year}/acs/acs5"
            
            # Basic variables we need
            variables = [
                "B01003_001E",  # Total Population
                "B19013_001E",  # Median Household Income  
                "B23025_005E",  # Unemployed
                "B23025_002E",  # Labor Force
                "B25003_001E",  # Housing Units
                "NAME"
            ]
            
            params = {
                "get": ",".join(variables),
                "for": "state:*",
                "key": MY_API_KEY
            }
            
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            
            data = response.json()
            df = pd.DataFrame(data[1:], columns=data[0])
            df['year'] = year
            df['data_source'] = f'ACS_5yr_{year}'
            
            all_data.append(df)
            print("✅")
            
            time.sleep(0.5)  # Be nice to the API
            
        except Exception as e:
            print(f"❌ {e}")
            continue
    
    # For earlier years (1999-2008), try Population Estimates
    print(f"\n🔍 Trying Population Estimates for earlier years...")
    
    # Population estimates are available but with fewer variables
    for year in range(2010, 2024):  # Start with what's most likely to work
        if year >= 2009:  # We already have ACS data for these
            continue
            
        print(f"  📅 Getting {year}...", end=" ")
        
        try:
            # Try intercensal estimates
            url = f"https://api.census.gov/data/{year}/pep/population"
            
            params = {
                "get": "POP,NAME",
                "for": "state:*",
                "key": MY_API_KEY
            }
            
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            
            data = response.json()
            df = pd.DataFrame(data[1:], columns=data[0])
            df['year'] = year
            df['data_source'] = f'PopEst_{year}'
            
            # Rename to match our standard
            df.rename(columns={'POP': 'B01003_001E'}, inplace=True)
            
            all_data.append(df)
            print("✅")
            
            time.sleep(0.5)
            
        except Exception as e:
            print(f"❌ {e}")
            continue
    
    if all_data:
        # Combine all data
        combined_df = pd.concat(all_data, ignore_index=True)
        
        # Standardize column names
        combined_df.rename(columns={
            'B01003_001E': 'total_population',
            'B19013_001E': 'median_income',
            'B23025_005E': 'unemployed', 
            'B23025_002E': 'labor_force',
            'B25003_001E': 'housing_units',
            'NAME': 'state_name'
        }, inplace=True)
        
        # Convert to numeric
        numeric_cols = ['total_population', 'median_income', 'unemployed', 'labor_force', 'housing_units']
        for col in numeric_cols:
            if col in combined_df.columns:
                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
        
        # Calculate unemployment rate where possible
        if 'unemployed' in combined_df.columns and 'labor_force' in combined_df.columns:
            combined_df['unemployment_rate'] = (combined_df['unemployed'] / combined_df['labor_force']) * 100
        
        # Clean up state names
        if 'state_name' in combined_df.columns:
            combined_df['state_name'] = combined_df['state_name'].str.strip()
        
        # Sort by year and state
        combined_df = combined_df.sort_values(['year', 'state_name'])
        
        print(f"\n✅ SUCCESS! Collected {len(combined_df)} records")
        print(f"📅 Years: {combined_df['year'].min()} - {combined_df['year'].max()}")
        print(f"🏛️ States per year: ~{combined_df.groupby('year').size().mean():.0f}")
        
        # Save the data
        combined_df.to_csv('population_data_all_years.csv', index=False)
        print(f"💾 Saved to: population_data_all_years.csv")
        
        # Show data coverage
        print(f"\n📊 Data Coverage by Year:")
        coverage = combined_df.groupby('year').agg({
            'state_name': 'count',
            'total_population': lambda x: x.notna().sum()
        }).reset_index()
        
        for _, row in coverage.iterrows():
            year = int(row['year'])
            states = int(row['state_name']) 
            pop_data = int(row['total_population'])
            print(f"  {year}: {states} states, {pop_data} with population data")
        
        return combined_df
    
    else:
        print("❌ No population data collected")
        return pd.DataFrame()

def fill_missing_years():
    """For years we can't get from Census API, we'll interpolate"""
    
    print(f"\n🔧 For missing years (1999-2008), we can:")
    print(f"  1. Use linear interpolation between known data points")
    print(f"  2. Apply average growth rates")  
    print(f"  3. Use decennial census data as anchors (2000, 2010)")
    
    # This is a placeholder - we'd implement interpolation logic here
    print(f"  💡 For now, let's see what years we successfully collected...")

def main():
    """Main function"""
    
    print("🚀 COLLECTING POPULATION DATA FOR ALL YEARS")
    print("=" * 60)
    
    # Check API key
    if "YOUR_API_KEY_HERE" in MY_API_KEY:
        print("❌ Please update your API key in the script first!")
        return
    
    # Collect data
    pop_data = collect_population_all_years()
    
    if not pop_data.empty:
        
        # Show what we got
        print(f"\n📋 SUMMARY:")
        print(f"  ✅ Total records: {len(pop_data)}")
        
        # Check for gaps
        years_collected = sorted(pop_data['year'].unique())
        years_needed = list(range(1999, 2024))
        missing_years = [y for y in years_needed if y not in years_collected]
        
        if missing_years:
            print(f"  ⚠️ Missing years: {missing_years}")
            fill_missing_years()
        else:
            print(f"  🎉 All years collected!")
            
        print(f"\n✅ Ready to combine with your death data!")
        
    else:
        print(f"❌ No data collected. Check your API key and connection.")

if __name__ == "__main__":
    main()

🚀 COLLECTING POPULATION DATA FOR ALL YEARS
📊 Collecting population data for all years 1999-2023...
This might take a few minutes...

🔍 Trying ACS 5-year estimates (2009-2023)...
  📅 Getting 2009... ❌ 400 Client Error:  for url: https://api.census.gov/data/2009/acs/acs5?get=B01003_001E%2CB19013_001E%2CB23025_005E%2CB23025_002E%2CB25003_001E%2CNAME&for=state%3A%2A&key=a2fa499d96110671df7e4e2b0ac6db3e224e81da
  📅 Getting 2010... ❌ 400 Client Error:  for url: https://api.census.gov/data/2010/acs/acs5?get=B01003_001E%2CB19013_001E%2CB23025_005E%2CB23025_002E%2CB25003_001E%2CNAME&for=state%3A%2A&key=a2fa499d96110671df7e4e2b0ac6db3e224e81da
  📅 Getting 2011... ✅
  📅 Getting 2012... ✅
  📅 Getting 2013... ✅
  📅 Getting 2014... ✅
  📅 Getting 2015... ✅
  📅 Getting 2016... ✅
  📅 Getting 2017... ✅
  📅 Getting 2018... ✅
  📅 Getting 2019... ✅
  📅 Getting 2020... ✅
  📅 Getting 2021... ✅
  📅 Getting 2022... ✅
  📅 Getting 2023... ✅

🔍 Trying Population Estimates for earlier years...

✅ SUCCESS! Collecte