In [11]:
import pandas as pd

# State-level table from the vera-institute/incarceration-trends GitHub
# repository, read over HTTPS (so this cell needs network access).
INCARCERATION_TRENDS_STATE_URL = (
    "https://raw.githubusercontent.com/vera-institute/incarceration-trends/"
    "refs/heads/main/incarceration_trends_state.csv"
)

Prison_pop = pd.read_csv(INCARCERATION_TRENDS_STATE_URL)
# Bare trailing expression so the notebook renders the loaded frame.
Prison_pop

Unnamed: 0,year,state_abbr,state_name,state_fips,region,division,total_pop_15to64,female_pop_15to64,male_pop_15to64,aapi_pop_15to64,...,aapi_female_prison_adm_rate,black_female_prison_adm_rate,latinx_female_prison_adm_rate,native_female_prison_adm_rate,white_female_prison_adm_rate,aapi_male_prison_adm_rate,black_male_prison_adm_rate,latinx_male_prison_adm_rate,native_male_prison_adm_rate,white_male_prison_adm_rate
0,2024,AK,Alaska,2,West,Pacific,484376.0,224281.0,260095.0,44427.0,...,,,,,,,,,,
1,2023,AK,Alaska,2,West,Pacific,484376.0,224281.0,260095.0,44427.0,...,,,,,,,,,,
2,2022,AK,Alaska,2,West,Pacific,484376.0,224281.0,260095.0,44427.0,...,,,,,,,,,,
3,2021,AK,Alaska,2,West,Pacific,486916.0,225591.0,261325.0,44109.0,...,,,,,,,,,,
4,2020,AK,Alaska,2,West,Pacific,488623.0,226819.0,261804.0,43920.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3526,1962,WY,Wyoming,56,West,Mountain,,,,,...,,,,,,,,,,
3527,1961,WY,Wyoming,56,West,Mountain,,,,,...,,,,,,,,,,
3528,1960,WY,Wyoming,56,West,Mountain,,,,,...,,,,,,,,,,
3529,1955,WY,Wyoming,56,West,Mountain,,,,,...,,,,,,,,,,


In [12]:
# ------------------------------------------------------------------
# 2) Define columns to keep
# ------------------------------------------------------------------
# Identification columns, working-age (15-64) population counts, and the
# jail / prison / total incarceration counts and rates. Every other
# column of the raw table is discarded.
keep_cols = [
    "year",
    "state_abbr",
    "state_name",
    "state_fips",
    "region",
    "division",
    "total_pop_15to64",
    "female_pop_15to64",
    "male_pop_15to64",
    "aapi_pop_15to64",
    "black_pop_15to64",
    "latinx_pop_15to64",
    "native_pop_15to64",
    "white_pop_15to64",
    "aapi_female_pop_15to64",
    "black_female_pop_15to64",
    "latinx_female_pop_15to64",
    "native_female_pop_15to64",
    "white_female_pop_15to64",
    "aapi_male_pop_15to64",
    "black_male_pop_15to64",
    "latinx_male_pop_15to64",
    "native_male_pop_15to64",
    "white_male_pop_15to64",
    "total_incarceration",
    "total_incarceration_rate",
    "held_in_jail",
    "total_jail_pop",
    "female_jail_pop",
    "male_jail_pop",
    "total_jail_pop_rate",
    "female_jail_pop_rate",
    "male_jail_pop_rate",
    "total_prison_pop",
    "female_prison_pop",
    "male_prison_pop",
    "total_prison_pop_rate",
    "female_prison_pop_rate",
    "male_prison_pop_rate",
]

# ------------------------------------------------------------------
# 3) + 4) Subset to the selected columns and filter to years 2015–2022
# ------------------------------------------------------------------
# between() is inclusive on both ends. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
Prison_pop_reduced = Prison_pop.loc[
    Prison_pop["year"].between(2015, 2022), keep_cols
].copy()

# NOTE: 'held_in_jail' is NOT added into 'total_jail_pop' and is NOT
# dropped here; both columns are written out unchanged.

# ------------------------------------------------------------------
# 7) Save the final processed dataset
# ------------------------------------------------------------------
output_path = "Prison_population_2015_2022.csv"
Prison_pop_reduced.to_csv(output_path, index=False)

# ------------------------------------------------------------------
# 8) Confirmation
# ------------------------------------------------------------------
# The message describes what this cell actually did (no merge/drop of
# 'held_in_jail' takes place).
print(
    "Processed data (years 2015–2022); 'held_in_jail' and 'total_jail_pop' "
    f"kept as separate columns. Saved to:\n{output_path}"
)
print(Prison_pop_reduced[["year", "state_abbr", "total_jail_pop"]].head())

Processed data (years 2015–2022) with held_in_jail summed into total_jail_pop and held_in_jail dropped. Saved to:
Prison_population_2015_2022.csv
   year state_abbr  total_jail_pop
2  2022         AK          2934.0
3  2021         AK          3076.0
4  2020         AK          2784.0
5  2019         AK          2693.0
6  2018         AK          2441.0


In [13]:
# 2) Create total female and male incarcerated population columns
#    (jail + prison; the sum is NaN when either component is missing).
Prison_pop_reduced["female_total"] = (
    Prison_pop_reduced["female_jail_pop"] + Prison_pop_reduced["female_prison_pop"]
)
Prison_pop_reduced["male_total"] = (
    Prison_pop_reduced["male_jail_pop"] + Prison_pop_reduced["male_prison_pop"]
)

# 3) Compute race-specific population percentages: each group's share of
#    the total 15-64 population, in percent, rounded to 2 decimals.
races = ["aapi", "black", "latinx", "native", "white"]
for race in races:
    Prison_pop_reduced[f"{race}_pct"] = (
        Prison_pop_reduced[f"{race}_pop_15to64"]
        / Prison_pop_reduced["total_pop_15to64"]
        * 100
    ).round(2)

# 4) Reorder to insert new columns next to their related fields.
#    Maps "anchor column" -> "derived column placed right after it".
insert_after = {
    "female_prison_pop": "female_total",
    "male_prison_pop": "male_total",
}
insert_after.update({f"{race}_pop_15to64": f"{race}_pct" for race in races})
derived_cols = set(insert_after.values())

new_order = []
for col in Prison_pop_reduced.columns:
    # The derived columns already sit at the end of the frame; they are
    # placed via their anchor below, so skipping them here keeps each
    # name in new_order exactly once (no duplicate columns in df, and
    # re-running the cell gives the same result).
    if col in derived_cols:
        continue
    new_order.append(col)
    if col in insert_after:
        new_order.append(insert_after[col])

# .copy() so df is an independent frame that can be modified later
# without a SettingWithCopyWarning.
df = Prison_pop_reduced[new_order].copy()

# 5) Nothing is written to disk in this cell; the result lives in `df`.

# 6) Confirmation
print(
    "Created 'female_total', 'male_total', and race pct columns "
    "(held in `df`; not saved to disk in this cell)."
)
print(df.head(5))

Created 'female_total', 'male_total', and race pct columns; saved to:
Prison_population_2015_2022.csv
   year state_abbr state_name  state_fips region division  total_pop_15to64  \
2  2022         AK     Alaska           2   West  Pacific          484376.0   
3  2021         AK     Alaska           2   West  Pacific          486916.0   
4  2020         AK     Alaska           2   West  Pacific          488623.0   
5  2019         AK     Alaska           2   West  Pacific          493016.0   
6  2018         AK     Alaska           2   West  Pacific          497876.0   

   female_pop_15to64  male_pop_15to64  aapi_pop_15to64  ...  \
2           224281.0         260095.0          44427.0  ...   
3           225591.0         261325.0          44109.0  ...   
4           226819.0         261804.0          43920.0  ...   
5           229285.0         263731.0          43819.0  ...   
6           231938.0         265938.0          43449.0  ...   

   total_prison_pop_rate  female_prison_pop_

In [14]:
# 2) Remove any duplicate column names – this keeps the first occurrence of each.
#    columns.duplicated() marks every repeat after the first as True, so the
#    negated mask selects each column name once. The .copy() detaches the
#    result from the frame it was sliced from, so assigning to its columns
#    afterwards does not raise SettingWithCopyWarning.
df = df.loc[:, ~df.columns.duplicated()].copy()

# 3) Save the deduplicated dataset
output_path = "Prison_population_2015_2022_updated.csv"
df.to_csv(output_path, index=False)

# 4) Confirmation
print(f"Dropped duplicate columns and saved unique-column dataset to: {output_path}")
print("Remaining columns:")
print(df.columns.tolist())

Dropped duplicate columns and saved unique-column dataset to: Prison_population_2015_2022_updated.csv
Remaining columns:
['year', 'state_abbr', 'state_name', 'state_fips', 'region', 'division', 'total_pop_15to64', 'female_pop_15to64', 'male_pop_15to64', 'aapi_pop_15to64', 'aapi_pct', 'black_pop_15to64', 'black_pct', 'latinx_pop_15to64', 'latinx_pct', 'native_pop_15to64', 'native_pct', 'white_pop_15to64', 'white_pct', 'aapi_female_pop_15to64', 'black_female_pop_15to64', 'latinx_female_pop_15to64', 'native_female_pop_15to64', 'white_female_pop_15to64', 'aapi_male_pop_15to64', 'black_male_pop_15to64', 'latinx_male_pop_15to64', 'native_male_pop_15to64', 'white_male_pop_15to64', 'total_incarceration', 'total_incarceration_rate', 'held_in_jail', 'total_jail_pop', 'female_jail_pop', 'male_jail_pop', 'total_jail_pop_rate', 'female_jail_pop_rate', 'male_jail_pop_rate', 'total_prison_pop', 'female_prison_pop', 'female_total', 'male_prison_pop', 'male_total', 'total_prison_pop_rate', 'female_pris

In [15]:
# ------------------------------------------------------------------
# 2) List of columns to convert to integer
# ------------------------------------------------------------------
# Population counts, incarceration counts, and the rate columns.
int_cols = [
    'total_pop_15to64', 'female_pop_15to64', 'male_pop_15to64',
    'aapi_pop_15to64', 'black_pop_15to64', 'latinx_pop_15to64',
    'native_pop_15to64', 'white_pop_15to64',
    'aapi_female_pop_15to64', 'black_female_pop_15to64',
    'latinx_female_pop_15to64', 'native_female_pop_15to64',
    'white_female_pop_15to64',
    'aapi_male_pop_15to64', 'black_male_pop_15to64',
    'latinx_male_pop_15to64', 'native_male_pop_15to64',
    'white_male_pop_15to64',
    'total_incarceration', 'held_in_jail', 'total_jail_pop',
    'female_jail_pop', 'male_jail_pop',
    'total_prison_pop', 'female_prison_pop',
    'female_total', 'male_prison_pop', 'male_total',
    'total_incarceration_rate', 'total_jail_pop_rate',
    'female_jail_pop_rate', 'male_jail_pop_rate', 'total_prison_pop_rate',
    'female_prison_pop_rate', 'male_prison_pop_rate',
]

# ------------------------------------------------------------------
# 3) Convert each column to integer
# ------------------------------------------------------------------
# Only columns actually present in df are converted; the same filtered
# list is reused for the preview below so a missing column cannot raise
# a KeyError there.
present_int_cols = [col for col in int_cols if col in df.columns]

# Values are rounded to the nearest integer and stored in pandas'
# nullable 'Int64' dtype. Missing or unparsable entries stay missing
# (<NA>) instead of being turned into 0, which would read as a real
# count/rate of zero. assign() returns a new frame rather than mutating
# the existing one column by column.
converted = {
    col: pd.to_numeric(df[col], errors='coerce').round().astype('Int64')
    for col in present_int_cols
}
df = df.assign(**converted)

# ------------------------------------------------------------------
# 4) Save the cleaned dataset
# ------------------------------------------------------------------
# Missing values are written as empty cells in the CSV.
output_path = 'Prison_population_2015_2022_updated.csv'
df.to_csv(output_path, index=False)

# ------------------------------------------------------------------
# 5) Confirmation
# ------------------------------------------------------------------
print(f"Converted columns to integer and saved to: {output_path}")
df[present_int_cols].head()


Converted columns to integer and saved to: Prison_population_2015_2022_updated.csv


Unnamed: 0,total_pop_15to64,female_pop_15to64,male_pop_15to64,aapi_pop_15to64,black_pop_15to64,latinx_pop_15to64,native_pop_15to64,white_pop_15to64,aapi_female_pop_15to64,black_female_pop_15to64,...,female_total,male_prison_pop,male_total,total_incarceration_rate,total_jail_pop_rate,female_jail_pop_rate,male_jail_pop_rate,total_prison_pop_rate,female_prison_pop_rate,male_prison_pop_rate
2,484376,224281,260095,44427,21821,37108,77627,303393,22686,8891,...,410,1430,4029,986,605,149,999,310,33,549
3,486916,225591,261325,44109,21813,36553,77714,306727,22457,8889,...,442,1478,4197,952,631,158,1040,320,37,565
4,488623,226819,261804,43920,21707,35801,77565,309630,22371,8897,...,419,1695,4159,936,569,141,941,367,43,647
5,493016,229285,263731,43819,21812,35800,77429,314156,22373,8997,...,426,1700,4049,907,546,150,890,361,35,644
6,497876,231938,265938,43449,21926,35475,77469,319557,22250,9046,...,379,1852,4001,879,490,125,808,389,37,696
