In [1]:
import pandas as pd

BJA = pd.read_csv(
    "https://raw.githubusercontent.com/Ramil-cyber/Research_Linking_Analyzing_Deaths_US_Prisons/refs/heads/main/Data/BJA_cleaned.csv",
    low_memory=False,
)
BJS_prison = pd.read_csv(
    "https://raw.githubusercontent.com/Ramil-cyber/Research_Linking_Analyzing_Deaths_US_Prisons/refs/heads/main/Data/BJS_cleaned.csv",
    low_memory=False,
)

# Aligning on common columns
common_cols = list(set(BJA.columns).intersection(BJS_prison.columns))
BJA = BJA[common_cols]
BJS_prison = BJS_prison[common_cols]

# Concatenating row-wise
Merged_BJA_BJS_prison = pd.concat([BJA, BJS_prison], ignore_index=True)

# Reordering columns in the specified sequence
cols_order = [
    "First_Name",
    "Last_Name",
    "Gender",
    "Race",
    "Birth_Year",
    "Reporting_Death_Year",
    "State_Name",
    "City_Name",
    "Facility_Name",
    "Death_Cause",
    "Source",
]
Merged_BJA_BJS_prison = Merged_BJA_BJS_prison[cols_order]

In [2]:
# Ensuring Reporting_Death_Year is integer
Merged_BJA_BJS_prison["Reporting_Death_Year"] = pd.to_numeric(
    Merged_BJA_BJS_prison["Reporting_Death_Year"], errors="coerce"
).astype("Int64")

# Sorting by Reporting_Death_Year ascending
Merged_BJA_BJS_prison = Merged_BJA_BJS_prison.sort_values(
    by="Reporting_Death_Year", ascending=True
).reset_index(drop=True)
Merged_BJA_BJS_prison

Unnamed: 0,First_Name,Last_Name,Gender,Race,Birth_Year,Reporting_Death_Year,State_Name,City_Name,Facility_Name,Death_Cause,Source
0,Jose,Sanchez,Male,Hispanic or Latino,1927.0,2015,Wyoming,Torrington,Wyoming Medium Correctional,Illness,BJS
1,Ernesto,Nieves,Male,Hispanic or Latino,1964.0,2015,Massachusetts,Bridgewater,Bwater State Hospital,Illness,BJS
2,Francis,Soffen,Male,White (not Hispanic),1939.0,2015,Massachusetts,Shirley,MCI Shirley,Illness,BJS
3,Kenneth,Getchell,Male,White (not Hispanic),1931.0,2015,Massachusetts,Shirley,MCI Shirley,Illness,BJS
4,Alfred,Trudell,Male,White (not Hispanic),1942.0,2015,Massachusetts,Shirley,MCI Shirley,Illness,BJS
...,...,...,...,...,...,...,...,...,...,...,...
42013,Brandon,Stephenson,Male,Black or African American,1994.0,2023,Ohio,COLUMBUS,Ohio Office of Criminal Justice Services,Illness,BJA
42014,Joseph,Bennet Jr.,Male,White (not Hispanic),1981.0,2023,Ohio,Columbus,Ohio Office of Criminal Justice Services,Illness,BJA
42015,Anthony,Starr,Male,White (not Hispanic),1943.0,2023,Ohio,Grafton,Ohio Office of Criminal Justice Services,Illness,BJA
42016,Joseph,Williams,Male,Black or African American,1952.0,2023,Ohio,COLUMBUS,Ohio Office of Criminal Justice Services,Illness,BJA


In [3]:
# Uppercasing all string values (rows) but keep column names intact
for col in Merged_BJA_BJS_prison.select_dtypes(include=["object"]).columns:
    Merged_BJA_BJS_prison[col] = Merged_BJA_BJS_prison[col].str.upper()

# Ensuring Birth_Year is integer
Merged_BJA_BJS_prison["Birth_Year"] = pd.to_numeric(
    Merged_BJA_BJS_prison["Birth_Year"], errors="coerce"
).astype("Int64")
Merged_BJA_BJS_prison

# Saving the final updated dataset
output_path = "Merged_BJA_BJS_prison.csv"
Merged_BJA_BJS_prison.to_csv(output_path, index=False)
print(f"Final dataset saved to: {output_path}")

Final dataset saved to: Merged_BJA_BJS_prison.csv


In [4]:
# Total number of records
total_records = len(Merged_BJA_BJS_prison)

# Number of unique records (across all columns)
unique_records = len(Merged_BJA_BJS_prison.drop_duplicates())

# Number of duplicate records
duplicate_records = total_records - unique_records

print(f"Total records: {total_records}")
print(f"Unique records: {unique_records}")
print(f"Duplicate records: {duplicate_records}")

Total records: 42018
Unique records: 41978
Duplicate records: 40


In [5]:
# Finding rows duplicated across every column
duplicates = Merged_BJA_BJS_prison[Merged_BJA_BJS_prison.duplicated(keep=False)]
duplicates

Unnamed: 0,First_Name,Last_Name,Gender,Race,Birth_Year,Reporting_Death_Year,State_Name,City_Name,Facility_Name,Death_Cause,Source
16683,LARRY,MUKES,MALE,BLACK OR AFRICAN AMERICAN,1974,2020,TEXAS,HUMBLE,"GOVERNOR, TEXAS OFFICE OF THE",SUICIDE,BJA
16722,LARRY,MUKES,MALE,BLACK OR AFRICAN AMERICAN,1974,2020,TEXAS,HUMBLE,"GOVERNOR, TEXAS OFFICE OF THE",SUICIDE,BJA
20196,LEE,CREELY,MALE,WHITE (NOT HISPANIC),1986,2020,GEORGIA,SAVANNAH,CRIMINAL JUSTICE COORDINATING COUNCIL,ACCIDENT,BJA
21385,LEE,CREELY,MALE,WHITE (NOT HISPANIC),1986,2020,GEORGIA,SAVANNAH,CRIMINAL JUSTICE COORDINATING COUNCIL,ACCIDENT,BJA
22663,BRUCE,LESLIE,MALE,WHITE (NOT HISPANIC),1952,2021,WYOMING,TORRINGTON,ATTORNEY GENERAL OF WYOMING,ILLNESS,BJA
...,...,...,...,...,...,...,...,...,...,...,...
36702,UNKNOWN,UNKNOWN,MALE,BLACK OR AFRICAN AMERICAN,,2023,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,USE OF FORCE,BJA
36933,MATTHEW,BRIGGS,MALE,WHITE (NOT HISPANIC),1981,2023,IOWA,COUNCIL BLUFFS,DEPT OF PUB SAFETY,USE OF FORCE,BJA
36970,MATTHEW,BRIGGS,MALE,WHITE (NOT HISPANIC),1981,2023,IOWA,COUNCIL BLUFFS,DEPT OF PUB SAFETY,USE OF FORCE,BJA
39155,ERIC,REAVES,MALE,BLACK OR AFRICAN AMERICAN,1969,2023,PENNSYLVANIA,COAL TOWNSHIP,CRIME & DELINQUENCY PA COMMISSION ON,ILLNESS,BJA


In [6]:
# Computing missing counts and percentages
missing_counts = Merged_BJA_BJS_prison.isna().sum()
missing_percent = (Merged_BJA_BJS_prison.isna().mean() * 100).round(2)

# Combining into a summary DataFrame
missing_summary = pd.DataFrame(
    {"missing_count": missing_counts, "missing_percent": missing_percent}
)

print(missing_summary)

                      missing_count  missing_percent
First_Name                        5             0.01
Last_Name                         3             0.01
Gender                           24             0.06
Race                              0             0.00
Birth_Year                      765             1.82
Reporting_Death_Year              0             0.00
State_Name                       89             0.21
City_Name                       109             0.26
Facility_Name                    23             0.05
Death_Cause                     180             0.43
Source                            0             0.00


In [None]:
# # 2) Identify rows with "Unknown" names (case-insensitive)
# mask_unknown = (Merged_BJA_BJS_prison['First_Name'].str.upper() == 'UNKNOWN') | (Merged_BJA_BJS_prison['Last_Name'].str.upper() == 'UNKNOWN')

# # 3) Split into known vs. unknown subsets
# df_known = Merged_BJA_BJS_prison[~mask_unknown]
# df_unknown = Merged_BJA_BJS_prison[mask_unknown]

# # 4) Drop duplicates in the known subset based on First_Name and Last_Name
# df_known = df_known.drop_duplicates(subset=['First_Name', 'Last_Name'], keep='first')

# # 5) Recombine the cleaned known subset with all unknown rows
# Cleaned_BJA_BJS_prison = pd.concat([df_known, df_unknown], ignore_index=True)

# # 6) Reset index
# Cleaned_BJA_BJS_prison.reset_index(drop=True, inplace=True)

# # 7) Save the deduplicated dataset
# output_path = 'Cleaned_BJA_BJS_prison_deduped.csv'
# Cleaned_BJA_BJS_prison.to_csv(output_path, index=False)

# # 8) Display summary
# print(f"Original rows: {len(Merged_BJA_BJS_prison)}")
# print(f"Rows after deduplication (excluding UNKNOWN): {len(Cleaned_BJA_BJS_prison)}")
# print(f"Deduplicated file saved to: {output_path}")


Original rows: 42018
Rows after deduplication (excluding UNKNOWN): 38189
Deduplicated file saved to: Cleaned_BJA_BJS_prison_deduped.csv


In [13]:
# 2) Identify “unknown” rows (where either first or last name is UNKNOWN)
mask_unknown = (
    Merged_BJA_BJS_prison['First_Name'].str.strip().str.upper() == 'UNKNOWN'
) | (
    Merged_BJA_BJS_prison['Last_Name'].str.strip().str.upper() == 'UNKNOWN'
)

# 3) Split into known vs. unknown subsets
df_known   = Merged_BJA_BJS_prison[~mask_unknown]
df_unknown = Merged_BJA_BJS_prison[mask_unknown]

# 4) Drop exact duplicates in the known subset (keeps first occurrence)
df_known = df_known.drop_duplicates(keep='first')

# 5) Recombine known + all unknown rows
Cleaned_BJA_BJS_prison = pd.concat([df_known, df_unknown], ignore_index=True)

# 6) (Optional) Reset index
Cleaned_BJA_BJS_prison.reset_index(drop=True, inplace=True)

# 7) Save the deduplicated file
Cleaned_BJA_BJS_prison.to_csv('Cleaned_BJA_BJS_prison.csv', index=False)
print(f"Original rows: {len(Merged_BJA_BJS_prison):,}")
print(f"Cleaned rows : {len(Cleaned_BJA_BJS_prison):,}")


Original rows: 42,018
Cleaned rows : 41,993


In [14]:
# Finding rows duplicated across every column
duplicates = Cleaned_BJA_BJS_prison[Cleaned_BJA_BJS_prison.duplicated(keep=False)]
duplicates

Unnamed: 0,First_Name,Last_Name,Gender,Race,Birth_Year,Reporting_Death_Year,State_Name,City_Name,Facility_Name,Death_Cause,Source
41940,UNKNOWN,UNKNOWN,MALE,BLACK OR AFRICAN AMERICAN,,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,USE OF FORCE,BJA
41943,UNKNOWN,UNKNOWN,MALE,BLACK OR AFRICAN AMERICAN,,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,USE OF FORCE,BJA
41944,UNKNOWN,UNKNOWN,MALE,BLACK OR AFRICAN AMERICAN,,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,"UNAVAILABLE, INVESTIGATION PENDING",BJA
41946,UNKNOWN,UNKNOWN,MALE,BLACK OR AFRICAN AMERICAN,,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,"UNAVAILABLE, INVESTIGATION PENDING",BJA
41947,UNKNOWN,UNKNOWN,MALE,WHITE (NOT HISPANIC),,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,SUICIDE,BJA
41949,UNKNOWN,UNKNOWN,MALE,WHITE (NOT HISPANIC),,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,SUICIDE,BJA
41950,UNKNOWN,UNKNOWN,MALE,WHITE (NOT HISPANIC),,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,SUICIDE,BJA
41951,UNKNOWN,UNKNOWN,MALE,WHITE (NOT HISPANIC),,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,"UNAVAILABLE, INVESTIGATION PENDING",BJA
41952,UNKNOWN,UNKNOWN,MALE,WHITE (NOT HISPANIC),,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,"UNAVAILABLE, INVESTIGATION PENDING",BJA
41953,UNKNOWN,UNKNOWN,MALE,WHITE (NOT HISPANIC),,2022,ILLINOIS,UNKNOWN,ILLINOIS CRIMINAL JUSTICE INFORMATION AUTHORITY,"UNAVAILABLE, INVESTIGATION PENDING",BJA
