# Import Libraries / Path / Data

In [3]:
#import libraries
import pandas as pd
import numpy as np
import os

In [5]:
#create path
path = r'C:\Users\TypicalPancake\6.10 Gun Violence 12-2024'

In [7]:
#import data
df = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'gv_missing.csv'))

# Optimizing Dtypes Per Column

In [9]:
# Display initial memory usage
print("Initial memory usage:")
print(f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Initial memory usage:
366.64 MB


In [11]:
# Optimize each column
for col in df.columns:
    col_type = df[col].dtype
    
    # Optimize numeric columns
    if pd.api.types.is_numeric_dtype(col_type):
        if pd.api.types.is_integer_dtype(col_type):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(col_type):
            df[col] = pd.to_numeric(df[col], downcast='float')
    
    # Optimize object columns (e.g., strings)
    elif pd.api.types.is_object_dtype(col_type):
        unique_count = df[col].nunique()
        total_count = len(df[col])
        
        # Convert to category if unique values are less than 50% of total rows
        if unique_count / total_count < 0.5:
            df[col] = df[col].astype('category')
    
    # Optimize datetime columns
    elif pd.api.types.is_datetime64_any_dtype(col_type):
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Display final memory usage
print("\nFinal memory usage:")
print(f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


Final memory usage:
172.58 MB


In [13]:
# Check data types
print(df.dtypes)

Unnamed: 0                        int32
incident_id                       int32
date                           category
state                          category
city_or_county                 category
address                          object
n_killed                           int8
n_injured                          int8
incident_url                     object
source_url                       object
incident_url_fields_missing        bool
congressional_district          float32
gun_stolen                     category
gun_type                       category
incident_characteristics       category
latitude                        float32
location_description           category
longitude                       float32
n_guns_involved                 float32
notes                            object
participant_age                category
participant_age_group          category
participant_gender             category
participant_name               category
participant_relationship       category


# Exporting Dataframe

In [23]:
df.to_csv(os.path.join(path, '02 Data','Prepared Data', 'gv_cleaned.csv'))