# 02_data_cleaning.ipynb  
## Data Cleaning and Preparation

### 1. Import Required Libraries  
In this step, we load the essential libraries for data cleaning and manipulation:  
- **pandas** for tabular data structures and analysis  
- **numpy** for numerical operations  
- **pathlib** for portable file path management  


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


### 2. Define Data Directories  
Set up input (`data/raw`) and output (`data/processed`) paths, ensuring the output folder exists before saving cleaned files.

In [2]:
RAW_DIR = Path('../data/raw')
PROC_DIR = Path('../data/processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)


### 3. Load Raw Datasets  
- **Crime:** CSV file containing NYPD incident records  
- **Weather:** CSV file with daily NOAA observations  
  
Print the file paths for verification, then load both into pandas DataFrames.

In [3]:


# 1. Set the directory where raw data lives
RAW_DATA_DIR = Path('data/raw')

# 2. Point to the actual CSV filenames
crime_csv   = RAW_DATA_DIR / 'nypd_crime_2024_onwards.csv'
weather_csv = RAW_DATA_DIR / 'noaa_ghcnd_2024.csv'

# 3. Load each file into a DataFrame, parsing the appropriate date columns
df_crime   = pd.read_csv(crime_csv,   parse_dates=['CMPLNT_FR_DT'])
df_weather = pd.read_csv(weather_csv, parse_dates=['DATE'])

# 4. Print the number of rows and show a quick preview
print(f"Number of crime records:   {len(df_crime)}")
print(f"Number of weather records: {len(df_weather)}")

df_crime.head(), df_weather.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data\\raw\\nypd_crime_2024_onwards.csv'


#### 3.1. Preview & Data Types  
- Display the first few rows of each DataFrame for a quick glance.  
- Inspect each column’s data type to determine necessary conversions.


In [None]:
## 3.1. Preview and datatypes
display(df_crime.head())  
display(df_crime.dtypes)

display(df_weather.head())
"display(df_weather.dtypes)"  # fix display below

# %%
# Fix display of df_weather.dtypes
display(df_weather.dtypes)


### 4. Convert Date & Time Columns  
- **Crime:** Merge date (`CMPLNT_FR_DT`) and time (`CMPLNT_FR_TM`) into a single `complaint_datetime` column, and extract a separate `date` field.  
- **Weather:** Parse the `DATE` column into a standardized `date` field.


In [None]:
# 4.1. Convert crime date/time
# Combine date and time into full datetime and create a separate date column

df_crime['complaint_datetime'] = pd.to_datetime(
    df_crime['CMPLNT_FR_DT'].astype(str) + ' ' + df_crime['CMPLNT_FR_TM'].astype(str),
    errors='coerce'
)
df_crime['date'] = df_crime['complaint_datetime'].dt.date


In [None]:
# %%
# 4.2. Convert weather date
df_weather['date'] = pd.to_datetime(df_weather['DATE'], errors='coerce').dt.date


In [None]:
## 5. Normalize column names to snake_case

# %%
# Rename crime columns
mapping_crime = {
    'CMPLNT_FR_DT': 'complaint_date',
    'CMPLNT_FR_TM': 'complaint_time',
    'CMPLNT_TO_DT': 'complaint_to_date',
    'CMPLNT_TO_TM': 'complaint_to_time',
    'ADDR_PCT_CD': 'precinct_code',
    'KY_CD': 'offense_code',
    'OFNS_DESC': 'offense_desc',
    'BORO_NM': 'borough',
    'LAW_CODE': 'law_code',
    'LAW_CAT_CD': 'law_category',
    # add more as needed
}

df_crime = df_crime.rename(columns=mapping_crime)

# Rename weather columns
mapping_weather = {
    'DATE': 'date_recorded',
    'TMAX': 'temp_max',
    'TMIN': 'temp_min',
    'PRCP': 'precipitation',
    # add more based on your CSV
}

df_weather = df_weather.rename(columns=mapping_weather)

In [None]:

# %%
# 6.1. Missing percentage in crime
df_crime_missing = df_crime.isna().mean().sort_values(ascending=False) * 100
display(df_crime_missing.head(10))

In [None]:
# 6.2. Missing percentage in weather
df_weather_missing = df_weather.isna().mean().sort_values(ascending=False) * 100
display(df_weather_missing.head(10))


In [None]:

# %%
# 6.3. Simple imputation example for weather
# Fill missing precipitation with 0 (assuming missing means no rain)
df_weather['precipitation'] = df_weather['precipitation'].fillna(0)

In [None]:

# %%
# 6.4. Drop crime rows with invalid dates
df_crime = df_crime.dropna(subset=['complaint_datetime', 'date'])


In [None]:
## 7. Filter analysis period


# %%
start_date = pd.to_datetime('2018-01-01').date()
end_date   = pd.to_datetime('2022-12-31').date()

df_crime = df_crime[(df_crime['date'] >= start_date) & (df_crime['date'] <= end_date)]
df_weather = df_weather[(df_weather['date'] >= start_date) & (df_weather['date'] <= end_date)]

print(f"Crime records between {start_date} and {end_date}: {len(df_crime)}")
print(f"Weather records between {start_date} and {end_date}: {len(df_weather)}")


In [None]:
## 8. Save cleaned datasets

# %%
clean_crime_file = PROC_DIR / 'crime_clean.csv'
clean_weather_file = PROC_DIR / 'weather_clean.csv'

df_crime.to_csv(clean_crime_file, index=False)
df_weather.to_csv(clean_weather_file, index=False)

print(f"Saved cleaned crime data to: {clean_crime_file}")
print(f"Saved cleaned weather data to: {clean_weather_file}")
