### Import Libraries

In [1]:
import pandas as pd
import datetime
import sqlite3
import zipfile

### Import Data

Crime Data:
- ZIP File.

In [2]:
# Read the crime CSV straight out of its ZIP archive. The context managers
# close both the archive and the member stream once the frame has been loaded
# (previously neither handle was ever closed).
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with zipfile.ZipFile("c:/users/barbieri/dsc-program/DC-Criminalistics/data/dc-crime-data/dc-crime-data.csv.zip", mode='r') as crime_zip:
    with crime_zip.open('dc-crime-data.csv') as crime_csv:
        crime_df = pd.read_csv(crime_csv)

Weather Data
- DB Name: weather_data
- Table Name: weather_data

In [3]:
# Load the full weather_data table into a DataFrame. try/finally guarantees
# the SQLite connection is closed even when the query raises.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
conn = sqlite3.connect('c:/users/barbieri/dsc-program/DC-Criminalistics/data/weather-data/weather_data.db')
try:
    weather_df = pd.read_sql('''select * from weather_data;''', conn)
finally:
    conn.close()

Census Data

- DB Name: census_bg
- Table Name: census_blockgroup

In [4]:
# Load the full census_blockgroup table into a DataFrame. try/finally
# guarantees the SQLite connection is closed even when the query raises.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
conn = sqlite3.connect('c:/users/barbieri/dsc-program/DC-Criminalistics/data/census-data/census_bg.db')
try:
    census_df = pd.read_sql('''select * from census_blockgroup;''', conn)
finally:
    conn.close()

### Wrangle Data

Crime Data:
- Create year, month, and time of day variables.

In [5]:
def assign_tod(row):
    """Derive year, month and a time-of-day label from a row's START_DATE.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'START_DATE' entry that ``pd.Timestamp`` can parse.

    Returns
    -------
    tuple
        ``(year, month, time_of_day)``. ``year`` and ``month`` are ints and
        ``time_of_day`` is one of 'Midnight' (hours 0-2), 'Early Morning'
        (3-5), 'Morning' (6-11), 'Afternoon' (12-17), 'Evening' (18-20) or
        'Night' (21-23). When START_DATE is absent, null or unparseable the
        result is ``('', '', '')``.
    """
    # (exclusive upper hour bound, label), in ascending order.
    buckets = (
        (3, 'Midnight'),
        (6, 'Early Morning'),
        (12, 'Morning'),
        (18, 'Afternoon'),
        (21, 'Evening'),
        (24, 'Night'),
    )

    try:
        timestamp = pd.Timestamp(row['START_DATE'])
    except Exception:
        # Best effort: a bad or missing date yields blank values. Only
        # Exception is caught, so KeyboardInterrupt / SystemExit still
        # propagate while this runs row by row over a large frame.
        return '', '', ''

    # Null dates parse to NaT, which carries no usable hour.
    if pd.isna(timestamp):
        return '', '', ''

    start_hour = timestamp.hour
    time_of_day = next(label for upper, label in buckets if start_hour < upper)

    return timestamp.year, timestamp.month, time_of_day

In [6]:
# Run assign_tod on every row; result_type='expand' spreads the returned
# 3-tuple over the new year / month / tod columns.
crime_df[['year','month','tod']] = crime_df.apply(
    assign_tod,
    axis=1,
    result_type='expand',
)

Weather Data:
- Remove rows whose timestamp is zero or negative (i.e., at or before the 1970 Unix epoch); these times are erroneous.
- Convert date-time to a format to match to the crime data, named "crime_time".
- Rename variables.
- Delete the variables that are non-numeric (in theory, one could encode these variables instead).

In [7]:
def convertTime(row):
    """Format a Unix timestamp as a 'YYYY-MM-DDTHH:MM:SS.000' string.

    ``row`` is the epoch value in seconds. The conversion goes through
    ``datetime.fromtimestamp`` without a tz argument, so the result is
    expressed in the local time zone of the machine running the notebook.
    The millisecond part is always the literal '.000'.
    """
    local_dt = datetime.datetime.fromtimestamp(row)
    return local_dt.strftime('%Y-%m-%dT%H:%M:%S.000')

In [8]:
# Keep only rows with a positive Unix timestamp. .copy() makes the filtered
# result an independent frame, so the column assignment below does not write
# to a slice of the original (which triggers SettingWithCopyWarning).
weather_df = weather_df[weather_df['currently_time'] > 0].copy()

# String form of the timestamp, later used as a merge key against the crime
# data's START_DATE.
weather_df['crime_time_format'] = weather_df['currently_time'].apply(convertTime)

In [9]:
# Original weather column name -> shorter name used in the rest of the notebook.
rename = {
    'currently_apparentTemperature': 'apparent_temp',
    'currently_cloudCover': 'cloud_cover',
    'currently_dewPoint': 'dew_point',
    'currently_humidity': 'humidity',
    'currently_icon': 'icon',
    'currently_precipIntensity': 'percip_intensity',
    'currently_precipProbability': 'percip_probability',
    'currently_precipType': 'percip_type',
    'currently_pressure': 'pressure',
    'currently_summary': 'summary',
    'currently_temperature': 'temperature',
    'currently_time': 'time',
    'currently_uvIndex': 'uv_index',
    'currently_visibility': 'visibility',
    'currently_windBearing': 'wind_bearing',
    'currently_windGust': 'wind_gust',
    'currently_windSpeed': 'wind_speed',
    'latitude': 'weather_latitude',
    'longitude': 'weather_longitude',
}

# Apply the new names first, then discard the columns that are not kept
# (referred to here by their post-rename names).
weather_df.rename(columns=rename, inplace=True)
weather_df.drop(
    columns=['index','code','summary','icon','error','percip_type'],
    inplace=True,
)

Census Data

In [10]:
#Fill NA values and values less than 0 with the mean of values greater than zero.
columns = ['TotalPop','TPopMargin','UnWgtSampleCtPop','PerCapitaIncome','PerCapIncMargin','MedianHouseholdInc',
'MedHouseholdIncMargin','MedianAge','MedianAgeMargin','HousingUnits','HousingUnitsMargin',
'UnweightedSampleHousingUnits']

for col in columns:
    #Edit string (object) columns to numeric (float); unparseable entries become NaN.
    if census_df[col].dtype == 'object':
        census_df[col] = pd.to_numeric(census_df[col], errors='coerce')

    #Calculate the mean over strictly positive values only.
    mean = census_df.loc[census_df[col] > 0, col].mean()

    #Fill NA in this column with the mean.
    cleaned = census_df[col].fillna(mean)

    #Replace values less than zero with the mean -- in THIS column only.
    #The previous `census_df[census_df[col] < 0] = mean` overwrote every
    #column of the matching rows (Tract, BlockGroup, Year, ...) with the mean.
    census_df[col] = cleaned.mask(cleaned < 0, mean)

#Reformat columns and rename year column.
#Raw strings keep the regex backslash without an invalid-escape warning.
census_df['BlockGroup'] = census_df['BlockGroup'].astype(str).replace(r']]', '', regex=True)
census_df['BlockGroup'] = census_df['BlockGroup'].astype(str).replace(r'\.0', '', regex=True)
census_df['Tract'] = census_df['Tract'].astype(str).replace(r'\.0', '', regex=True)
#Left-pad the tract code with zeros to six characters.
census_df['Tract'] = census_df['Tract'].apply(lambda x: x.zfill(6))
census_df['Year'] = census_df['Year'].astype(str).replace(r'\.0', '', regex=True)
census_df.rename(columns=dict(Year = 'census_year'), inplace=True)

#Create an index to merge with crime data: tract + block group + year, as one string.
census_df['index'] = census_df['Tract'] + census_df['BlockGroup'] + census_df['census_year']
#Keep the first row for each key so the key is unique.
census_df_nodup = census_df.drop_duplicates(subset='index')

### Merge Weather and Crime Data

Merge weather and crime data before aggregating.

In [11]:
# Left join: every crime row is kept, and weather readings are attached where
# latitude, longitude and the formatted timestamp all match exactly.
crime_weather_mr = pd.merge(
    crime_df,
    weather_df,
    how='left',
    left_on=['LATITUDE','LONGITUDE','START_DATE'],
    right_on=['weather_latitude','weather_longitude','crime_time_format'],
)

### Aggregate Crime/Weather Data

In [12]:
# Columns carried through the aggregation: the offense column first,
# followed by the numeric weather measurements.
agg_vars = [
    'offensegroup',
    'apparent_temp',
    'cloud_cover',
    'dew_point',
    'humidity',
    'percip_intensity',
    'percip_probability',
    'pressure',
    'temperature',
    'uv_index',
    'visibility',
    'wind_bearing',
    'wind_gust',
    'wind_speed',
]

In [13]:
# Aggregation per column: 'offensegroup' is counted with 'size';
# every other variable in agg_vars is averaged with 'mean'.
agg_dict = {
    var: ('size' if var == 'offensegroup' else 'mean')
    for var in agg_vars
}

In [24]:
# Collapse to one row per block group / year / month / time-of-day, applying
# the per-column aggregations in agg_dict. as_index=False keeps the grouping
# keys as ordinary columns.
crime_weather_agg = (
    crime_weather_mr
    .groupby(by=['BLOCK_GROUP','year','month','tod'], as_index=False)
    .agg(agg_dict)
)

### Merge Weather-Crime and Census Data

In [17]:
# Merge key: BLOCK_GROUP concatenated with the year as text, with all
# spaces stripped out afterwards.
crime_weather_agg['index'] = (
    crime_weather_agg['BLOCK_GROUP'] + crime_weather_agg['year'].astype('str')
).str.replace(" ","")

In [18]:
# Left join the census attributes onto the aggregated crime/weather rows.
# The right side is census_df_nodup (one row per 'index' key), not census_df:
# duplicate keys on the right would repeat aggregated rows in the result.
# validate='many_to_one' makes pandas raise if that uniqueness ever breaks.
crime_weather_census = crime_weather_agg.merge(
    census_df_nodup,
    how='left',
    on='index',
    validate='many_to_one',
)

In [19]:
# Display the column labels of the merged crime / weather / census frame.
crime_weather_census.columns

Index(['BLOCK_GROUP', 'year', 'month', 'tod', 'offensegroup', 'apparent_temp',
       'cloud_cover', 'dew_point', 'humidity', 'percip_intensity',
       'percip_probability', 'pressure', 'temperature', 'uv_index',
       'visibility', 'wind_bearing', 'wind_gust', 'wind_speed', 'index',
       'TotalPop', 'TPopMargin', 'UnWgtSampleCtPop', 'PerCapitaIncome',
       'PerCapIncMargin', 'MedianHouseholdInc', 'MedHouseholdIncMargin',
       'MedianAge', 'MedianAgeMargin', 'HousingUnits', 'HousingUnitsMargin',
       'UnweightedSampleHousingUnits', 'State', 'County', 'Tract',
       'BlockGroup', 'census_year'],
      dtype='object')