In [45]:
# Importing necessary libraries to look at the data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

## EDA
---

1. We do not need new fields, these fields are sufficient. But we might add more to compare different major cities.

In [12]:
# Getting the display settings so that we can see all of the data
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)  

In [13]:
# loading the csv into a dataframe using pandas
crime_df = pd.read_csv('../data/crimedata2.csv',encoding='latin-1')

In [14]:
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2215 entries, 0 to 2214
Columns: 147 entries, Êcommunityname to nonViolPerPop
dtypes: float64(75), int64(29), object(43)
memory usage: 2.5+ MB


In [15]:
# Looking at the number of rows and columns in the dataframe
crime_df.shape

(2215, 147)

## Missing Values
---

In [21]:
len(crime_df[crime_df.isna()==True])

2215

#### Question Mark Missing Values

In [22]:
# Count the occurrences of '?' in the DataFrame
question_mark_count = crime_df.apply(lambda row: row.astype(str).str.count('\?')).sum().sum()
question_mark_count

np.int64(44592)

2. There is about 2215 missing values, and there are 44592 values where there are "?" values that are not interpretted as missing values.

## Features list
---

In [17]:
population_features  = ["population"]
crime_features       = [ "autoTheft", "autoTheftPerPop", "larcenies", "larcPerPop", "burglaries", "burglPerPop", "robberies", "robbbPerPop"]
money_features       = ["medIncome", "RentMedian", "MedRent"]
race_features        = ["racepctblack", "racePctWhite", "racePctAsian", "racePctHisp"]
age_features         = ["agePct12t21", "agePct12t29", "agePct16t24", "agePct65up"]
categorical_features = ['Êcommunityname', 'state', 'countyCode', 'communityCode']
qm_columns = [
    'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 
    'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 
    'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 
    'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 
    'PolicAveOTWorked', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 
    'LemasGangUnitDeploy', "PolicBudgPerPop"
]


In [23]:
crime_df_new = crime_df[categorical_features + population_features + crime_features + money_features]
crime_df_new.head()

Unnamed: 0,Êcommunityname,state,countyCode,communityCode,population,autoTheft,autoTheftPerPop,larcenies,larcPerPop,burglaries,burglPerPop,robberies,robbbPerPop,medIncome,RentMedian,MedRent
0,BerkeleyHeightstownship,NJ,39,5320,11980,16,131.26,138,1132.08,14,114.85,1,8.2,75122,1001,1001
1,Marpletownship,PA,45,47616,23123,26,110.55,376,1598.78,57,242.37,5,21.26,47917,560,627
2,Tigardcity,OR,?,?,29344,136,376.3,1797,4972.19,274,758.14,56,154.95,35669,428,484
3,Gloversvillecity,NY,35,29443,16656,47,271.93,716,4142.56,225,1301.78,10,57.86,20580,250,333
4,Bemidjicity,MN,7,5068,11245,91,728.93,1060,8490.87,91,728.93,4,32.04,17390,283,332


In [53]:
# def correlation(dataframe: pd.DataFrame, chunk_size: int = 15, output_folder: str = "../images/correlation"):
#     # Ensure the output directory exists
#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)
    
#     dataframe = dataframe.select_dtypes(include="number")
#     # Compute the correlation matrix
#     corr_matrix = dataframe.corr()
#     rows, cols = corr_matrix.shape
#     chunk_number = 1

#     # Loop over the correlation matrix in chunks
#     for i in range(0, rows, chunk_size):
#         for j in range(0, cols, chunk_size):
#             # Select a chunk using iloc
#             corr_chunk = corr_matrix.iloc[i:i+chunk_size, j:j+chunk_size]

#             # Plot the current chunk as a heatmap
#             plt.figure(figsize=(10, 8))
#             sns.heatmap(corr_chunk, annot=True, cmap="coolwarm", linewidths=.5, fmt=".2f")
#             plt.title(f"Correlation Matrix Chunk {chunk_number}")

#             # Save the heatmap to the specified folder
#             output_path = os.path.join(output_folder, f"correlation_chunk_{chunk_number}.png")
#             plt.savefig(output_path)
#             plt.close()  # Close the figure to avoid displaying it in the notebook or memory overflow

#             # Increment the chunk number
#             chunk_number += 1

In [58]:
# # Function to filter correlations higher than 0.70 or lower than -0.70, without taking absolute value
# def filter_high_low_correlations(dataframe: pd.DataFrame, threshold: float = 0.70):
#     dataframe = dataframe.select_dtypes(include="number")

#     # Compute the correlation matrix
#     corr_matrix = dataframe.corr()  # Keep the actual correlation values

#     # Filter the matrix to keep only values > 0.70 or < -0.70, excluding the 1.0 self-correlations
#     filtered_corr = corr_matrix[((corr_matrix > threshold) | (corr_matrix < -threshold)) & (corr_matrix != 1.0)].stack().reset_index()

#     # Rename columns for clarity
#     filtered_corr.columns = ['Feature 1', 'Feature 2', 'Correlation']

#     # Drop duplicate pairs (since correlation is symmetric)
#     filtered_corr = filtered_corr.drop_duplicates(subset=['Correlation'])

#     return filtered_corr

In [59]:
# # Apply the function to get high and low correlations
# high_low_correlations = filter_high_low_correlations(crime_df)

# # Display or use the result
# print(high_low_correlations)

## Outliers
---

In [60]:
def outlierChecker(data: pd.DataFrame, column: str):
    iqr = data[column].quantile(0.75) - data[column].quantile(0.25)
    upper_outliers = data[data[column] > data[column].quantile(0.75) + 1.5 * iqr]
    lower_outliers = data[data[column] < data[column].quantile(0.25) - 1.5 * iqr]
    return upper_outliers, lower_outliers

def outlierSummary(data: pd.DataFrame, column: str):
    upper, lower = outlierChecker(data, column)
    print(f"Column: {column}")
    print(f"Upper Outliers: {len(upper)}")
    print(f"Lower Outliers: {len(lower)}")
    print(f"Normalized Upper Outliers: {len(upper)/data.shape[0]}")
    print(f"Normalized Lower Outliers: {len(lower)/data.shape[0]}")
    print("-----------------------------------")

def outlierSummary2(data: pd.DataFrame, column: str):
    upper, lower = outlierChecker(data, column)
    print(f"Column: {column} has {len(upper)+len(lower)} outliers")

In [61]:
for col in crime_df.select_dtypes(include=["number"]).columns.tolist():
    outlierSummary2(data=crime_df, column=col)

Column: fold has 0 outliers
Column: population has 219 outliers
Column: householdsize has 93 outliers
Column: racepctblack has 252 outliers
Column: racePctWhite has 88 outliers
Column: racePctAsian has 240 outliers
Column: racePctHisp has 268 outliers
Column: agePct12t21 has 153 outliers
Column: agePct12t29 has 151 outliers
Column: agePct16t24 has 206 outliers
Column: agePct65up has 38 outliers
Column: numbUrban has 165 outliers
Column: pctUrban has 0 outliers
Column: medIncome has 43 outliers
Column: pctWWage has 25 outliers
Column: pctWFarmSelf has 117 outliers
Column: pctWInvInc has 2 outliers
Column: pctWSocSec has 19 outliers
Column: pctWPubAsst has 76 outliers
Column: pctWRetire has 34 outliers
Column: medFamInc has 63 outliers
Column: perCapInc has 111 outliers
Column: whitePerCap has 119 outliers
Column: blackPerCap has 72 outliers
Column: indianPerCap has 133 outliers
Column: AsianPerCap has 116 outliers
Column: HispPerCap has 85 outliers
Column: NumUnderPov has 235 outliers
C

4. If we calculate the outliers with IQR method, then we see outliers in almost every column to take note of.

## Correlation
---

In [74]:
# Dropping columns that we don't want to make numeric and creating a new df
columns_to_drop = ['Êcommunityname', 'state', 'countyCode', 'communityCode']
numeric_crime_df = crime_df.drop(columns=columns_to_drop)
numeric_crime_df = numeric_crime_df.apply(pd.to_numeric, errors='coerce')

In [86]:
# Creating a correlation matrix and save the correlation matrix to a CSV to check all columns
correlation_matrix = numeric_crime_df.corr()
correlation_matrix.to_csv('../data/correlation_matrix.csv', encoding='utf-8', index=False)

In [87]:
# Threshold so that we only get the results for features we would want to use
threshold = 0.7

high_correlation = correlation_matrix[(correlation_matrix > threshold) | (correlation_matrix < -threshold)]

In [92]:
# We are focused on autoTheft so we are going to only grab the columns of interest
interest_columns = ['autoTheft', 'autoTheftPerPop']
interest_correlation = correlation_matrix[interest_columns].sort_values(by='autoTheft', ascending=False)

# print(interest_correlation)

In [91]:
# # Plot a heatmap of the autoTheft correlation matrix
# plt.figure(figsize=(20, 18))
# sns.heatmap(interest_correlation, cmap='coolwarm', annot=False, vmin=-1, vmax=1)

In [15]:
# Correlation of each feature with 'autoTheft'
target_corr = correlation_matrix['autoTheft'].sort_values(ascending=False)
print(target_corr)

autoTheft        1.000000
NumUnderPov      0.983919
population       0.980754
numbUrban        0.979503
robberies        0.971612
                   ...   
PctTeen2Par     -0.167564
PctFam2Par      -0.172405
PctPolicWhite   -0.175015
PctKids2Par     -0.177880
racePctWhite    -0.206655
Name: autoTheft, Length: 143, dtype: float64


In [90]:
# Find correlations for 'autoTheft' and 'autoTheftPerPop'
autoTheft_corr = correlation_matrix['autoTheft'].drop('autoTheft')  # Drop self-correlation
autoTheftPerPop_corr = correlation_matrix['autoTheftPerPop'].drop('autoTheftPerPop')  # Drop self-correlation

# Sort and take the top 6
top_autoTheft_corr = autoTheft_corr.abs().sort_values(ascending=False).head(6)
top_autoTheftPerPop_corr = autoTheftPerPop_corr.abs().sort_values(ascending=False).head(6)

# Create a df using the results
top_correlations = pd.DataFrame({
    'autoTheft': top_autoTheft_corr,
    'autoTheftPerPop': top_autoTheftPerPop_corr
})

# Display the result
# print(top_correlations)

In [20]:
# Transpose the df
transposed_df = top_correlations.T

transposed_df.columns

Index(['NumUnderPov', 'ViolentCrimesPerPop', 'burglPerPop', 'burglaries', 'murdPerPop', 'murders', 'nonViolPerPop', 'numbUrban', 'population', 'racePctWhite', 'robbbPerPop', 'robberies'], dtype='object')

In [21]:
# # Create a df with the original data but only with the columns of interest
# featured_data = crime_df[['Êcommunityname', 'state', 'countyCode', 
#                           'communityCode','NumUnderPov', 'ViolentCrimesPerPop',
#                           'burglPerPop', 'burglaries', 'murdPerPop', 'murders', 
#                           'nonViolPerPop', 'numbUrban', 'population', 'racePctWhite', 
#                           'robbbPerPop', 'robberies', 'autoTheft','autoTheftPerPop']]

- Only categorical columns are geographic locations, so we can use them but I don't think it will help that much when creating a model.
- There are a lot of numerical columns (some are shown as Object types). We might do binning depending on the distribution of a feature, if there is highly skewed variables, or for categorical representation like age range or area. 

In [95]:
print("Our target variables will be autoTheft and autoTheftPerPop")
print("Our predictor variables will be the different races per capita, race pecentages, median income, maybe murders since it has a moderately positive correlation \nwith auto theft, age, unemployed, employed, number in shelters, etc.")
print("Note: We might remove some features or replace them if we have different or new hyptheses, but we will definitely use race, median income, unemployed as demographic features. \nCrime features that we might use are larcenies, burglaries, and robberies. \nThese crime features might be more useful as predictor variables but our goal to see the probablity of a auto theft happening depending on the value of these crime features.")
print("We can use some stereo types as our hypothesis and see what the outcome is to see if our hypothesis/stereotypes are actually true.")

Our target variables will be autoTheft and autoTheftPerPop
Our predictor variables will be the different races per capita, race pecentages, median income, maybe murders since it has a moderately positive correlation 
with auto theft, age, unemployed, employed, number in shelters, etc.
Note: We might remove some features or replace them if we have different or new hyptheses, but we will definitely use race, median income, unemployed as demographic features. 
Crime features that we might use are larcenies, burglaries, and robberies. 
These crime features might be more useful as predictor variables but our goal to see the probablity of a auto theft happening depending on the value of these crime features.
We can use some stereo types as our hypothesis and see what the outcome is to see if our hypothesis/stereotypes are actually true.
