In [None]:
!pip install plotly

In [None]:
# Importing required libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

warnings.filterwarnings("ignore")

In [None]:
# Reading the dataset.
# The CSV location can be overridden with the SPD_CRIME_CSV environment variable so the
# notebook can run on other machines; without it, the original local path is used.
SPD_File = os.environ.get(
    "SPD_CRIME_CSV",
    r"/Users/payalchavan/Downloads/SPD_Crime_Data__2008-Present.csv",
)
# low_memory=False: infer each column's dtype from the whole file instead of chunk by chunk
SPD_DF = pd.read_csv(SPD_File, low_memory=False)

### TASK 1: Data Cleaning

In [None]:
# Preview the first rows (head() returns 5 by default) to confirm the CSV parsed as expected
SPD_DF.head()

In [None]:
# Shape of the dataset as (number of rows, number of columns)
SPD_DF.shape

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes
SPD_DF.info()

In [None]:
# Number of null/missing values in each column
SPD_DF.isnull().sum()

In [None]:
# Number of rows that are exact duplicates (all columns equal) of an earlier row
SPD_DF.duplicated().sum()

In [None]:
# Copy the dataframe to a new dataframe
#SPD_Crime_Data = SPD_DF.copy()

In [None]:
# Sort the rows based on the 'Report Number' column
#sorted_SPD_df = SPD_Crime_Data.sort_values('Report Number',ascending=True).reset_index(drop=True)
#sorted_SPD_df

In [None]:
# Parse the three timestamp columns as datetimes; values that cannot be parsed become NaT
for datetime_column in ("Offense Start DateTime", "Offense End DateTime", "Report DateTime"):
    SPD_DF[datetime_column] = pd.to_datetime(SPD_DF[datetime_column], errors='coerce')

In [None]:
# Derive hour, day-of-month, month and year columns from the report timestamp
for new_column, dt_component in (
    ('hour_added', 'hour'),
    ('date_added', 'day'),
    ('month_added', 'month'),
    ('year_added', 'year'),
):
    SPD_DF[new_column] = getattr(SPD_DF['Report DateTime'].dt, dt_component)

In [None]:
# Filling missing values.
# Every result is assigned back to its column rather than calling
# `SPD_DF[col].fillna(..., inplace=True)`: an in-place fill on a column selection is a
# chained assignment that newer pandas deprecates and, with Copy-on-Write enabled, leaves
# SPD_DF unchanged. `.ffill()` replaces the deprecated `fillna(method='ffill')`.

# Offense timestamps: carry the previous row's value forward
# (a missing value in the very first row has nothing to copy and stays missing)
for datetime_column in ('Offense Start DateTime', 'Offense End DateTime'):
    SPD_DF[datetime_column] = SPD_DF[datetime_column].ffill()

# Police geography columns: fill with the most frequent value of the column
for geography_column in ('Precinct', 'Sector', 'Beat'):
    SPD_DF[geography_column] = SPD_DF[geography_column].fillna(SPD_DF[geography_column].mode()[0])

# Free-text location columns: mark missing entries explicitly
for text_column in ('MCPP', '100 Block Address'):
    SPD_DF[text_column] = SPD_DF[text_column].fillna('Unknown')

In [None]:
# Title-case the category labels (first letter of every word upper-cased, the rest lower-cased)
category_column = 'Crime Against Category'
SPD_DF[category_column] = SPD_DF[category_column].str.title()

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps above
SPD_DF.info()

In [None]:
# Show five random rows; random_state is fixed so the same rows appear on every re-run
SPD_DF.sample(5, random_state=42)

### TASK 2: Offense Type Distribution Analysis 

In [None]:
# Pie chart of each offense type's share of all reported incidents
offense_totals = SPD_DF['Offense'].value_counts()
offense_totals.plot(kind='pie', figsize=(10, 10), autopct='%1.1f%%')

In [None]:
# Count incidents per offense type.
# The index and the count column are named explicitly because a bare
# `value_counts().reset_index()` produces different column names depending on the pandas
# version ('index'/'Offense' before 2.0, 'Offense'/'count' from 2.0 on), which made the
# original `names`/`values` arguments point at the wrong data.
offense_counts = (
    SPD_DF['Offense']
    .value_counts()
    .rename_axis('Offense')
    .reset_index(name='Count')
)

# Donut chart: slice labels come from the offense names, slice sizes from the counts
fig = px.pie(offense_counts, names='Offense', values='Count', hole=0.4,
             title='Distribution of Offense Types')

fig.update_traces(textposition='inside', textinfo='percent+label')
# Legend hidden for clarity (labels are drawn inside the slices); taller figure for readability
fig.update_layout(showlegend=False, height=800)
fig.show()

Insights: Based on the crime data analysis, Theft Motor Vehicles is the most significant contributor, accounting for 15.9% of reported incidents. Burglary/Breaking & Entering follows closely at 11.3%, indicating a substantial impact on public safety. The noteworthy occurrence of Destruction/Damage/Vandalism of Property suggests a concerning trend, contributing significantly to the overall crime landscape. The combined contribution of all other larcenies at 8.45% and Simple Assault at 7.6% underscores the diverse nature of criminal activities affecting the community. These are the top 5 offenses that have been reported.

### TASK 3: Line Chart Analysis of offense Trends 

In [None]:
# Offense type to analyze; change this value to study a different offense
specific_offense_type = 'Robbery'

# Keep only the rows recorded under the chosen offense type
is_selected_offense = SPD_DF['Offense'].eq(specific_offense_type)
specific_offense_df = SPD_DF.loc[is_selected_offense]

# Number of incidents for every (year, month) pair
incidents_per_month = specific_offense_df.groupby(['year_added', 'month_added']).size()
offense_frequency = incidents_per_month.reset_index(name='Frequency')

# One line per year, months along the x-axis
trend_axis_labels = {'Frequency': 'Number of Incidents', 'month_added': 'Month (1-12)'}
fig = px.line(
    offense_frequency,
    x='month_added',
    y='Frequency',
    color='year_added',
    title=f'Frequency of {specific_offense_type} Over Time',
    labels=trend_axis_labels,
)

fig.show()


Insights: The x-axis represents the month in which incidents were reported, while the y-axis represents the number of incidents. From the above line chart, we can observe that the highest number of 'Robbery' incidents, 189, occurred in Jan 2022, while the lowest, 46, occurred in Dec 2023. We can also observe that the trend of incidents varies from year to year.

### TASK 4: Identifying Safest and Least Safe Localities

In [None]:
# 'MCPP' is taken to be the column representing localities.
# Placeholder labels are not real localities, so they are removed BEFORE ranking:
# '<Null>' is the value the original post-hoc filter removed, and 'Unknown' is the value
# used to fill missing MCPP entries during cleaning. Filtering first guarantees that both
# lists contain exactly three real localities (when at least three exist).
placeholder_localities = ['<Null>', 'Unknown']

locality_crime_counts = (
    SPD_DF.loc[~SPD_DF['MCPP'].isin(placeholder_localities)]
    .groupby('MCPP')['Offense']
    .count()
    .reset_index(name='CrimeFrequency')
)

# Most incidents first
sorted_localities = locality_crime_counts.sort_values(by='CrimeFrequency', ascending=False)

# Fewest incidents (end of the descending sort) -> the three safest localities
top_3_localities = sorted_localities.tail(3)
# Most incidents (start of the descending sort) -> the three least safe localities
least_3_localities = sorted_localities.head(3)

In [None]:
# Side-by-side bar charts: the safest localities on the left, the least safe on the right.
# Each entry pairs a ranking table with the title used for both its panel and its trace.
locality_panels = [
    (top_3_localities, 'Top 3 Safest'),
    (least_3_localities, 'Least 3 Safe'),
]

fig = make_subplots(rows=1, cols=2,
                    subplot_titles=[panel_title for _, panel_title in locality_panels])

for panel_col, (panel_localities, panel_title) in enumerate(locality_panels, start=1):
    fig.add_trace(
        go.Bar(x=panel_localities['MCPP'], y=panel_localities['CrimeFrequency'],
               # bars are colored by their own incident count
               marker_color=panel_localities['CrimeFrequency'], name=panel_title),
        row=1, col=panel_col,
    )
    # Each panel has its own axes (and its own y scale), so label both panels
    fig.update_xaxes(title_text='Locality', row=1, col=panel_col)
    fig.update_yaxes(title_text='Crime Frequency', row=1, col=panel_col)

fig.update_layout(title_text='Top 3 Safest and Least Safe Localities', barmode='group')

# Show the chart
fig.show()

In [None]:
# Rich notebook display of the three safest localities (bare expression instead of print)
top_3_localities

In [None]:
# Rich notebook display of the three least safe localities (bare expression instead of print)
least_3_localities

Insights: From the above bar plots, we can see that the crime frequency is lowest in the locality "Commercial Harbor Island", followed by "Cascade" (the safest localities), and highest in the locality "Downtown Commercial", followed by "Capitol Hill" and "Northgate" (the least safe localities).

### TASK 5: Histogram/Line chart to display the distribution of crime incidents over time

In [None]:
# Histogram of incidents by the hour of day stored in 'hour_added'
hour_axis_labels = {'hour_added': 'Time(in hrs)', 'count': 'Number of Incidents'}
fig = px.histogram(
    SPD_DF,
    x='hour_added',
    labels=hour_axis_labels,
    title='Distribution of Crime Incidents Over Different Times of the Day',
)

# Render the figure
fig.show()

Insights: From the above histogram, we can see that the distribution of crime incidents peaks at hour 13, with a maximum incident count of 74.069K. The fewest incidents are reported at hour 4, with a count of 15.345K. It is evident that fewer crime incidents are reported in the early hours of the day; note that this histogram covers all offense types, not only "Robbery".

### TASK 6: Box Plot for Crime Time Analysis

In [None]:
# Box plot of the hour of day for each 'Crime Against Category', one color per category
box_category_column = 'Crime Against Category'
fig = px.box(
    SPD_DF,
    x=box_category_column,
    y='hour_added',
    color=box_category_column,
    title='Distribution of Times for different Crime Types',
)
# Axis titles are set after creation; each update call returns the figure, so they chain
fig.update_xaxes(title='Crime Types').update_yaxes(title='Time(in hrs)')
fig.show()

In [None]:
# Horizontal box plot: hour of day (x) for each 'Offense Parent Group' (y).
# 'hour_added' is the hour extracted from 'Report DateTime' during cleaning.
# The labels mapping is keyed on the columns actually plotted; the original keyed the
# y label on 'Crime Against Category', so the 'Type of Crime' axis title was never applied.
fig = px.box(SPD_DF, x='hour_added', y='Offense Parent Group',
             title='Distribution of Crime Times by Offense Type',
             labels={'hour_added': 'Hour of the Day', 'Offense Parent Group': 'Type of Crime'})

# Taller figure so every offense group gets a readable row
fig.update_layout(height=1200)
# Show the chart
fig.show()

Insights: The box plot provides us with some intriguing insights. Gambling is not a daytime activity, and it is most prevalent at the 19th hour. Purchasing prostitution occurs in a small time range, usually at night. In contrast, liquor law violations, drunkenness, aggravated assault, and driving under the influence can occur at any time of day. Alcohol appears to be a common factor in crimes that occur within a large time window. Most of the data points fall within the 5-20 hour time window, making it the most likely time period to observe a criminal offense.