# Exploratory Data Analysis of Los Angeles Crime Data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# import libraries and upload a dataset to a dataframe

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
import requests
import geopandas
import plotly.express as px
import folium
import matplotlib.pyplot as plt
import seaborn as sns
import re
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from IPython.display import display

In [None]:
# Read the crime CSV; on_bad_lines='skip' makes pandas skip malformed rows instead of raising
df = pd.read_csv('/content/Crime_Data_from_2020_to_Present.csv', on_bad_lines='skip')

# Display the dataframe to verify
print(df)

In [None]:
# Display the first 5 records
df.head()

In [None]:
# Display the last 5 records of the Dataframe
df.tail()

In [None]:
# shape of the data
df.shape

In [None]:
# data information
df.info()

In [None]:
# descriptive statistics
df.describe()

In [None]:
# list of columns
df.columns.tolist()

In [None]:
# check for missing values:
df.isnull().sum()

In [None]:
# number of distinct values per column (nunique does not detect duplicate rows)
df.nunique()

# Data Cleaning

In [None]:
# Drop columns with all null values
df_cleaned = df.dropna(axis=1, how='all')
df_cleaned.shape

In [None]:
# Drop rows with all null values

df_cleaned = df_cleaned.dropna(how='all', axis=0)
df_cleaned.shape

In [None]:
# Minimum number of non-null values a column needs in order to be kept:
# a column with more than 400,000 NaN values falls below this count.
threshold = df_cleaned.shape[0] - 400000

# Drop columns with more than 400,000 NaN values.
# Start from df_cleaned (not the raw df) so the all-null rows and columns
# removed in the previous cells stay removed.
df_cleaned = df_cleaned.dropna(axis=1, thresh=threshold)
df_cleaned.shape

In [None]:
# Column names that appear in only one of the two frames (raw vs. cleaned)

difference = set(df.columns) ^ set(df_cleaned.columns)
difference

In [None]:
df_cleaned.info()

In [None]:
# Per-column count of missing values in the cleaned frame
print(df_cleaned.isnull().sum())

In [None]:
df_cleaned.isnull().sum()

# Charts

In [None]:
# Number of crime records per AREA NAME
crime_counts = df_cleaned['AREA NAME'].value_counts()

# Draw the bar chart through the Matplotlib Axes API
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(crime_counts.index, crime_counts, color='pink')
ax.set_title('Crimes Count by AREA NAME')
ax.set_xlabel('AREA NAME')
ax.set_ylabel('Crime Count')

# Tilt the X-axis labels
plt.setp(ax.get_xticklabels(), rotation=25)

plt.show()

In [None]:
# Frequency of each crime description, limited to the 25 most common
crime_counts = df_cleaned['Crm Cd Desc'].value_counts().head(25)

# Draw the bar chart through the Matplotlib Axes API
fig, ax = plt.subplots(figsize=(25, 6))
ax.bar(crime_counts.index, crime_counts, color='lightgreen')
ax.set_title('Crime Counts of Top 25 Crm Cd Desc')
ax.set_xlabel('Crm Cd Desc (Defines the Crime Code provided)')
ax.set_ylabel('Crime Count')

# Vertical x-axis labels so the long descriptions stay readable
plt.setp(ax.get_xticklabels(), rotation=90)

# Show the plot
plt.show()

In [None]:
# Descent Code: A - Other Asian B - Black C - Chinese D - Cambodian F - Filipino G - Guamanian
# H - Hispanic/Latin/Mexican I - American Indian/Alaskan Native J - Japanese
# K - Korean L - Laotian O - Other P - Pacific Islander S - Samoan U - Hawaiian
# V - Vietnamese W - White X - Unknown Z - Asian Indian

# Number of crime records per victim descent code
crime_counts = df_cleaned['Vict Descent'].value_counts()

# Draw the bar chart through the Matplotlib Axes API
fig, ax = plt.subplots(figsize=(16, 6))
ax.bar(crime_counts.index, crime_counts, color='skyblue')
ax.set_title('Crime Count by Vict Descent')
ax.set_xlabel('Vict Descent')
ax.set_ylabel('Crime Count')

# Tilt the X-axis labels
plt.setp(ax.get_xticklabels(), rotation=25)

plt.show()

In [None]:
# Count plots of the most frequent values of each categorical column
sns.set_style("darkgrid")

# Categorical columns to plot
categorical_columns = ['AREA NAME', 'Vict Sex', 'Vict Descent', 'Status Desc']

# Two plots per row; size the grid to the number of columns plotted so the
# figure has no empty rows (ceiling division).
n_cols = 2
n_rows = (len(categorical_columns) + n_cols - 1) // n_cols

plt.figure(figsize=(18, len(categorical_columns) * 3))
for idx, feature in enumerate(categorical_columns, 1):
    plt.subplot(n_rows, n_cols, idx)

    # The ten most frequent values of this column, most frequent first
    top_categories = df_cleaned[feature].value_counts().head(10).index

    # Rows whose value is one of those top categories
    filtered_data = df_cleaned[df_cleaned[feature].isin(top_categories)]

    # Horizontal count plot, ordered by frequency
    sns.countplot(y=filtered_data[feature], order=top_categories)
    plt.title(f"Countplot of {feature}")


# Figure-level title: plt.title here would overwrite the title of the last subplot
plt.suptitle('Bar Chart of Categorical Columns')
plt.tight_layout()
plt.show()

In [None]:
df_cleaned.head()

In [None]:
# Columns needed for the spatial analysis. Take an explicit copy so the
# assignment below modifies data_s itself rather than a slice of df_cleaned
# (avoids pandas' chained-assignment / SettingWithCopy warning).
data_s = df_cleaned[['AREA NAME','LAT','LON','DR_NO','Crm Cd Desc','Vict Sex','Status Desc','Premis Desc']].copy()

# Normalise area names to upper case
data_s['AREA NAME'] = data_s['AREA NAME'].str.upper()

data_s.head()

In [None]:
data = data_s.copy()

# Show the per-column null counts before dropping; a bare expression in the
# middle of a cell is evaluated and discarded, so it has to be printed.
print(data.isnull().sum())

# Remove every row that has a null in any of the selected columns
data.dropna(inplace=True)

In [None]:
data.isnull().sum()

In [None]:
# NOTE(review): thresh=0 requires at least zero non-null values per row, so this
# call drops nothing; rows containing nulls were already removed by the earlier dropna().
data.dropna(thresh=0, inplace=True)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data['AREA NAME'].unique()

In [None]:
def rename_area_name(x):
    """Return the long form of an abbreviated area name.

    'N HOLLYWOOD' becomes 'NORTH HOLLYWOOD' and 'WEST LA' becomes
    'WEST LOS ANGELES'; any other value is returned unchanged.
    """
    replacements = (
        ('N HOLLYWOOD', 'NORTH HOLLYWOOD'),
        ('WEST LA', 'WEST LOS ANGELES'),
    )
    for short_name, long_name in replacements:
        if x == short_name:
            return long_name
    return x

# Replace each 'AREA NAME' value with the result of rename_area_name
data['AREA NAME'] = data['AREA NAME'].map(rename_area_name)

In [None]:
# Number of crime records sharing the same (AREA NAME, LAT, LON); transform('count')
# broadcasts that group count back onto every row of the group
data['population_crime_by_lat_lon'] = data.groupby(['AREA NAME','LAT','LON'])['LAT'].transform('count')
data.shape
# data.groupby(['AREA NAME','LAT','LON'])['LAT'].count()

In [None]:
# Number of crime records per AREA NAME, broadcast onto every row of that area
data['population_crime'] = data.groupby(['AREA NAME'])['AREA NAME'].transform('count')
data.shape

In [None]:
data.head()

In [None]:
data[data['AREA NAME']=='CENTRAL']

In [None]:
data[data['AREA NAME']=='CENTRAL']['population_crime_by_lat_lon'].sum()

In [None]:
# Top 10 areas (districts) by number of crime records.
# Count rows per area directly: 'population_crime_by_lat_lon' already holds,
# on every row, the number of records at that row's location, so summing it
# per area adds each location's count once per record (a sum of squares)
# and overstates the total.
population_grouped = (
    data.groupby('AREA NAME')
    .size()
    .reset_index(name='total crime count')
    .sort_values(by='total crime count', ascending=False)
)
population_grouped.head(10)

In [None]:
# Names of the ten areas at the top of the sorted totals
top_10_areaNames = population_grouped['AREA NAME'].head(10).unique().tolist()
top_10_areaNames

In [None]:
top_10_crimes_df = data[data['AREA NAME'].isin(top_10_areaNames)]
top_10_crimes_df.shape

In [None]:
top_10_crimes_df.head()

In [None]:
# Load the station boundaries from the GeoJSON file
station_boundaries = gpd.read_file("/content/Station_Boundaries.geojson")
print(station_boundaries.shape)

# Alias the per-area totals computed earlier (same object, not a copy)
population_data = population_grouped
print(population_data.shape)

def extract_between_lapd_division(text):
    """Return the text found between 'LAPD' and 'Division' in ``text``.

    The surrounding whitespace is not part of the result. Returns None when
    ``text`` does not contain the 'LAPD <name> Division' pattern.
    """
    found = re.search(r'LAPD\s+(.*?)\s+Division', text)
    return found.group(1) if found else None

# Keep only the LAPD rows, then derive an upper-case area name from 'OMEGA_LABEL'
station_boundaries = station_boundaries[station_boundaries['S_TYPE']=='LAPD'].reset_index(drop=True)

# extract_between_lapd_division returns None for labels that do not match the
# pattern; the .str accessor leaves those as missing values instead of raising
# AttributeError the way calling x.upper() on None would.
station_boundaries['AREA NAME_Extracted'] = (
    station_boundaries['OMEGA_LABEL'].apply(extract_between_lapd_division).str.upper()
)
print(station_boundaries.shape)

In [None]:
# Left-join each crime record of the top-10 areas to its station boundary row
merged = top_10_crimes_df.merge(
    station_boundaries,
    how='left',
    left_on='AREA NAME',
    right_on='AREA NAME_Extracted',
)
print(merged.shape)

In [None]:
merged.head()

In [None]:
merged.nunique()

In [None]:
merged

In [None]:
# Columns carried forward into the mapping steps
lapd_columns = [
    'AREA NAME',
    'DR_NO',
    'LAT',
    'LON',
    'population_crime_by_lat_lon',
    'population_crime',
    'Crm Cd Desc',
    'Vict Sex',
    'AREA NAME_Extracted',
    'geometry',
]
lapd_data = merged[lapd_columns]
lapd_data

In [None]:
lapd_data = lapd_data.drop_duplicates(subset=['DR_NO']).copy()
lapd_data.shape

In [None]:
lapd_data.reset_index(drop=True, inplace=True)

In [None]:
lapd_data

In [None]:
lapd_data.info()

In [None]:
print(lapd_data.isnull().sum())


In [None]:
# Boolean mask: True for rows with at least one missing value
has_missing = lapd_data.isnull().any(axis=1)
rows_with_nan = lapd_data.loc[has_missing]

# Show those rows
rows_with_nan

In [None]:
rows_with_nan.nunique()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium import plugins

In [None]:
geo_data_df = lapd_data[['AREA NAME_Extracted','geometry']].drop_duplicates().reset_index(drop=True)

In [None]:
lapd_gdf = gpd.GeoDataFrame(lapd_data, geometry='geometry')
lapd_gdf.head()

In [None]:
# Correlation heatmap of the numeric columns (Seaborn)
plt.figure(figsize=(15, 10))

# DR_NO is a record identifier (it is the de-duplication key elsewhere in this
# notebook), not a date. Converting it with pd.to_datetime would overwrite the
# column in df_cleaned with meaningless timestamps, so it is simply left out
# of the correlation matrix instead.
numeric_df = df_cleaned.select_dtypes(include=[np.number]).drop(columns=['DR_NO'], errors='ignore')

sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='Pastel2', linewidths=2)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# lapd_gdf holds one row per crime record, so using it directly would redraw
# every division polygon (and add one marker) once per crime. Reduce it to a
# single row per area, and leave out rows whose boundary is missing (the
# boundaries were attached with a left merge, so unmatched areas have none).
area_gdf = (
    lapd_gdf[lapd_gdf.geometry.notna()]
    .drop_duplicates(subset=['AREA NAME'])
    .reset_index(drop=True)
)
area_centroids = area_gdf.geometry.centroid

# Plot the choropleth map
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
area_gdf.plot(column='population_crime', cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

# Label each area once, at the centroid of its polygon
for area_name, centroid in zip(area_gdf['AREA NAME'], area_centroids):
    ax.annotate(area_name, (centroid.x, centroid.y), ha='center', fontsize=8, color='black', bbox=dict(facecolor='white', alpha=0.5))

ax.set_title('Population Choropleth Map')
ax.set_axis_off()
plt.show()

# Create a base map
m = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

# Add heatmap: one point per area, weighted by its share of the largest
# per-area crime count (replaces stacking one identical point per crime)
max_count = area_gdf['population_crime'].max()
heat_data = [
    [point.y, point.x, float(count) / float(max_count)]
    for point, count in zip(area_centroids, area_gdf['population_crime'])
]
plugins.HeatMap(heat_data).add_to(m)

# Add one marker per area
for (idx, row), point in zip(area_gdf.iterrows(), area_centroids):
    folium.Marker(
        [point.y, point.x],
        popup=f'Area Name: {row["AREA NAME"]} <br> Population: {row["population_crime"]}'
    ).add_to(m)

m.save('LA_population_map.html')  # Save to HTML file


In [None]:
# Centroid of each row's polygon. The vectorised accessor tolerates rows with
# no geometry, where calling .centroid on None inside apply() would raise.
lapd_gdf['centroid'] = lapd_gdf.geometry.centroid

In [None]:
lapd_gdf.geometry.type

In [None]:
import folium
from folium import plugins
import geopandas as gpd

# Centroid of each row's polygon
lapd_gdf['centroid'] = lapd_gdf.geometry.centroid

# Base map centered on Los Angeles. Named crime_map so the builtin `map`
# is not shadowed for the rest of the notebook session.
crime_map = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

# One heat point per row; rows without a usable centroid (missing or empty
# geometry) are skipped instead of raising on point.y
heat_data = [
    [point.y, point.x]
    for point in lapd_gdf['centroid']
    if point is not None and not point.is_empty
]

# Add heatmap to the base map
plugins.HeatMap(heat_data).add_to(crime_map)

# Display the map
crime_map

In [None]:
def create_heatmap(area_data, area_name):
    """Build a Folium heat map from the 'LAT'/'LON' columns of ``area_data``.

    Args:
        area_data: DataFrame with 'LAT' and 'LON' columns, one row per point.
        area_name: Not used by this version; accepted so callers can pass it.

    Returns:
        The folium.Map with a HeatMap layer added.
    """
    # Base map centered on Los Angeles
    map_ = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

    # [lat, lon] pairs taken column-wise instead of iterating row by row
    heat_data = area_data[['LAT', 'LON']].values.tolist()

    plugins.HeatMap(heat_data).add_to(map_)

    return map_

# Render one heat map per area, limited to the first ten distinct area names
unique_areas = lapd_data['AREA NAME'].drop_duplicates().to_numpy()[:10]

for area in unique_areas:
    area_data = lapd_data.loc[lapd_data['AREA NAME'].eq(area)]
    map_ = create_heatmap(area_data, area)
    display(map_)

In [None]:
def create_heatmap(area_data, area_name):
    """Build a Folium heat map for one area and save it as an HTML file.

    Args:
        area_data: DataFrame with 'LAT' and 'LON' columns, one row per point.
        area_name: Area name; used (spaces replaced by underscores) to build
            the output file name.

    Returns:
        The name of the HTML file the map was saved to.
    """
    # Base map centered on Los Angeles
    map_ = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

    # [lat, lon] pairs taken column-wise instead of iterating row by row
    heat_data = area_data[['LAT', 'LON']].values.tolist()
    plugins.HeatMap(heat_data).add_to(map_)

    # Save the map to an HTML file
    file_name = f"{area_name.replace(' ', '_')}_heatmap.html"
    map_.save(file_name)
    return file_name

# Save one heat map per area, limited to the first ten distinct area names
unique_areas = lapd_data['AREA NAME'].drop_duplicates().to_numpy()[:10]
html_files = []
for area in unique_areas:
    area_data = lapd_data.loc[lapd_data['AREA NAME'].eq(area)]
    html_file = create_heatmap(area_data, area)
    html_files.append(html_file)

# Page that embeds every saved heat map in an iframe
page_head = """
<!DOCTYPE html>
<html>
<head>
    <title>Folium Heatmaps</title>
    <style>
        .map-container {
            display: flex;
            flex-wrap: wrap;
        }
        .map {
            width: 45%;
            height: 400px;
            margin: 10px;
        }
    </style>
</head>
<body>
    <h1>Folium Heatmaps of Los Angeles Crime Data</h1>
    <div class="map-container">
"""

page_tail = """
    </div>
</body>
</html>
"""

iframes = "".join(f'<iframe src="{html_file}" class="map"></iframe>' for html_file in html_files)
html_content = page_head + iframes + page_tail

# Write the combined page
combined_file_path = "/content/combined_heatmaps.html"
with open(combined_file_path, "w") as file:
    file.write(html_content)

# Show where the combined page was written
combined_file_path

In [None]:
lapd_data.groupby

In [None]:


def create_heatmap(area_data, area_name):
    """Build a titled Folium heat map for one area and save it as HTML.

    Args:
        area_data: DataFrame with 'LAT' and 'LON' columns, one row per point.
        area_name: Area name; shown as a text label on the map and used
            (spaces replaced by underscores) to build the output file name.

    Returns:
        The name of the HTML file the map was saved to.
    """
    # Base map centered on Los Angeles
    map_ = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

    # [lat, lon] pairs taken column-wise instead of iterating row by row
    heat_data = area_data[['LAT', 'LON']].values.tolist()
    plugins.HeatMap(heat_data).add_to(map_)

    # The title is drawn as a text-only marker placed at the map center
    folium.Marker(
        location=[34.05, -118.25],
        icon=folium.DivIcon(html=f'<div style="font-size: 20pt">{area_name}</div>')
    ).add_to(map_)

    # Save the map to an HTML file
    file_name = f"{area_name.replace(' ', '_')}_heatmap.html"
    map_.save(file_name)
    return file_name

# Save one heat map per area, limited to the first ten distinct area names
unique_areas = lapd_data['AREA NAME'].drop_duplicates().to_numpy()[:10]
html_files = []
for area in unique_areas:
    area_data = lapd_data.loc[lapd_data['AREA NAME'].eq(area)]
    html_file = create_heatmap(area_data, area)
    html_files.append(html_file)

# Page that embeds every saved heat map in an iframe
page_head = """
<!DOCTYPE html>
<html>
<head>
    <title>Folium Heatmaps</title>
    <style>
        .map-container {
            display: flex;
            flex-wrap: wrap;
        }
        .map {
            width: 45%;
            height: 400px;
            margin: 10px;
        }
    </style>
</head>
<body>
    <h1>Folium Heatmaps of Los Angeles Crime Data</h1>
    <div class="map-container">
"""

page_tail = """
    </div>
</body>
</html>
"""

iframes = "".join(f'<iframe src="{html_file}" class="map"></iframe>' for html_file in html_files)
html_content = page_head + iframes + page_tail

# Write the combined page
combined_file_path = "/content/combined_heatmaps.html"
with open(combined_file_path, "w") as file:
    file.write(html_content)

# Show where the combined page was written
combined_file_path

In [None]:
def create_heatmap(area_data, area_name):
    """Build a Folium heat map with one marker per crime and save it as HTML.

    Args:
        area_data: DataFrame with 'LAT', 'LON' and 'Crm Cd Desc' columns,
            one row per crime.
        area_name: Area name; shown as a text label on the map and used
            (spaces replaced by underscores) to build the output file name.

    Returns:
        The name of the HTML file the map was saved to.
    """
    # Base map centered on Los Angeles
    map_ = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

    # [lat, lon] pairs taken column-wise instead of iterating row by row
    heat_data = area_data[['LAT', 'LON']].values.tolist()
    plugins.HeatMap(heat_data).add_to(map_)

    # One marker per crime, with the description and coordinates in the popup.
    # NOTE(review): a marker per row can make the saved HTML very large for
    # areas with many records.
    for crime, lat, lon in zip(area_data['Crm Cd Desc'], area_data['LAT'], area_data['LON']):
        popup_text = f"Crime: {crime}<br>Latitude: {lat}<br>Longitude: {lon}"
        folium.Marker(
            location=[lat, lon],
            popup=popup_text
        ).add_to(map_)

    # The title is drawn as a text-only marker placed at the map center
    folium.Marker(
        location=[34.05, -118.25],
        icon=folium.DivIcon(html=f'<div style="font-size: 20pt">{area_name}</div>')
    ).add_to(map_)

    # Save the map to an HTML file
    file_name = f"{area_name.replace(' ', '_')}_heatmap.html"
    map_.save(file_name)
    return file_name

# Save one heat map per area, limited to the first ten distinct area names
unique_areas = lapd_data['AREA NAME'].drop_duplicates().to_numpy()[:10]
html_files = []
for area in unique_areas:
    area_data = lapd_data.loc[lapd_data['AREA NAME'].eq(area)]
    html_file = create_heatmap(area_data, area)
    html_files.append(html_file)

# Page that embeds every saved heat map in an iframe
page_head = """
<!DOCTYPE html>
<html>
<head>
    <title>Folium Heatmaps</title>
    <style>
        .map-container {
            display: flex;
            flex-wrap: wrap;
        }
        .map {
            width: 45%;
            height: 400px;
            margin: 10px;
        }
    </style>
</head>
<body>
    <h1>Folium Heatmaps of Los Angeles Crime Data</h1>
    <div class="map-container">
"""

page_tail = """
    </div>
</body>
</html>
"""

iframes = "".join(f'<iframe src="{html_file}" class="map"></iframe>' for html_file in html_files)
html_content = page_head + iframes + page_tail

# Write the combined page
combined_file_path = "/content/combined_heatmaps.html"
with open(combined_file_path, "w") as file:
    file.write(html_content)

# Show where the combined page was written
combined_file_path

In [None]:
def create_heatmap(area_data, area_name):
    """Build a Folium heat map with one marker per crime and save it as HTML.

    Args:
        area_data: DataFrame with 'LAT', 'LON', 'Crm Cd Desc' and
            'population_crime_by_lat_lon' columns, one row per crime.
        area_name: Area name; shown as a text label on the map and used
            (spaces replaced by underscores) to build the output file name.

    Returns:
        The name of the HTML file the map was saved to.
    """
    # Base map centered on Los Angeles
    map_ = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

    # [lat, lon] pairs taken column-wise instead of iterating row by row
    heat_data = area_data[['LAT', 'LON']].values.tolist()
    plugins.HeatMap(heat_data).add_to(map_)

    # One marker per crime; the popup also shows how many records share the location.
    # NOTE(review): a marker per row can make the saved HTML very large for
    # areas with many records.
    marker_fields = zip(
        area_data['Crm Cd Desc'],
        area_data['LAT'],
        area_data['LON'],
        area_data['population_crime_by_lat_lon'],
    )
    for crime, lat, lon, location_count in marker_fields:
        popup_text = f"Crime: {crime}<br>Latitude: {lat}<br>Longitude: {lon}<br>Population Crime: {location_count}"
        folium.Marker(
            location=[lat, lon],
            popup=popup_text
        ).add_to(map_)

    # The title is drawn as a text-only marker placed at the map center
    folium.Marker(
        location=[34.05, -118.25],
        icon=folium.DivIcon(html=f'<div style="font-size: 20pt">{area_name}</div>')
    ).add_to(map_)

    # Save the map to an HTML file
    file_name = f"{area_name.replace(' ', '_')}_crime_heatmap.html"
    map_.save(file_name)
    return file_name

# Save one heat map per area, limited to the first ten distinct area names
unique_areas = lapd_data['AREA NAME'].drop_duplicates().to_numpy()[:10]
html_files = []
for area in unique_areas:
    area_data = lapd_data.loc[lapd_data['AREA NAME'].eq(area)]
    html_file = create_heatmap(area_data, area)
    html_files.append(html_file)

# Page that embeds every saved heat map in an iframe
page_head = """
<!DOCTYPE html>
<html>
<head>
    <title>Folium Heatmaps</title>
    <style>
        .map-container {
            display: flex;
            flex-wrap: wrap;
        }
        .map {
            width: 45%;
            height: 400px;
            margin: 10px;
        }
    </style>
</head>
<body>
    <h1>Folium Heatmaps of Los Angeles Crime Data</h1>
    <div class="map-container">
"""

page_tail = """
    </div>
</body>
</html>
"""

iframes = "".join(f'<iframe src="{html_file}" class="map"></iframe>' for html_file in html_files)
html_content = page_head + iframes + page_tail

# Write the combined page
combined_file_path = "/content/la_combined_heatmaps.html"
with open(combined_file_path, "w") as file:
    file.write(html_content)

# Show where the combined page was written
combined_file_path

In [None]:
def get_top_10_crimes_by_area(data, top_n=1):
    """Keep only the rows belonging to each area's most frequent crime types.

    Despite the historical name, the default keeps a single crime type per
    area (the most frequent one); pass ``top_n`` to keep more.

    Args:
        data: DataFrame with 'AREA NAME' and 'Crm Cd Desc' columns.
        top_n: Number of most frequent 'Crm Cd Desc' values to keep per
            'AREA NAME'. Defaults to 1.

    Returns:
        The rows of ``data`` whose ('AREA NAME', 'Crm Cd Desc') pair is among
        the ``top_n`` most frequent pairs of that area.
    """
    counts = data.groupby(['AREA NAME', 'Crm Cd Desc']).size().reset_index(name='counts')

    # Stable descending sort + head() picks the top rows per area without
    # groupby.apply; ties keep their original (alphabetical) order, and
    # sort_index() restores the area-by-area ordering afterwards.
    top_crimes = (
        counts.sort_values('counts', ascending=False, kind='stable')
        .groupby('AREA NAME')
        .head(top_n)
        .sort_index()
    )
    top_crimes = top_crimes[['AREA NAME', 'Crm Cd Desc']]
    return pd.merge(data, top_crimes, on=['AREA NAME', 'Crm Cd Desc'], how='inner')

# Keep only the records of the single most frequent crime type in each area
filtered_data = get_top_10_crimes_by_area(lapd_data)

def create_heatmap(area_data, area_name):
    """Build a Folium heat map with one marker per crime and save it as HTML.

    Args:
        area_data: DataFrame with 'LAT', 'LON', 'Crm Cd Desc' and
            'population_crime_by_lat_lon' columns, one row per crime.
        area_name: Area name; shown as a text label on the map and used
            (spaces replaced by underscores) to build the output file name.

    Returns:
        The name of the HTML file the map was saved to.
    """
    # Base map centered on Los Angeles
    map_ = folium.Map(location=[34.05, -118.25], tiles="Cartodb dark_matter", zoom_start=10)

    # [lat, lon] pairs taken column-wise instead of iterating row by row
    heat_data = area_data[['LAT', 'LON']].values.tolist()
    plugins.HeatMap(heat_data).add_to(map_)

    # One marker per crime; the popup also shows how many records share the location.
    # NOTE(review): a marker per row can make the saved HTML very large for
    # areas with many records.
    marker_fields = zip(
        area_data['Crm Cd Desc'],
        area_data['LAT'],
        area_data['LON'],
        area_data['population_crime_by_lat_lon'],
    )
    for crime, lat, lon, location_count in marker_fields:
        popup_text = f"Crime: {crime}<br>Latitude: {lat}<br>Longitude: {lon}<br>Population Crime: {location_count}"
        folium.Marker(
            location=[lat, lon],
            popup=popup_text
        ).add_to(map_)

    # The title is drawn as a text-only marker placed at the map center
    folium.Marker(
        location=[34.05, -118.25],
        icon=folium.DivIcon(html=f'<div style="font-size: 20pt">{area_name}</div>')
    ).add_to(map_)

    # Save the map to an HTML file
    file_name = f"{area_name.replace(' ', '_')}_heatmap_TOP1.html"
    map_.save(file_name)
    return file_name

# Save one heat map per area, limited to the first ten distinct area names
unique_areas = filtered_data['AREA NAME'].drop_duplicates().to_numpy()[:10]
html_files = []
for area in unique_areas:
    area_data = filtered_data.loc[filtered_data['AREA NAME'].eq(area)]
    html_file = create_heatmap(area_data, area)
    html_files.append(html_file)

# Page that embeds every saved heat map in an iframe
page_head = """
<!DOCTYPE html>
<html>
<head>
    <title>Folium Heatmaps</title>
    <style>
        .map-container {
            display: flex;
            flex-wrap: wrap;
        }
        .map {
            width: 45%;
            height: 400px;
            margin: 10px;
        }
    </style>
</head>
<body>
    <h1>Folium Heatmaps of Los Angeles Crime Data</h1>
    <div class="map-container">
"""

page_tail = """
    </div>
</body>
</html>
"""

iframes = "".join(f'<iframe src="{html_file}" class="map"></iframe>' for html_file in html_files)
html_content = page_head + iframes + page_tail

# Write the combined page
combined_file_path = "/content/combined_heatmaps_TOP1.html"
with open(combined_file_path, "w") as file:
    file.write(html_content)

# Show where the combined page was written
combined_file_path

In [None]:
filtered_data[filtered_data['AREA NAME'] =='CENTRAL'].groupby('Crm Cd Desc').size()