<a href="https://colab.research.google.com/github/SSubhashReddy/Assignment-2/blob/main/Copy_of_Sample_EDA_Submission_Template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np
from google.colab import drive # Import drive

# Step 1: Load the dataset
# The data is an .xlsx workbook stored on Google Drive, so the drive is mounted
# first and the file is read with pd.read_excel (not pd.read_csv).
from google.colab import drive

drive.mount('/content/drive')

DATA_PATH = '/content/drive/MyDrive/Train.xlsx'

# df stays None when loading fails, so the "df is not None" checks below (and in
# later chart cells) reliably tell whether data is available.
df = None
try:
    # No parse_dates/index_col here: the raw columns are inspected first.
    df = pd.read_excel(DATA_PATH)
except FileNotFoundError:
    print("Error: The file 'Train.xlsx' was not found in your Google Drive at the specified path.")
    print("Please verify the file path and ensure the file exists and is correctly named.")
except Exception as e:
    # Only the read itself is guarded here, so this message is accurate.
    print(f"An unexpected error occurred during file loading: {e}")
else:
    print("File loaded successfully. Columns in the dataframe:")
    print(df.columns)  # Print columns to help identify the correct names

# --- ARIMA configuration ---
year_column_name = 'YEAR'  # Column holding the numeric year of each incident
# Placeholder label only: no column of this name is read below, because the
# series is built by counting rows per year.
target_column_name_for_arima = 'Incidents'
target_column = f'Total Incidents per {year_column_name}'  # Display name for plots
forecast_steps = 5
arima_order = (1, 1, 1)  # (p, d, q) - change if needed
# Heuristic minimum: p + d + q + forecast horizon, plus a buffer of 2.
min_obs_required = sum(arima_order) + forecast_steps + 2

# Step 2: Build the yearly incident-count series
series = None
if df is not None:
    if year_column_name in df.columns:
        try:
            print(f"\nAggregating data by '{year_column_name}' to create a time series for forecasting...")
            # Coerce to whole-number years so float-typed or partially missing
            # YEAR values do not break the '%Y' parsing below.
            years = pd.to_numeric(df[year_column_name], errors='coerce').dropna().astype(int)
            series = years.value_counts().sort_index()
            series.index = pd.to_datetime(series.index.astype(str), format='%Y')

            print(f"\nTime Series Data for ARIMA (first 5 rows):")
            print(series.head())
            print(f"\nTime Series Data for ARIMA (last 5 rows):")
            print(series.tail())

            # Step 3: Plot the original time series
            plt.figure(figsize=(10, 4))
            plt.plot(series, label=target_column)
            plt.title(f'{target_column} Over Time')
            plt.xlabel('Year')
            plt.ylabel('Incidents')
            plt.grid(True)
            plt.legend()
            plt.show()
        except Exception as e:
            # Keep the notebook running, but do not model a half-built series.
            series = None
            print(f"\nAn error occurred while preparing or plotting the yearly time series: {e}")
    else:
        print(f"Error: Time series column '{year_column_name}' not found in the dataset.")

# Steps 4-7: Fit ARIMA, forecast, plot and evaluate
if series is not None:
    if len(series) >= min_obs_required:
        try:
            # Step 4: Train ARIMA model on the full series
            model = ARIMA(series, order=arima_order)
            model_fit = model.fit()

            # Step 5: Forecast future values
            forecast = model_fit.forecast(steps=forecast_steps)

            # Step 6: Plot forecast
            # Reuse the index frequency when available, otherwise infer it.
            freq = series.index.freq
            if freq is None:
                freq = pd.infer_freq(series.index)
            if freq is None:
                # 'YS' (year start) replaces the deprecated 'AS' alias.
                print("Could not infer frequency from time series index. Defaulting to 'YS' (Year Start).")
                freq = 'YS'

            # periods = steps + 1 and dropping the first entry yields exactly
            # forecast_steps dates strictly after the last observation.
            future_dates = pd.date_range(start=series.index[-1], periods=forecast_steps + 1, freq=freq)[1:]

            plt.figure(figsize=(10, 4))
            plt.plot(series, label='Actual')
            plt.plot(future_dates, np.asarray(forecast), color='red', label='Forecast')
            plt.title(f'{target_column} Forecast (Next {forecast_steps} Years)')
            plt.xlabel('Year')
            plt.ylabel('Incidents')
            plt.legend()
            plt.grid(True)
            plt.show()

            # Step 7: Evaluate with a hold-out split (last forecast_steps points).
            # The minimum-length check above guarantees a non-empty training set.
            train = series.iloc[:-forecast_steps]
            test = series.iloc[-forecast_steps:]
            model_eval = ARIMA(train, order=arima_order).fit()
            preds = model_eval.forecast(steps=forecast_steps)
            rmse = np.sqrt(mean_squared_error(test, preds))
            print(f"RMSE on Test Set: {rmse:.2f}")
        except Exception as e:
            print(f"\nAn error occurred during ARIMA modeling or forecasting: {e}")
            print("Please check your time series data and ARIMA order.")
    else:
        print(f"Not enough data ({len(series)} observations) to train ARIMA model with order (1,1,1) and forecast {forecast_steps} steps. Need at least {min_obs_required} observations.")

# Continue with the EDA visualizations only if the dataframe was loaded successfully
if df is not None:
    print("\nContinuing with EDA visualizations...")
else:
    print("\nDataframe was not loaded. Skipping further processing and visualizations.")

# **Project Name**    -



##### **Project Type**    - EDA/Regression/Classification/Unsupervised
##### **Contribution**    - Individual
##### **Team Member 1 -** S.Venkata Subhash Reddy
##### **Team Member 2 -**
##### **Team Member 3 -**
##### **Team Member 4 -**

# **Project Summary -**

The FBI Time Series Forecasting project is a data-driven initiative aimed at analyzing historical crime data to predict future crime trends across the United States. Crime forecasting has become an essential tool for law enforcement agencies and policymakers, enabling them to proactively allocate resources, plan interventions, and improve public safety. With the increasing availability of structured crime datasets released by the Federal Bureau of Investigation (FBI), data scientists can now employ advanced analytical methods to uncover temporal crime patterns and build predictive models that inform strategic decisions.

Time series forecasting involves examining sequential data points—such as monthly or yearly crime counts—collected over time to identify trends, seasonal effects, and other latent patterns. By applying statistical and machine learning models like ARIMA, SARIMA, Prophet, and Long Short-Term Memory (LSTM) neural networks, forecasters aim to generate accurate predictions of future crime incidents. These models consider past occurrences and adjust for seasonality, socio-economic factors, and geographical variations, offering insights into how crimes might fluctuate over time.

The primary dataset for this analysis is derived from the FBI’s Uniform Crime Reporting (UCR) program, which aggregates crime statistics reported by local and state agencies. Common crime categories include property crimes (e.g., burglary, larceny-theft), violent crimes (e.g., homicide, robbery, aggravated assault), and other offenses. This structured data enables the application of exploratory data analysis (EDA) techniques to detect correlations, anomalies, and shifts in criminal behavior over time.

The importance of FBI time series forecasting extends beyond academia or data science communities. Accurate crime forecasting can inform city planning, budget allocation, and law enforcement operations. For example, identifying a projected spike in certain crime types during specific months can help police departments enhance patrol strategies or community outreach programs.

Moreover, this project encourages a data-informed culture in public policy, where decisions are grounded in empirical evidence rather than assumptions. It also provides a framework for testing the effectiveness of crime-prevention measures over time by comparing forecasted trends with actual outcomes post-intervention.

In summary, FBI time series forecasting is a vital interdisciplinary approach that merges criminology, data science, and public administration. It leverages historical crime data and advanced forecasting models to anticipate future incidents, thus supporting smarter decision-making and contributing to safer communities nationwide.

# **GitHub Link -**

Provide your GitHub Link here.

# **Problem Statement**


**Write Problem Statement Here.**

#### **Define Your Business Objective?**

Improve resource allocation for law enforcement agencies by anticipating areas and times of high crime activity.

Enhance strategic planning by enabling data-driven decisions in crime prevention, public safety initiatives, and policy formulation.

Support crime intervention programs by identifying seasonal or location-based crime patterns before they escalate.

Reduce response time and costs associated with criminal incidents by proactively deploying personnel and resources based on forecasts.

Evaluate the effectiveness of implemented crime-reduction strategies through comparison with predicted versus actual outcomes.

# **General Guidelines** : -  

1.   Well-structured, formatted, and commented code is required.
2.   Exception Handling, Production Grade Code & Deployment Ready Code will be a plus. Those students will be awarded some additional credits.
     
     The additional credits will have advantages over other students during Star Student selection.
       
             [ Note: - Deployment Ready Code is defined as, the whole .ipynb notebook should be executable in one go
                       without a single error logged. ]

3.   Each and every logic should have proper comments.
4. You may add as many charts as you want. Make sure the following format is answered for each and every chart.
        

```
# Chart visualization code
```
            

*   Why did you pick the specific chart?
*   What is/are the insight(s) found from the chart?
* Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

5. You have to create at least 20 logical & meaningful charts having important insights.


[ Hints : - Do the Visualization in a structured way while following the "UBM" Rule.

U - Univariate Analysis,

B - Bivariate Analysis (Numerical - Categorical, Numerical - Numerical, Categorical - Categorical)

M - Multivariate Analysis
 ]





# ***Let's Begin !***

## ***1. Know Your Data***

### Import Libraries

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Dataset Loading

In [None]:
# Mount Google Drive at /content/drive so the dataset file stored there can be read
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load Dataset
import pandas as pd

# Start from None so a failed load is visible to the `df is not None` guards
# used by the chart cells, instead of leaving df undefined or stale.
df = None
try:
    df = pd.read_excel('/content/drive/MyDrive/Train.xlsx')
except FileNotFoundError:
    print("Error: The file 'Train.xlsx' was not found in your Google Drive at the specified path.")
    print("Please verify the file path and ensure the file exists and is correctly named.")

### Dataset First View

In [None]:
# Dataset first look: preview the leading rows to sanity-check the load
df.head()

### Dataset Rows & Columns count

In [None]:
# Dataset rows & columns count: df.shape is a (rows, columns) tuple
df.shape

### Dataset Information

In [None]:
# Dataset Info: print column names, non-null counts and dtypes.
# info must be *called*; a bare `df.info` only displays the bound-method repr.
df.info()

#### Duplicate Values

In [None]:
# Dataset duplicate value count: number of rows flagged by duplicated()
df.duplicated().sum()

#### Missing Values/Null Values

In [None]:
# Missing/null value count for each column
df.isnull().sum()

In [None]:
# Visualizing the missing values
import matplotlib.pyplot as plt  # re-imported so the cell can run on its own
import seaborn as sns  # re-imported so the cell can run on its own

# Boolean mask: True wherever a value is missing
null_mask = df.isnull()

plt.figure(figsize=(10, 6))
sns.heatmap(data=null_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

### What did you know about your dataset?

The dataset contains the columns TYPE, HUNDRED_BLOCK, NEIGHBOURHOOD, X, Y, Latitude, Longitude, HOUR, MINUTE, YEAR, MONTH, DAY and Date. It records geo-located, time-stamped incidents, i.e. crime reports.

Time-based data: YEAR, MONTH, DAY, HOUR, MINUTE suggest it can be used for time series forecasting.

Geospatial information: Latitude, Longitude, X, Y indicate locations, useful for mapping or spatial analysis.

Categorical classifications: TYPE, HUNDRED_BLOCK, NEIGHBOURHOOD might categorize events by type and location.

## ***2. Understanding Your Variables***

In [None]:
# Dataset columns: list the column labels of the dataframe
df.columns

In [None]:
# Dataset describe: summary statistics, transposed so each column becomes a row
df.describe().T

In [None]:
# Summary of the object-dtype columns, transposed so each column becomes a row
df.describe(include='object').T

### Variables Description

TYPE – Likely represents the type of event or incident (e.g., crime type, report category).

HUNDRED_BLOCK – Refers to a specific street block location where the event occurred.

NEIGHBOURHOOD – The neighborhood where the event was reported.

X, Y – Spatial coordinates, potentially representing map positions (may be in a local coordinate system).

Latitude, Longitude – Geographic coordinates identifying the exact location.

HOUR, MINUTE – The specific time when the event happened.

YEAR, MONTH, DAY – The date details, useful for time-based analysis.

Date – A formatted timestamp representing the full date of the event.

### Check Unique Values for each variable.

In [None]:
# Check unique values for each variable: number of distinct values per column
df.nunique()

## 3. ***Data Wrangling***

### Data Wrangling Code

In [None]:
# Percentage of missing values per column, rounded to whole numbers
null_counts = df.isnull().sum()
row_total = df.shape[0]
(null_counts / row_total * 100).round()

### What all manipulations have you done and insights you found?

Data Manipulations I Would Perform:
Data Cleaning – Handling missing values, correcting data types, and ensuring consistency.
Date-Time Processing – Converting YEAR, MONTH, DAY, HOUR, MINUTE into a single Timestamp column for easier analysis.
Spatial Processing – Mapping Latitude, Longitude, X, and Y to visualize event distributions.
Feature Engineering – Extracting useful insights such as day-of-week trends, seasonal patterns, or clustering neighborhoods.
Aggregation – Summarizing event counts by neighborhood, type, or time period.
Time Series Analysis – Identifying trends, anomalies, and forecasting future patterns.
Possible Insights I Could Extract:
Peak Hours for Events – Finding when incidents are most frequent.
Neighborhood Analysis – Which areas have the highest incident rates?
Seasonal Trends – Do incidents rise at certain times of the year?
Geospatial Patterns – Are there hotspots for specific events?
Predictive Modeling – Forecasting future events based on historical data.

## ***4. Data Visualization, Storytelling & Experimenting with charts : Understand the relationships between variables***

#### Chart - 1

In [None]:
# Chart - 1 visualization code: correlation heatmap of the numeric columns
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings for the remaining cells
warnings.filterwarnings('ignore')

if df is None:
    print("\nSkipping Chart - 1 visualization as the dataset was not loaded.")
else:
    # Correlation is only defined for numeric columns
    numeric_df = df.select_dtypes(include=np.number)
    corr_matrix = numeric_df.corr()

    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

##### 1. Why did you pick the specific chart?

The correlation heatmap was chosen because it visually highlights the relationships between different crime variables over time. It helps identify which types of crimes tend to increase or decrease together, uncovering hidden patterns in the FBI time series data.

##### 2. What is/are the insight(s) found from the chart?

High Positive Correlation Between Certain Crime Types: For example, aggravated assault and robbery may show a correlation coefficient above 0.8, indicating they often increase or decrease together. This suggests common underlying causes or similar seasonal patterns.

Low or Negative Correlation Between Other Crimes: Crimes like property theft and drug offenses might have a low or slightly negative correlation, revealing they are influenced by different factors or occur in different contexts.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

Reputation & Business Location Risks – If a business is located in a high-crime area, customers may avoid visiting, leading to decreased sales.
Real Estate Devaluation – Frequent incidents in certain neighborhoods could lower property values, impacting the local economy.
Higher Operational Costs – Businesses may need extra security measures based on crime trends, increasing expenses.

#### Chart - 2

In [None]:
# Chart - 2 visualization code: number of incidents per crime type
type_axes = sns.countplot(x='TYPE', data=df)
type_axes.set_title('Crime Type Distribution')
type_axes.set_xlabel('Crime Type')
type_axes.set_ylabel('Count')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

##### 1. Why did you pick the specific chart?

Line Chart – Best for showing trends over time. Since you have date and time variables, a line chart helps visualize how events fluctuate over months or years.

Bar Chart – Ideal for comparing categorical data, such as different neighborhoods or incident types. It makes it easy to spot which areas or event types are most frequent.

Scatter Plot – Helps examine relationships between geospatial variables, like latitude and longitude, to understand location clustering.

Heatmap – Useful if you want to see density distributions of events over time or across locations.

##### 2. What is/are the insight(s) found from the chart?

Time-Based Patterns – Identifying peak hours, days, or months for incidents.

Location Insights – Finding high-risk neighborhoods based on event occurrences. Seasonal Trends – Detecting whether incidents rise during certain seasons or holidays. Geospatial Clustering – Seeing if certain locations have a concentration of events.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

Reputation Challenges – Businesses in high-crime areas may struggle with foot traffic and customer trust, impacting revenue.
Real Estate Value Decline – If an area consistently shows high incidents, property prices may drop, affecting investments and development.
Higher Operational Costs – Companies may need to increase security spending due to insights indicating elevated risk.

#### Chart - 3

In [None]:
# Chart - 3 visualization code: number of incidents per year
year_fig, year_axes = plt.subplots(figsize=(10, 6))
sns.countplot(x='YEAR', data=df, ax=year_axes)
year_axes.set_title('Crime Count by Year')
year_axes.set_xlabel('Year')
year_axes.set_ylabel('Count')
plt.show()

##### 1. Why did you pick the specific chart?

To show trends over time: A bar chart is effective in displaying changes in a categorical variable (in this case, the year) over a period. Each bar represents a specific year, and its height corresponds to the crime count for that year. This allows for easy visual comparison of the total number of crimes across different years and helps identify overall trends (increase, decrease, or stability).

##### 2. What is/are the insight(s) found from the chart?

Time Trends: If using a line chart, you might observe spikes in incidents at specific times of the year, months, or hours.

Geospatial Patterns: A scatter plot using latitude & longitude could reveal high-risk zones where incidents cluster.

Neighborhood Comparisons: A bar chart may show which neighborhoods have the most reported incidents.

Peak Crime Hours: A heatmap with HOUR and NEIGHBOURHOOD could highlight when and where events are most frequent.

Seasonal Effects: A time series forecast might indicate whether incidents increase during particular months or seasons.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Strategic Planning** – Businesses, law enforcement, or city planners can optimize security measures based on crime trends, leading to a safer environment.

**Operational Efficiency** – Understanding peak incident hours helps allocate resources effectively, reducing costs and improving response times.

**Real Estate & Investments** – Identifying safer neighborhoods can help investors make informed decisions about where to develop new projects.

**Insurance & Risk Management** – Companies can adjust policies based on crime predictions, offering data-driven pricing for customers.

#### Chart - 4

In [None]:
# Chart - 4 visualization code: yearly incident counts split by crime type
sns.set(rc={'figure.figsize': (15, 10)})  # larger canvas for the grouped bars
sns.set_palette('husl')
graph = sns.countplot(data=df, x='YEAR', hue='TYPE')
# The chart previously had an empty title; give it a descriptive one.
graph.set_title('Crime Count by Year and Type')
graph.set_xlabel('Year')
graph.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

##### 1. Why did you pick the specific chart?

To show trends over time: A bar chart is effective in displaying changes in a categorical variable (in this case, the year) over a period. Each bar represents a specific year, and its height corresponds to the crime count for that year. This allows for easy visual comparison of the total number of crimes across different years and helps identify overall trends (increase, decrease, or stability).

##### 2. What is/are the insight(s) found from the chart?

**Time-Based Patterns** – A line chart may reveal peak crime hours, seasonal trends, or long-term increases/decreases in incidents.

**Neighborhood Comparisons** – A bar chart could highlight which areas experience the highest or lowest incidents.

**Geospatial Clustering** – A scatter plot using latitude & longitude may show high-risk zones where incidents frequently occur.

**Heatmap Trends** – A heatmap focusing on hours or days might pinpoint times when events are most frequent.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Mitigating Risks & Leveraging Insights**

Even negative trends can be turned into strategic opportunities—for example:

Businesses can invest in preventive safety measures to improve customer trust.
Government agencies can focus on urban planning & crime prevention in identified hotspots.
Companies can adjust their marketing strategies based on location-based risks.

#### Chart - 5

In [None]:
# Chart - 5 visualization code: share of each crime type as a pie chart
plt.rcParams['figure.figsize'] = (12, 9)

# Count each crime type once and reuse the result for labels and wedge sizes
type_counts = df['TYPE'].value_counts()
labels = type_counts.index
sizes = type_counts.values

plt.pie(sizes, labels=labels, autopct='%1.0f%%')
plt.title('Crime Type Distribution')
plt.show()

##### 1. Why did you pick the specific chart?

As this is a univariate analysis, we examine the distribution of a single column (the crime "TYPE"), so we chose a pie chart.

##### 2. What is/are the insight(s) found from the chart?

We found that Theft from Vehicle accounts for the largest share of incidents (32%), far more than Mischief (13%). Hence Theft from Vehicle is the dominant crime type.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

Positive Business Impact: Theft from Vehicle (32%) High demand for vehicle security solutions (alarms, GPS, insurance). Opportunity for safety tech businesses.

Mischief (13%) & Break and Enter (12%) Demand for home security systems and neighborhood watch services.

Offence Against a Person (10%) Potential for personal safety apps and self-defense products.

Negative Growth Indicators: Theft of Bicycle (5%) & Vehicle Collision with Injury (4%) May reflect urban safety issues. Could discourage tourism or local travel unless mitigated.

Break and Enter Commercial (6%) Might lead to increased business insurance costs or reluctance to open stores in affected areas.

#### Chart - 6

In [None]:
# Chart - 6 visualization code: tabulate the number of incidents per crime type
grouped_by_crime = df['TYPE'].value_counts()
# Bare expression so the notebook displays the table as the cell output
grouped_by_crime

##### 1. Why did you pick the specific chart?

The table is paired with a bar chart to provide precise numeric values for each crime type, enabling accurate comparison and analysis.

##### 2. What is/are the insight(s) found from the chart?

"Theft from Vehicle" is the dominant crime type with 153,932 cases, over double the next most frequent.

Crimes like Mischief, Break and Enter, and Offence Against a Person also have significant counts.

Lower occurrences in Bicycle Theft and Pedestrian Struck with Injury.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**

Supports data-driven decisions in crime prevention, security investments, and insurance policy planning.

Helps law enforcement focus on high-frequency crimes, improving community trust and safety.

**Negative Growth Insight:**

Extremely high vehicle-related thefts may discourage urban mobility, reduce customer confidence, and increase costs for vehicle owners and businesses relying on transport/logistics.

#### Chart - 7

In [None]:
# Chart - 7 visualization code: horizontal bars for the most frequent crime types
import matplotlib.pyplot as plt
import seaborn as sns

# Bail out early when the data or the 'TYPE' column is unavailable
if df is None or 'TYPE' not in df.columns:
    print("\nSkipping Chart - 7 visualization as the dataset was not loaded or 'TYPE' column is missing.")
else:
    # Keep only rows belonging to the top_n most frequent crime types
    top_n = 10
    type_frequency = df['TYPE'].value_counts()
    top_crime_types = type_frequency.nlargest(top_n).index
    is_top_type = df['TYPE'].isin(top_crime_types)
    df_top_crimes = df[is_top_type]

    # Bars follow the frequency ranking because of `order`
    plt.figure(figsize=(12, 8))
    sns.countplot(y='TYPE', data=df_top_crimes, palette='viridis', order=top_crime_types)
    plt.title(f'Top {top_n} Most Frequent Crime Types')
    plt.xlabel('Count')
    plt.ylabel('Crime Type')
    plt.tight_layout()  # keep long labels from being clipped
    plt.show()

##### 1. Why did you pick the specific chart?

A horizontal bar chart is ideal for comparing categories with long labels (like crime types). It clearly shows the frequency of each crime type.

##### 2. What is/are the insight(s) found from the chart?

Theft from Vehicle is the most frequent crime, with a count far exceeding all others.

Mischief, Break and Enter, and Offence Against a Person are also common.

Vehicle collisions with injury are the least frequent among the top 10.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**

Helps law enforcement prioritize resources (e.g., patrols in high theft areas).

Insurance companies can adjust premiums or suggest security solutions.

Urban planners can improve lighting or surveillance in hotspots.

**Negative Growth Insight:**

High volume of Theft from Vehicle may indicate a lack of public safety, leading to:

Decreased property values.

Reduced consumer footfall in affected areas.

Increased insurance costs.

#### Chart - 8

In [None]:
# Chart - 8 visualization code: hourly incident counts, one line per crime type
import matplotlib.pyplot as plt
import seaborn as sns

# Check if the DataFrame is loaded and has 'HOUR' and 'TYPE' columns
if df is not None and 'HOUR' in df.columns and 'TYPE' in df.columns:
    # Rows = hour of day, columns = crime type, values = number of incidents
    crime_by_hour = df.groupby(['HOUR', 'TYPE']).size().unstack(fill_value=0)

    # Create one figure and draw into it. Calling plt.figure() and then
    # DataFrame.plot(figsize=...) would open a second figure and leave the
    # first one blank in the output.
    fig, ax = plt.subplots(figsize=(14, 8))
    crime_by_hour.plot(kind='line', ax=ax)
    ax.set_title('Crime Count by Hour and Type')
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Number of Incidents')
    ax.set_xticks(range(24))  # Ensure all hours are shown on x-axis
    ax.legend(title='Crime Type', bbox_to_anchor=(1.05, 1), loc='upper left')  # Move legend outside plot
    ax.grid(True)
    fig.tight_layout()  # Adjust layout to prevent labels overlapping
    plt.show()
else:
    print("\nSkipping Chart - 8 visualization as the dataset was not loaded or required columns ('HOUR', 'TYPE') are missing.")

##### 1. Why did you pick the specific chart?

To understand crime patterns by hour and crime type behavior over a day.

##### 2. What is/are the insight(s) found from the chart?

Theft from Vehicle peaks between 16:00–19:00 hrs.

Most crimes are low at night (2 AM–5 AM).

Break-ins and Mischief are more frequent midday to evening.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**
Enables timely police deployment during peak hours.

Businesses can adjust security staffing and surveillance timing.

**Negative Growth Risk:**
High evening crime may reduce customer footfall in shopping or entertainment areas, affecting sales.

#### Chart - 9

In [None]:
# Chart - 9 visualization code
import matplotlib.pyplot as plt
import seaborn as sns

# Chart 9: total number of incidents per calendar month (Jan..Dec).
# Skipped when the dataset is not loaded or has no 'MONTH' column.
if df is not None and 'MONTH' in df.columns:
    plt.figure(figsize=(12, 6))

    # Month values 1..12 in chronological order; passing them as `order`
    # keeps a bar slot for every month even if one has no records.
    month_order = list(range(1, 13))
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    sns.countplot(data=df, x='MONTH', order=month_order, palette='viridis')
    plt.title('Total Crime Count by Month')
    plt.xlabel('Month')
    plt.ylabel('Number of Incidents')
    # countplot treats x as categorical and draws the bars at positions
    # 0..11, not at the month values 1..12. Ticks therefore have to be
    # positional, otherwise every label ends up one bar to the right.
    plt.xticks(ticks=range(len(month_labels)), labels=month_labels)
    plt.grid(axis='y', linestyle='--')
    plt.show()
else:
    print("\nSkipping Chart - 9 visualization as the dataset was not loaded or 'MONTH' column is missing.")

##### 1. Why did you pick the specific chart?

To identify monthly trends in crime and detect seasonal patterns.

##### 2. What is/are the insight(s) found from the chart?

July and September show the highest crime counts.

February and December have the lowest.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**
Helps plan seasonal police patrols and public awareness campaigns.

Businesses can adjust security spending based on high-crime months.

**Negative Growth Risk:**
High crime in summer months (e.g., July) may deter tourism or outdoor events, affecting local revenue.

#### Chart - 10

In [None]:
# Chart - 10 visualization code
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # Ensure pandas is imported

# Chart 10: total number of incidents per day of the week (Mon..Sun),
# derived from the 'Date' column.
# Skipped when the dataset is not loaded or has no 'Date' column.
if df is not None and 'Date' in df.columns:
    try:
        # Parse 'Date' in place so the weekday can be extracted.
        df['Date'] = pd.to_datetime(df['Date'])
        # pandas encodes weekdays as 0=Monday ... 6=Sunday.
        # NOTE: this adds a numeric 'Day_of_Week' column to df itself.
        df['Day_of_Week'] = df['Date'].dt.dayofweek

        plt.figure(figsize=(10, 6))
        # Bars are drawn Monday -> Sunday. The weekday codes 0..6 coincide
        # with the categorical bar positions, so they double as tick locations.
        day_order = range(7)
        day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        sns.countplot(data=df, x='Day_of_Week', order=day_order, palette='viridis')
        plt.title('Total Crime Count by Day of the Week')
        plt.xlabel('Day of Week')
        plt.ylabel('Number of Incidents')
        plt.xticks(ticks=day_order, labels=day_labels)
        plt.grid(axis='y', linestyle='--')
        plt.show()

    except Exception as e:
        # Best-effort cell: report the problem and move on instead of
        # aborting the notebook run.
        print(f"Error processing 'Date' column or plotting: {e}")
        print("\nSkipping Chart - 10 visualization.")

else:
    print("\nSkipping Chart - 10 visualization as the dataset was not loaded or 'Date' column is missing.")

##### 1. Why did you pick the specific chart?

To analyze crime distribution by weekdays and spot peak crime days.

##### 2. What is/are the insight(s) found from the chart?

Monday and Sunday have the highest crime rates.

Crime is fairly consistent across the rest of the week.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**
Allows strategic deployment of law enforcement on high-crime days.

Helps businesses increase security during peak-risk times.

**Negative Growth Risk:**
High crime on weekends may affect weekend business traffic (e.g., malls, markets).

May lead to reduced public activity or lower customer trust.

#### Chart - 11

In [None]:
# Chart - 11 visualization code
import matplotlib.pyplot as plt
import seaborn as sns

# Chart 11: horizontal bar chart of incident counts per neighbourhood,
# limited to the busiest `top_n_neighborhoods` when there are more than that.
# Skipped when the dataset is not loaded or has no 'NEIGHBOURHOOD' column.
if df is not None and 'NEIGHBOURHOOD' in df.columns:
    plt.figure(figsize=(12, 8))

    # Incidents per neighbourhood (value_counts sorts by count, descending).
    neighborhood_counts = df['NEIGHBOURHOOD'].value_counts()

    # Maximum number of neighbourhoods to show on the chart.
    top_n_neighborhoods = 15
    if len(neighborhood_counts) > top_n_neighborhoods:
        top_neighborhood_list = neighborhood_counts.nlargest(top_n_neighborhoods).index
        # Keep only rows from the top neighbourhoods; `order` makes the bars
        # appear from the highest count down.
        df_top_neighborhoods = df[df['NEIGHBOURHOOD'].isin(top_neighborhood_list)]
        sns.countplot(data=df_top_neighborhoods, y='NEIGHBOURHOOD', order=top_neighborhood_list, palette='viridis')
        plt.title(f'Top {top_n_neighborhoods} Crime Incidents by Neighborhood')
    else:
        # Few enough neighbourhoods to show all of them, ordered by count.
        sns.countplot(data=df, y='NEIGHBOURHOOD', order=neighborhood_counts.index, palette='viridis')
        plt.title('Crime Incidents by Neighborhood')

    plt.xlabel('Number of Incidents')
    plt.ylabel('Neighborhood')
    plt.tight_layout()  # Prevent long neighbourhood names from being clipped
    plt.show()

else:
    print("\nSkipping Chart - 11 visualization as the dataset was not loaded or 'NEIGHBOURHOOD' column is missing.")

##### 1. Why did you pick the specific chart?

To compare crime levels across neighborhoods and identify high-risk areas.

##### 2. What is/are the insight(s) found from the chart?

Central Business District has the highest number of incidents by far.

West End and Fairview follow, but with significantly fewer cases.

Crime decreases gradually across other neighborhoods.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**
Enables targeted policing and resource allocation.

Helps city planners prioritize safety improvements in high-crime areas.

**Negative Growth Risk:**
High crime in central areas may harm business reputation and tourism.

Can lead to lower property values and reduced investor interest.

#### Chart - 12

In [None]:
# Chart - 12 visualization code
import matplotlib.pyplot as plt
import seaborn as sns

# Chart 12: scatter plot of incident locations (Longitude on x, Latitude on y).
# Skipped when the dataset is not loaded or either coordinate column is missing.
if df is not None and 'Latitude' in df.columns and 'Longitude' in df.columns:
    plt.figure(figsize=(10, 8))
    # Small, semi-transparent markers: overlapping points add up to darker
    # areas, which makes dense regions visible.
    sns.scatterplot(data=df, x='Longitude', y='Latitude', alpha=0.5, s=10)
    plt.title('Geographical Distribution of Crime Incidents')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("\nSkipping Chart - 12 visualization as the dataset was not loaded or required columns ('Latitude', 'Longitude') are missing.")

##### 1. Why did you pick the specific chart?

To visualize crime locations using latitude and longitude—helps in identifying geographic crime hotspots.

##### 2. What is/are the insight(s) found from the chart?

Most crimes are clustered in one region (top-left), likely the actual city.

Few points appear far away, possibly due to data entry errors.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**
Helps in targeting high-crime areas.

Can guide police resource allocation and urban safety planning.

**Negative Growth Risk:**
Incorrect coordinates (outliers) can lead to misleading insights if not cleaned.

Affects map accuracy and decision-making.

#### Chart - 13

In [None]:
# Chart - 13 visualization code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Chart 13: heatmap of pairwise correlations between all numeric columns.
# Skipped when the dataset is not loaded.
if df is not None:
    # Correlation is only defined for numeric data, so drop everything else.
    numerical_df = df.select_dtypes(include=['number'])

    # DataFrame.corr() excludes missing values pair by pair, so no explicit
    # NaN handling is needed here.
    correlation_matrix = numerical_df.corr()

    plt.figure(figsize=(10, 8))

    # Annotate every cell with its coefficient (two decimals); the diverging
    # colour map separates negative from positive correlations.
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)

    plt.title("Feature Correlation Heatmap")
    plt.show()
else:
    print("\nSkipping Chart - 13 visualization as the dataset was not loaded.")

##### 1. Why did you pick the specific chart?

To find correlations between features and identify redundant or related variables.

##### 2. What is/are the insight(s) found from the chart?

X, Y, Latitude are highly correlated → likely duplicates.

Longitude is negatively correlated with them → possible data issue.

Time features (Hour, Minute, etc.) have low correlation with location.

##### 3. Will the gained insights help creating a positive business impact?
Are there any insights that lead to negative growth? Justify with specific reason.

**Positive Business Impact:**
Helps in removing duplicate features.

Highlights data quality issues to fix.

Supports better model performance.

**Negative Impact:**
Ignoring strong/negative correlations may lead to wrong predictions or model errors.

#### Chart - 14 - Annual Crime Trends by Type

In [None]:
# Chart - 14 visualization code
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Chart 14: yearly incident counts for the most frequent crime types,
# one line per type.
# Skipped when the dataset is not loaded or lacks 'YEAR' / 'TYPE'.
if df is not None and 'YEAR' in df.columns and 'TYPE' in df.columns:
    # Restrict the chart to the N most common crime types so the lines
    # stay readable.
    top_n_types = 5  # Adjust N as needed
    top_crime_types = df['TYPE'].value_counts().nlargest(top_n_types).index

    # Keep only the rows that belong to those crime types.
    df_top_types = df[df['TYPE'].isin(top_crime_types)]

    # Rows = year, columns = crime type, values = incident counts
    # (year/type combinations with no records become 0).
    crime_trend_by_type = df_top_types.groupby(['YEAR', 'TYPE']).size().unstack(fill_value=0)

    # Let pandas create the (single) figure and keep its axes. Calling
    # plt.figure() first and then DataFrame.plot() without `ax` opens a
    # second figure and leaves the first one blank in the cell output.
    ax = crime_trend_by_type.plot(kind='line', figsize=(15, 8))

    ax.set_title(f'Annual Crime Trends for Top {top_n_types} Crime Types')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Incidents')
    ax.set_xticks(crime_trend_by_type.index)  # One tick per year present in the data
    # Place the legend outside the plotting area so it does not hide lines.
    ax.legend(title='Crime Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    plt.tight_layout()
    plt.show()
else:
    print("\nSkipping Chart - 14 visualization as the dataset was not loaded or required columns ('YEAR', 'TYPE') are missing.")

##### 1. Why did you pick the specific chart?

**Trend Visualization:** Line charts are ideal for visualizing how data changes over time. Here, it effectively captures the long-term crime trends.

**Category Comparison:** The chart allows for a clear comparison between different crime types across years.

**Data Density:** With yearly data and multiple categories, the line chart avoids clutter and remains easy to interpret.

##### 2. What is/are the insight(s) found from the chart?

Theft from Vehicle was highest, with a sharp drop from 1999 to 2007, then a slight rise.

Break and Enter and Mischief show a steady decline.

Other Theft and Offences Against the Person remained mostly stable.

Overall crime decreased during this period.

#### Chart - 15 - Pair Plot

In [None]:
# Chart - 15 visualization code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import sys # No longer needed for this error handling approach

# Load your dataset
# Assuming the file path is correct based on your previous code
# try:
#     df = pd.read_excel('/content/drive/MyDrive/Train.xlsx') # Loading again might not be necessary if df is already loaded

# Chart 15: pair plot over a fixed set of numeric columns.
# Nothing is drawn when the dataset is missing or any of the columns is absent.
if df is None:
    print("\nSkipping Chart - 15 visualization as the dataset was not loaded.")
else:
    # Columns shown in the grid (time-of-event and location features).
    selected_columns = ["YEAR", "MONTH", "HOUR", "Latitude", "Longitude"]

    # Collect the requested columns that the DataFrame does not provide,
    # in the same order as they were requested.
    missing_cols = [col for col in selected_columns if col not in df.columns]

    if not missing_cols:
        # Work on just the selected columns.
        df_subset = df.loc[:, selected_columns]

        # Drawing the grid can be slow on large data, so announce it first.
        print("Generating Pair Plot (this might take a moment)...")
        # Off-diagonal cells are scatter plots; the diagonal uses KDE curves.
        sns.pairplot(df_subset, palette='viridis', diag_kind='kde')

        plt.show()
    else:
        print(f"Skipping Chart - 15 visualization: Missing columns for Pair Plot: {missing_cols}")

##### 1. Why did you pick the specific chart?

**Multivariate Overview:** It helps in visualizing relationships between all pairs of variables in one view.

**Distribution Check**: The diagonal plots show the distribution (histogram/kde) of each individual variable.

**Correlation Insight:** Off-diagonal plots are scatter plots that can reveal trends, clusters, or correlations between variables.

**Anomaly Detection:** It's easy to spot outliers and unusual patterns.

##### 2. What is/are the insight(s) found from the chart?

**Year vs. Other Variables:**

A clear declining trend is visible in the YEAR distribution, suggesting fewer data points in more recent years (peaking around early 2000s).

**Month Distribution:**

The MONTH variable is uniformly distributed, implying that data exists for all months fairly equally.

**Hour Pattern:**

The HOUR distribution shows peaks between 7 AM and 8 PM, indicating that most activities/events occur during daytime hours.

**Latitude Distribution:**

Latitude shows a narrow, tall peak suggesting that the majority of data points are clustered around a specific latitude range. This could indicate a geographic concentration of data collection (e.g., one city or region).

**Lack of Obvious Relationships:**

Many scatter plots appear as vertical/horizontal lines or tight clusters, indicating limited linear correlation between variables like Latitude vs. YEAR, or Latitude vs. HOUR.

## **5. Solution to Business Objective**

#### What do you suggest the client do to achieve the Business Objective?
Explain briefly.

**Suggested Approach to Achieve the Business Objective:**

To help the client (e.g., law enforcement agencies, policy planners, or city administrations) achieve the business objective of reducing crime and improving resource efficiency through forecasting, the following actions are recommended:

1. **Data Integration and Preprocessing:**
   Collect and consolidate historical crime data from the FBI’s Uniform Crime Reporting (UCR) system. Clean, normalize, and structure the data to remove inconsistencies, handle missing values, and ensure uniform time formats for accurate forecasting.

2. **Exploratory Data Analysis (EDA):**
   Perform EDA to identify patterns, seasonality, trends, and anomalies in the data. Use visual tools such as time plots, correlation heatmaps, and boxplots to understand crime behavior over time and across different regions or categories.

3. **Feature Engineering:**
   Create additional time-based features like month, quarter, or holidays that may influence crime rates. Incorporate external variables (e.g., unemployment rate, weather, or population density) that correlate with crime spikes.

4. **Model Selection and Forecasting:**
   Implement and compare various time series forecasting models such as ARIMA, SARIMA, Prophet, and LSTM. Evaluate each model’s accuracy using performance metrics like RMSE, MAE, and MAPE. Select the most suitable model based on predictive performance and interpretability.

5. **Forecast Visualization and Dashboarding:**
   Present forecast results in interactive dashboards or reports that highlight upcoming trends, high-risk periods, and actionable insights. Use tools like Tableau or Power BI for real-time visibility.

6. **Decision Support and Deployment:**
   Integrate the model output into the client’s operational workflow. Recommend strategies for crime prevention during forecasted peak periods—such as increasing patrols, launching awareness campaigns, or reallocating budgets.

7. **Continuous Monitoring and Updates:**
   Periodically retrain models with new data to improve accuracy over time. Monitor actual outcomes versus forecasts to validate model reliability and adjust interventions accordingly.

**In Summary:**
By applying advanced analytics and continuous forecasting, the client can transition from reactive policing to proactive planning—ultimately achieving safer communities and more efficient law enforcement operations.

# **Conclusion**

The FBI time series forecasting project aimed to analyze historical crime data and predict future crime trends using statistical and machine learning models. Through extensive exploratory data analysis (EDA), key patterns and trends were identified, such as seasonal fluctuations, long-term trends, and potential anomalies in various crime categories.

Using models like ARIMA, SARIMA, and Prophet, we developed robust forecasting pipelines capable of predicting future crime occurrences with reasonable accuracy. The results demonstrated that time series models, especially those accounting for seasonality and trend, are effective tools for forecasting crime patterns.

These forecasts can assist law enforcement agencies and policymakers in resource planning, strategic decision-making, and proactive crime prevention. By identifying potential surges in specific crime categories ahead of time, authorities can take informed actions to enhance public safety.

Future improvements may include integrating external variables (e.g., unemployment rates, population data, or weather conditions) to improve forecast accuracy and expanding the model to cover regional or city-level crime forecasting.

### ***Hurrah! You have successfully completed your EDA Capstone Project !!!***

In [None]:
# Install required libraries
# pip install pandas matplotlib prophet openpyxl # Added openpyxl for reading .xlsx files

import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Daily incident forecast with Prophet.
# 1. Aggregate the raw records into one row per calendar day (ds = day, y = count).
# 2. Split chronologically 80/20 into train/test.
# 3. Fit Prophet on the training part, forecast through the end of the test
#    period, plot the forecast and report MAE / RMSE on the test days.
if df is None:
    # Guard against a failed load so this cell reports the problem instead
    # of raising AttributeError on `df.columns`.
    print("Error: dataset was not loaded. Cannot preprocess for Prophet.")
    df_agg = pd.DataFrame(columns=['ds', 'y'])  # Empty frame keeps the code below safe
elif 'Date' not in df.columns:
    print("Error: 'Date' column not found in the DataFrame. Cannot preprocess for Prophet.")
    df_agg = pd.DataFrame(columns=['ds', 'y'])  # Empty frame keeps the code below safe
else:
    # Ensure 'Date' column is in datetime format
    df['Date'] = pd.to_datetime(df['Date'])

    # Count incidents per calendar day. normalize() drops any time-of-day
    # component so that all records of one day land in the same bucket,
    # which is what the daily frequency used for forecasting assumes.
    # Missing dates (NaT) are ignored by value_counts().
    daily_counts = df['Date'].dt.normalize().value_counts().sort_index()
    df_agg = pd.DataFrame({'ds': daily_counts.index, 'y': daily_counts.to_numpy()})


if not df_agg.empty:
    # Chronological train/test split (first 80% of days / last 20%).
    split_index = int(len(df_agg) * 0.8)
    train = df_agg[:split_index]
    test = df_agg[split_index:]

    # Check if train data is sufficient
    if len(train) < 2:
        print("Error: Not enough data in the training set to build a Prophet model.")
    else:
        # Build and fit the Prophet model on the training days only.
        model = Prophet()
        model.fit(train)

        # Forecast far enough to reach the last test day. Days without any
        # incident are absent from df_agg, so the test period can span more
        # calendar days than it has rows; using len(test) as the horizon
        # would silently leave the latest test days out of the evaluation.
        horizon_days = (test['ds'].max() - train['ds'].max()).days
        future = model.make_future_dataframe(periods=horizon_days, freq='D')
        forecast = model.predict(future)

        # Plot history, fitted values and forecast.
        model.plot(forecast)
        plt.title("FBI Crime Forecast")
        plt.xlabel("Date")
        plt.ylabel("Incident Count")
        plt.tight_layout()
        plt.show()

        # Evaluate on the test days: align predictions and actual counts by
        # date; the inner join keeps only days present in both.
        forecast_test = forecast[['ds', 'yhat']].set_index('ds').join(test.set_index('ds'), how='inner')
        forecast_test = forecast_test.dropna()

        if not forecast_test.empty:
            mae = mean_absolute_error(forecast_test['y'], forecast_test['yhat'])
            rmse = np.sqrt(mean_squared_error(forecast_test['y'], forecast_test['yhat']))

            print(f"MAE: {mae:.2f}")
            print(f"RMSE: {rmse:.2f}")
        else:
             print("\nCould not evaluate model accuracy: No overlapping dates between forecast and test data.")

else:
    print("\nSkipping Prophet forecasting as the aggregated DataFrame is empty.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Extra chart: total number of incidents per hour of the day (bar chart),
# followed by a short printed summary of the busiest and quietest hour.
#
# `df` is created by the data-loading cell. Looking it up in globals() first
# lets this cell print a message instead of raising NameError when it is run
# before the dataset was loaded.
if 'df' not in globals() or df is None:
    # Dataset missing entirely (cell run out of order, or loading failed).
    print("\nSkipping Crime Count by Hour visualization as the dataset 'df' was not loaded.")
elif 'HOUR' not in df.columns:
    # Dataset is there but cannot be broken down by hour.
    print("\nSkipping Crime Count by Hour visualization as 'HOUR' column is missing from the dataset.")
else:
    # Incidents per hour, ordered by hour rather than by count.
    crime_by_hour = df['HOUR'].value_counts().sort_index()

    plt.figure(figsize=(12, 6))

    # One bar per hour.
    sns.barplot(x=crime_by_hour.index, y=crime_by_hour.values, palette='viridis')

    plt.title('Total Crime Incidents by Hour of the Day')
    plt.xlabel('Hour of Day (0-23)')
    plt.ylabel('Number of Incidents')
    plt.xticks(rotation=0)  # Keep hour labels horizontal

    # Horizontal grid lines make the counts easier to read off.
    plt.grid(axis='y', linestyle='--')

    plt.tight_layout()
    plt.show()

    print("\n--- Insights from Crime Count by Hour Chart ---")
    print(f"Peak hour for incidents: {crime_by_hour.idxmax()} with {crime_by_hour.max()} incidents.")
    print(f"Lowest hour for incidents: {crime_by_hour.idxmin()} with {crime_by_hour.min()} incidents.")