# Leveraging Machine Learning for Predicting and Identifying Determinants of Crime in the London Area

## Introduction and literature review


### Research question

This report uses supervised learning methods to investigate the number of crimes recorded in each London borough. It asks two questions: which factors affect a borough's crime count, and can that count be predicted?

## Presentation of Data and Data preprocessing 


### Import packages

This section provides an overview of the packages utilized in the analysis, along with a brief explanation of their roles.

In [None]:
import numpy as np

#data import and handle data
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.metrics import mean_squared_error
from scipy import stats


# preprocessors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

#Import plot package
import matplotlib.pyplot as plt
import seaborn as sns
#regression model
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
# pipeline
from sklearn.pipeline import Pipeline

# CART
from sklearn.tree import DecisionTreeRegressor

# random forest
from sklearn.ensemble import RandomForestRegressor

# feature importance
import rfpimp

# xgboost
import xgboost
from xgboost import XGBRegressor


import matplotlib.pyplot as plt
import statsmodels.api as sm
from math import sqrt
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from scipy.stats import sem
import statistics 
import seaborn as sns
from IPython.display import display, Math, Latex, display_latex
import plotly.express as px
import pylab

%matplotlib inline  

In [None]:
# Notebook-wide display defaults for pandas, matplotlib and seaborn.
pd.set_option('display.max_rows', 300)  # print up to 300 rows of a frame
# Floats are rendered in a 40-character field with thousands separators
# and 4 decimal places.
pd.set_option('display.float_format', '{:40,.4f}'.format)
pylab.rcParams['figure.figsize'] = (10., 8.)  # default figure size
sns.set(font_scale=1.5)
sns.set_style("dark")

### Data import

In [None]:
import pandas as pd
# Load the base crime table from a comma-separated file in the working directory.
df = pd.read_csv('2003-2023MonthlyCrimeNum(Base).csv', sep=',')

In [None]:
# Preview the first 10 rows of the base table.
df.head(10)


### Check the data

In [None]:
# Print a summary of the base table's columns.
df.info()

In [None]:
# True if at least one cell of the base table is missing.
df.isnull().values.any()

In [None]:
df.fillna(0, inplace=True)  # replace every NaN value with 0, modifying df in place

# Check whether any missing values remain
print(df.isnull().values.any())  # should print False

### Variables Visualization

In [None]:
# Collapse the base table to one row per BoroughName by summing the other columns.
pivot_table = df.groupby('BoroughName').sum()

In [None]:
# Write the per-borough sums to a CSV file in the working directory.
pivot_table.to_csv('2023MonthlyCrime.csv')

In [None]:
# Load the per-borough table that the following cells analyse.
# NOTE(review): this file name differs from the '2023MonthlyCrime.csv' written
# by the previous cell - confirm how this file was produced.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

In [None]:
# True if at least one cell of the per-borough table is missing.
df_new.isnull().values.any()

In [None]:
# Preview the first 30 rows of the per-borough table.
df_new.head(30)

In [None]:
# Total crimes per borough over the whole table, plotted from the highest
# total to the lowest.

# Sum only the count columns. They are selected by name rather than by
# position so that the identifier columns never enter the sum and, because
# the total column itself is excluded, re-running this cell gives the same
# totals instead of adding the previous total on top.
non_count_columns = ['Borough_Name', 'Borough_code', 'Total_Crimes_03_23']
count_columns = [col for col in df_new.columns if col not in non_count_columns]
df_new['Total_Crimes_03_23'] = df_new[count_columns].sum(axis=1)

# Sort the boroughs based on the total crimes (largest first)
sorted_data = df_new.sort_values(by='Total_Crimes_03_23', ascending=False)

# Plotting
plt.figure(figsize=(12, 8))
plt.plot(sorted_data['Borough_Name'], sorted_data['Total_Crimes_03_23'], marker='o')
plt.xticks(rotation=90)
plt.xlabel('Borough')
plt.ylabel('Total Crimes (2003-2023)')
plt.title('Total Crimes in Each Borough from 2003 to 2023')
plt.tight_layout()
plt.show()

In [None]:
# Line plot of the borough totals with the 23rd-ranked borough marked in red
# and the totals of the 1st and 23rd boroughs written above their points.
names = sorted_data['Borough_Name']
totals = sorted_data['Total_Crimes_03_23']

plt.figure(figsize=(12, 10))
plt.plot(names, totals, marker='o', color='blue')

# Position 22 is the 23rd row of the sorted table.
plt.scatter(names.iloc[22], totals.iloc[22], color='red', s=50)

# Annotate the first and the 23rd data points with their values.
for position in (0, 22):
    point_name = names.iloc[position]
    point_total = totals.iloc[position]
    plt.annotate(f'{point_total}', (point_name, point_total),
                 textcoords="offset points", xytext=(0, 10), ha='center')

plt.xticks(rotation=45, ha='right', fontsize=10)

plt.xlabel('Borough')
plt.ylabel('Total Crimes (2003-2023)')
plt.title('Total Crimes in Each Borough from 2003 to 2023 with the 23rd Data Point Highlighted')
plt.tight_layout()
plt.show()

In [None]:
# Same chart as a thin black line: borough totals with the 23rd-ranked
# borough marked in red and the 1st and 23rd totals annotated in small type.
borough_names = sorted_data['Borough_Name']
borough_totals = sorted_data['Total_Crimes_03_23']
annotation_style = dict(textcoords="offset points", xytext=(0, 10), ha='center', fontsize=9)

plt.figure(figsize=(12, 8))
plt.plot(borough_names, borough_totals, marker='o', color='black', linewidth=1)

# Highlight the 23rd data point (position 22 of the sorted table).
plt.scatter(borough_names.iloc[22], borough_totals.iloc[22], color='red', s=50)

# Annotate the values of the first and the 23rd data points.
for rank_index in (0, 22):
    label_x = borough_names.iloc[rank_index]
    label_y = borough_totals.iloc[rank_index]
    plt.annotate(f'{label_y}', (label_x, label_y), **annotation_style)

plt.xticks(rotation=45, ha='right', fontsize=9)

plt.xlabel('Borough')
plt.ylabel('Total Crimes (2003-2023)')
plt.title('Total Crimes in Each Borough from 2003 to 2023 with the 23rd Data Point Highlighted')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# One scatter plot of the crime-count series per borough.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# Plots are only drawn when the table has no missing values.
has_missing = df_new.isnull().values.any()
if has_missing:
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        rows = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = (
            rows.melt(id_vars=["Borough_Name", "Borough_code"],
                      var_name="Year",
                      value_name="Crime_number")
            .assign(Year=lambda frame: frame["Year"].str[-4:])
            .set_index("Year")
        )

        # Scatter plot of the series in column order.
        plt.figure(figsize=(14, 10))
        plt.suptitle(borough, fontsize=16)
        plt.scatter(crime_counts.index, crime_counts["Crime_number"],
                    label='Original', marker='o', color='blue')
        plt.legend(loc='best')

        # Show one x tick label every 12 positions.
        plt.gca().xaxis.set_major_locator(MultipleLocator(12))

        plt.tight_layout()
        plt.show()


In [None]:

# import pandas as pd

# # Load the data
# df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# # Select the rows that contain missing values
# missing_values = df_new[df_new.isnull().any(axis=1)]

# # Print the rows that contain missing values
# print("Missing values:")
# print(missing_values)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# For every borough, split its crime-count series into observed, trend,
# seasonal and residual components and draw them as four stacked panels.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# The decomposition is only attempted on a table without missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        borough_data = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                         var_name="Year",
                                         value_name="Crime_number")
        crime_counts["Year"] = crime_counts["Year"].str[-4:]
        crime_counts.set_index("Year", inplace=True)

        # Additive decomposition with a 12-observation cycle (assumes one
        # column per calendar month, as the file name suggests). With
        # period=1 there is no cycle to estimate: the trend simply repeats
        # the observed series and the seasonal component is zero.
        result = seasonal_decompose(crime_counts["Crime_number"], model='additive', period=12)

        plt.figure(figsize=(14, 10))
        plt.suptitle(borough, fontsize=16)

        # One panel per component, top to bottom.
        panels = [
            (411, result.observed, 'Original'),
            (412, result.trend, 'Trend'),
            (413, result.seasonal, 'Seasonality'),
            (414, result.resid, 'Residuals'),
        ]
        for position, component, caption in panels:
            plt.subplot(position)
            plt.plot(component, label=caption)
            plt.legend(loc='best')

        # Show one x tick label every 12 positions. Each axis gets its own
        # locator: a locator keeps a reference to the axis it is attached to,
        # so one instance must not be shared between axes.
        for ax in plt.gcf().get_axes():
            ax.xaxis.set_major_locator(MultipleLocator(12))

        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# Four-panel decomposition (observed, trend, seasonal, residual) for every
# borough in the 10-23 table.
df_new = pd.read_csv("MonthlyCrimeNum10-23.csv")

# The decomposition is only attempted on a table without missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        borough_data = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                         var_name="Year",
                                         value_name="Crime_number")
        crime_counts["Year"] = crime_counts["Year"].str[-4:]
        crime_counts.set_index("Year", inplace=True)

        # Additive decomposition with a 12-observation cycle (assumes one
        # column per calendar month, as the file name suggests). With
        # period=1 there is no cycle to estimate: the trend simply repeats
        # the observed series and the seasonal component is zero.
        result = seasonal_decompose(crime_counts["Crime_number"], model='additive', period=12)

        plt.figure(figsize=(14, 10))
        plt.suptitle(borough, fontsize=16)

        # Original data
        plt.subplot(411)
        plt.plot(result.observed, label='Original')
        plt.legend(loc='best')

        # Trend
        plt.subplot(412)
        plt.plot(result.trend, label='Trend')
        plt.legend(loc='best')

        # Seasonality
        plt.subplot(413)
        plt.plot(result.seasonal, label='Seasonality')
        plt.legend(loc='best')

        # Residuals
        plt.subplot(414)
        plt.plot(result.resid, label='Residuals')
        plt.legend(loc='best')

        # Show one x tick label every 6 positions. Each axis gets its own
        # locator: a locator keeps a reference to the axis it is attached to,
        # so one instance must not be shared between axes.
        for ax in plt.gcf().get_axes():
            ax.xaxis.set_major_locator(MultipleLocator(6))

        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# Per-borough additive decomposition with a 12-observation cycle, drawn as
# four stacked panels: observed, trend, seasonal and residual.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# The decomposition is only attempted on a table without missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        borough_rows = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = (
            borough_rows.melt(id_vars=["Borough_Name", "Borough_code"],
                              var_name="Year",
                              value_name="Crime_number")
            .assign(Year=lambda frame: frame["Year"].str[-4:])
            .set_index("Year")
        )

        # Time-series decomposition (period=12: one cycle per 12 observations).
        result = seasonal_decompose(crime_counts["Crime_number"], model='additive', period=12)

        plt.figure(figsize=(14, 10))
        plt.suptitle(borough, fontsize=16)

        # One panel per component, top to bottom.
        components = [
            (result.observed, 'Original'),
            (result.trend, 'Trend'),
            (result.seasonal, 'Seasonality'),
            (result.resid, 'Residuals'),
        ]
        for row, (values, caption) in enumerate(components, start=1):
            plt.subplot(4, 1, row)
            plt.plot(values, label=caption)
            plt.legend(loc='best')

        # Show one x tick label every 12 positions. Each axis gets its own
        # locator: a locator keeps a reference to the axis it is attached to,
        # so one instance must not be shared between axes.
        for ax in plt.gcf().get_axes():
            ax.xaxis.set_major_locator(MultipleLocator(12))

        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# Per-borough additive decomposition drawn as two panels: observed + trend
# on top, seasonal + residual below.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# The decomposition is only attempted on a table without missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        borough_rows = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = (
            borough_rows.melt(id_vars=["Borough_Name", "Borough_code"],
                              var_name="Year",
                              value_name="Crime_number")
            .assign(Year=lambda frame: frame["Year"].str[-4:])
            .set_index("Year")
        )

        # Time-series decomposition (period=12: one cycle per 12 observations).
        result = seasonal_decompose(crime_counts["Crime_number"], model='additive', period=12)

        plt.figure(figsize=(14, 10))
        plt.suptitle(borough, fontsize=16)

        # Original data and trend share the top panel.
        plt.subplot(211)
        plt.plot(result.observed, label='Original', color='blue')
        plt.plot(result.trend, label='Trend', color='orange')
        plt.legend(loc='best')

        # Seasonality and residuals share the bottom panel.
        plt.subplot(212)
        plt.plot(result.seasonal, label='Seasonality', color='green')
        plt.plot(result.resid, label='Residuals', color='red')
        plt.legend(loc='best')

        # Show one x tick label every 12 positions. Each axis gets its own
        # locator: a locator keeps a reference to the axis it is attached to,
        # so one instance must not be shared between axes.
        for ax in plt.gcf().get_axes():
            ax.xaxis.set_major_locator(MultipleLocator(12))

        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# Two-panel decomposition (observed + trend, seasonal + residual) for every
# borough in the 10-23 table.
df_new = pd.read_csv("MonthlyCrimeNum10-23.csv")

# The decomposition is only attempted on a table without missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        borough_data = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                         var_name="Year",
                                         value_name="Crime_number")
        crime_counts["Year"] = crime_counts["Year"].str[-4:]
        crime_counts.set_index("Year", inplace=True)

        # Time-series decomposition (period=12: one cycle per 12 observations).
        result = seasonal_decompose(crime_counts["Crime_number"], model='additive', period=12)

        plt.figure(figsize=(14, 10))
        plt.suptitle(borough, fontsize=16)

        # (subplot position, [(component, legend label, colour), ...])
        layout = [
            (211, [(result.observed, 'Original', 'blue'),
                   (result.trend, 'Trend', 'orange')]),
            (212, [(result.seasonal, 'Seasonality', 'green'),
                   (result.resid, 'Residuals', 'red')]),
        ]
        for position, curves in layout:
            plt.subplot(position)
            for values, caption, colour in curves:
                plt.plot(values, label=caption, color=colour)
            plt.legend(loc='best')

        # Show one x tick label every 12 positions. Each axis gets its own
        # locator: a locator keeps a reference to the axis it is attached to,
        # so one instance must not be shared between axes.
        for ax in plt.gcf().get_axes():
            ax.xaxis.set_major_locator(MultipleLocator(12))

        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit a seasonal ARIMA model to each borough's series and plot the observed
# values together with a 36-step forecast and its confidence band.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# Models are only fitted when the table has no missing values.
table_has_gaps = df_new.isnull().values.any()
if table_has_gaps:
    print("Data contains missing values.")
else:
    for borough in df_new["Borough_Name"].unique():
        rows = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = (
            rows.melt(id_vars=["Borough_Name", "Borough_code"],
                      var_name="Year",
                      value_name="Crime_number")
            .assign(Year=lambda frame: frame["Year"].str[-4:])
            .set_index("Year")
        )

        # SARIMA(5,1,0)x(1,1,1,12)
        model = SARIMAX(crime_counts["Crime_number"], order=(5,1,0), seasonal_order=(1,1,1,12))

        # Fit with the pandas bottleneck/numexpr accelerators switched off.
        with pd.option_context('compute.use_bottleneck', False, 'compute.use_numexpr', False):
            model_fit = model.fit(disp=0)

        # Three years ahead at 12 steps per year.
        forecast_steps = 3 * 12
        forecast = model_fit.get_forecast(steps=forecast_steps, method='innovations')
        predicted = forecast.predicted_mean
        interval = forecast.conf_int()

        # A new figure per borough.
        plt.figure(figsize=(14, 10))

        plt.plot(crime_counts.index, crime_counts["Crime_number"], label='Original', color='blue')
        plt.plot(predicted.index, predicted, label='Forecast', color='green')
        plt.fill_between(predicted.index,
                         interval["lower Crime_number"],
                         interval["upper Crime_number"],
                         color='pink', alpha=0.3)
        plt.legend(loc='best')

        # Show one x tick label every 12 positions.
        plt.gca().xaxis.set_major_locator(MultipleLocator(12))

        plt.title(borough)  # borough name as the figure title
        plt.tight_layout()
        plt.show()  # display the figure at the end of every iteration

In [None]:
# Long-format copy of the table: one row per borough and count column, with
# the column label parsed as a year-month ('%Y%m') date.
df_melted = df_new.melt(
    id_vars=['Borough_Name', 'Borough_code'],
    var_name='Date',
    value_name='Crime_Count',
).assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m'))

# All selected boroughs share one chart.
plt.figure(figsize=(15, 8))

# NOTE(review): borough_list is not defined in this part of the notebook -
# confirm which cell creates it before running this one.
for borough in borough_list:
    selected = df_melted['Borough_Name'] == borough
    plt.plot(df_melted.loc[selected, 'Date'],
             df_melted.loc[selected, 'Crime_Count'],
             label=borough)

plt.title('Crime Trends from 2003 to 2023 for 22 London Boroughs')
plt.xlabel('Year')
plt.ylabel('Number of Crimes')
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
plt.grid(True, which="both", ls="--")
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# Overlay the decomposed trend of every borough in the 10-23 table on a
# single chart.
df_new = pd.read_csv("MonthlyCrimeNum10-23.csv")

# Nothing is plotted when the table has missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    plt.figure(figsize=(14, 10))
    for borough in df_new["Borough_Name"].unique():
        rows = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        series = (
            rows.melt(id_vars=["Borough_Name", "Borough_code"],
                      var_name="Year",
                      value_name="Crime_number")
            .assign(Year=lambda frame: frame["Year"].str[-4:])
            .set_index("Year")
        )

        # Only the trend component of the additive decomposition is drawn.
        decomposition = seasonal_decompose(series["Crime_number"], model='additive', period=12)
        plt.plot(decomposition.trend, label=borough)

    # One x tick every 6 positions, labels rotated for readability.
    plt.gca().xaxis.set_major_locator(MultipleLocator(6))
    plt.xticks(rotation=45)

    plt.title('Crime Trends from 2010 to 2023 for 22 London Boroughs')
    plt.xlabel('Year')
    plt.ylabel('Number of Crimes')
    plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
    plt.grid(True, which="both", ls="--")
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from statsmodels.tsa.seasonal import seasonal_decompose

# Overlay the decomposed trend of the 22 boroughs with the highest total
# crime counts in the 03-23 table on a single chart.
file_path = "MonthlyCrimeNum03-23.csv"
df_new = pd.read_csv(file_path)

# Nothing is plotted when the table has missing values.
if df_new.isnull().values.any():
    print("Data contains missing values.")
else:
    # Total per borough: sum the count columns per borough name, then across
    # columns. Borough_code is dropped first so it does not enter the sum.
    total_crimes = df_new.drop(["Borough_code"], axis=1).groupby("Borough_Name").sum().sum(axis=1)
    # Names of the 22 boroughs with the largest totals, largest first.
    top_22_boroughs = total_crimes.nlargest(22).index.tolist()

    plt.figure(figsize=(14, 10))
    for borough in top_22_boroughs:
        borough_data = df_new[df_new["Borough_Name"] == borough]

        # Wide -> long: one row per count column, indexed by a label made of
        # the last four characters of the column name.
        # NOTE(review): the label is called "Year"; whether those characters
        # are a year depends on the column-label format - confirm.
        crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                         var_name="Year",
                                         value_name="Crime_number")
        crime_counts["Year"] = crime_counts["Year"].str[-4:]
        crime_counts.set_index("Year", inplace=True)

        # Only the trend component of the additive decomposition is drawn.
        result = seasonal_decompose(crime_counts["Crime_number"], model='additive', period=12)
        plt.plot(result.trend, label=borough)

    # One x tick every 12 positions, labels rotated for readability.
    x_major_locator = MultipleLocator(12)
    plt.gca().xaxis.set_major_locator(x_major_locator)
    plt.xticks(rotation=45)

    # The title names the 2003-2023 span of the file loaded above (it
    # previously said 2010, copied from the 10-23 chart).
    plt.title('Crime Trends from 2003 to 2023 for Top 22 London Boroughs by Total Crimes')
    plt.xlabel('Year')
    plt.ylabel('Number of Crimes')
    plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
    plt.grid(True, which="both", ls="--")
    plt.tight_layout()
    plt.show()


In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

# Plain outline map of the borough polygons in the shapefile.
gdf_boroughs = gpd.read_file('statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp')

# Base map with black polygon edges, a title and no axis frame.
title_font = {'fontsize': '15', 'fontweight' : '3'}
fig, ax = plt.subplots(figsize=(15, 15))
gdf_boroughs.plot(ax=ax, edgecolor='black')
ax.set_title('London Boroughs', fontdict=title_font)
ax.set_axis_off()
plt.show()


In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

# Map of the boroughs coloured by a hand-assigned overall trend and labelled
# with a hand-assigned peak period.

# Step 1: Load the borough polygons and the crime table
gdf_boroughs = gpd.read_file('statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp')
df_borough_data = pd.read_csv('MonthlyCrimeNum03-23.csv')

# Step 2: Total crime count per borough over the whole table.
# The identifier columns are dropped by name so that only count columns are
# summed; a positional slice starting at column 1 would also pull the
# borough code into the row sum.
df_borough_data['Total_Crime_03_23'] = (
    df_borough_data.drop(columns=['Borough_Name', 'Borough_code']).sum(axis=1)
)

# Step 3: Overall trend and peak period per borough, keyed by borough name
trend_peak = {
    'City of Westminster': ('Decreasing', '2019-08'),
    'Newham': ('Increasing then Decreasing', 'Mid 2019'),
    'Croydon': ('Increasing', 'End 2019'),
    'Southwark': ('Decreasing', '2016'),
    'Tower Hamlets': ('Increasing then Stabilizing', '2017'),
    'Hackney': ('Increasing then Decreasing', '2018'),
    'Lambeth': ('Increasing then Decreasing', 'Mid 2018'),
    'Camden': ('Increasing', '2019'),
    'Ealing': ('Increasing then Stabilizing', 'End 2017'),
    'Brent': ('Increasing then Decreasing', 'Mid 2018'),
    'Enfield': ('Increasing', '2019'),
    'Haringey': ('Increasing then Stabilizing', '2017'),
    'Lewisham': ('Fluctuating', '2018'),
    'Barnet': ('Increasing', 'End 2019'),
    'Hillingdon': ('Increasing then Decreasing', 'Mid 2018'),
    'Islington': ('Increasing', 'Mid 2018'),
    'Greenwich': ('Increasing', 'Start 2019'),
    'Wandsworth': ('Increasing then Decreasing', '2018'),
    'Hounslow': ('Increasing', 'End 2019'),
    'Waltham Forest': ('Fluctuating', 'Mid 2017'),
    'Redbridge': ('Increasing', 'End 2018'),
    'Bromley': ('Increasing', '2019')
}

# Boroughs missing from the dictionary fall back to ('Unknown', 'N/A').
# NOTE(review): lookups are by exact name - confirm the spellings here match
# the Borough_Name values in the CSV and the NAME values in the shapefile.
fallback = ('Unknown', 'N/A')
df_borough_data['Overall Trend'] = df_borough_data['Borough_Name'].map(lambda x: trend_peak.get(x, fallback)[0])
df_borough_data['Peak Period'] = df_borough_data['Borough_Name'].map(lambda x: trend_peak.get(x, fallback)[1])

# Step 4: Attach the crime table to the polygons by borough name
merged_boroughs = gdf_boroughs.set_index('NAME').join(df_borough_data.set_index('Borough_Name'))

# Polygons with no matching CSV row come out of the join with NaN here.
# Give them the same fallback values so they are still drawn (in the
# 'Unknown' colour) and are labelled 'N/A' instead of 'nan'.
merged_boroughs['Overall Trend'] = merged_boroughs['Overall Trend'].fillna('Unknown')
merged_boroughs['Peak Period'] = merged_boroughs['Peak Period'].fillna('N/A')

# Step 5: Create the visualization
color_map = {
    'Decreasing': '#3182bd',
    'Increasing': '#E6564E',
    'Increasing then Decreasing': '#31a354',
    'Increasing then Stabilizing': '#dca20d',
    'Fluctuating': '#756bb1',
    'Unknown': '#bdbdbd'
}

fig, ax = plt.subplots(figsize=(30, 25))
# One plot call per trend category so each gets its own fill colour.
for ctype, data in merged_boroughs.groupby('Overall Trend'):
    color = color_map[ctype]
    data.plot(color=color, ax=ax, legend=True, edgecolor='black')

# Write the peak period at the centroid of every polygon
centroids = merged_boroughs.geometry.centroid
for x, y, label in zip(centroids.x, centroids.y, merged_boroughs['Peak Period']):
    ax.text(x, y, label, fontsize=15, fontweight='bold', va='center', ha='center', color='black')

# Custom legend: one round marker per trend category
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=fill, markersize=10, label=key)
    for key, fill in color_map.items()
]
ax.legend(handles=legend_elements, loc='lower right')

ax.set_title('Overall Crime Trend and Peak Periods in London Boroughs (2003-2023)', fontdict={'fontsize': '15', 'fontweight' : '3'})
ax.set_axis_off()
plt.show()


In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

# Map of the boroughs coloured by a hand-assigned overall trend and labelled
# with a hand-assigned peak period.

# Step 1: Load the borough polygons and the crime table
gdf_boroughs = gpd.read_file('statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp')
df_borough_data = pd.read_csv('MonthlyCrimeNum03-23.csv')

# Step 2: Total crime count per borough over the whole table.
# The identifier columns are dropped by name so that only count columns are
# summed; a positional slice starting at column 1 would also pull the
# borough code into the row sum.
df_borough_data['Total_Crime_03_23'] = (
    df_borough_data.drop(columns=['Borough_Name', 'Borough_code']).sum(axis=1)
)

# Step 3: Overall trend and peak period per borough, keyed by borough name
trend_peak = {
    'City of Westminster': ('Decreasing', '2019-08'),
    'Newham': ('Increasing then Decreasing', 'Mid 2019'),
    'Croydon': ('Increasing', 'End 2019'),
    'Southwark': ('Decreasing', '2016'),
    'Tower Hamlets': ('Increasing then Stabilizing', '2017'),
    'Hackney': ('Increasing then Decreasing', '2018'),
    'Lambeth': ('Increasing then Decreasing', 'Mid 2018'),
    'Camden': ('Increasing', '2019'),
    'Ealing': ('Increasing then Stabilizing', 'End 2017'),
    'Brent': ('Increasing then Decreasing', 'Mid 2018'),
    'Enfield': ('Increasing', '2019'),
    'Haringey': ('Increasing then Stabilizing', '2017'),
    'Lewisham': ('Fluctuating', '2018'),
    'Barnet': ('Increasing', 'End 2019'),
    'Hillingdon': ('Increasing then Decreasing', 'Mid 2018'),
    'Islington': ('Increasing', 'Mid 2018'),
    'Greenwich': ('Increasing', 'Start 2019'),
    'Wandsworth': ('Increasing then Decreasing', '2018'),
    'Hounslow': ('Increasing', 'End 2019'),
    'Waltham Forest': ('Fluctuating', 'Mid 2017'),
    'Redbridge': ('Increasing', 'End 2018'),
    'Bromley': ('Increasing', '2019')
}

# Boroughs missing from the dictionary fall back to ('Unknown', 'N/A').
# NOTE(review): lookups are by exact name - confirm the spellings here match
# the Borough_Name values in the CSV and the NAME values in the shapefile.
fallback = ('Unknown', 'N/A')
df_borough_data['Overall Trend'] = df_borough_data['Borough_Name'].map(lambda x: trend_peak.get(x, fallback)[0])
df_borough_data['Peak Period'] = df_borough_data['Borough_Name'].map(lambda x: trend_peak.get(x, fallback)[1])

# Step 4: Attach the crime table to the polygons by borough name
merged_boroughs = gdf_boroughs.set_index('NAME').join(df_borough_data.set_index('Borough_Name'))

# Polygons with no matching CSV row come out of the join with NaN here.
# Give them the same fallback values so they are still drawn (in the
# 'Unknown' colour) and are labelled 'N/A' instead of 'nan'.
merged_boroughs['Overall Trend'] = merged_boroughs['Overall Trend'].fillna('Unknown')
merged_boroughs['Peak Period'] = merged_boroughs['Peak Period'].fillna('N/A')

# Step 5: Create the visualization
color_map = {
    'Decreasing': '#3182bd',
    'Increasing': '#E6564E',
    'Increasing then Decreasing': '#31a354',
    'Increasing then Stabilizing': '#dca20d',
    'Fluctuating': '#756bb1',
    'Unknown': '#bdbdbd'
}

fig, ax = plt.subplots(figsize=(30, 25))
# One plot call per trend category so each gets its own fill colour.
for ctype, data in merged_boroughs.groupby('Overall Trend'):
    color = color_map[ctype]
    data.plot(color=color, ax=ax, legend=True, edgecolor='black')

# Write the peak period at the centroid of every polygon
centroids = merged_boroughs.geometry.centroid
for x, y, label in zip(centroids.x, centroids.y, merged_boroughs['Peak Period']):
    ax.text(x, y, label, fontsize=15, fontweight='bold', va='center', ha='center', color='black')

# Custom legend: one round marker per trend category
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=fill, markersize=10, label=key)
    for key, fill in color_map.items()
]
ax.legend(handles=legend_elements, loc='lower right')

ax.set_title('Overall Crime Trend and Peak Periods in London Boroughs (2003-2023)', fontdict={'fontsize': '15', 'fontweight' : '3'})
ax.set_axis_off()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Build a table with a three-step forecast for every borough.
# Missing values in the freshly loaded table are replaced with 0.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv").fillna(0)

# borough name -> array of forecast values
predictions = {}

for borough in df_new["Borough_Name"].unique():
    rows = df_new[df_new["Borough_Name"] == borough]

    # Wide -> long, then sum the counts that share the same "Year" label.
    # NOTE(review): "Year" is the last four characters of each column label;
    # whether that groups the columns by calendar year depends on the label
    # format - confirm.
    crime_counts = rows.melt(id_vars=["Borough_Name", "Borough_code"],
                             var_name="Year-Month",
                             value_name="Crime_number")
    crime_counts["Year"] = crime_counts["Year-Month"].str[-4:]
    crime_counts = crime_counts.groupby("Year").sum().reset_index().set_index("Year")

    # ARIMA(1,1,0) with no seasonal component
    model = SARIMAX(crime_counts["Crime_number"], order=(1,1,0))

    # Fit with the pandas bottleneck/numexpr accelerators switched off.
    with pd.option_context('compute.use_bottleneck', False, 'compute.use_numexpr', False):
        model_fit = model.fit(disp=0)

    # Forecast the next three steps and keep the point predictions.
    forecast_steps = 3
    forecast = model_fit.get_forecast(steps=forecast_steps, method='innovations')
    predictions[borough] = forecast.predicted_mean.values

# One column per borough, one row per forecast step, labelled by year.
forecast_df = pd.DataFrame(predictions)
forecast_df.index = ['2023', '2024', '2025']

forecast_df

In [None]:
# Widen the pandas display limits so the forecast table prints in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

print(forecast_df)

In [None]:
# One line per borough over the three forecast years: red when the 2025
# value is above the 2023 value, green otherwise.
plt.figure(figsize=(30, 15))

for borough in forecast_df.columns:
    borough_forecast = forecast_df[borough]
    is_rising = borough_forecast['2023'] < borough_forecast['2025']
    color = 'red' if is_rising else 'green'
    plt.plot(forecast_df.index, borough_forecast, label=borough, color=color, marker='o')

# Title, axis labels, legend placed beyond the right edge, dashed grid.
plt.title("Predicted Crime Numbers in London Boroughs (2023-2025)")
plt.xlabel("Year")
plt.ylabel("Predicted Crime Numbers")
plt.legend(loc="upper right", bbox_to_anchor=(1.15, 1), title="Boroughs")
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Load data. The melt below treats every column other than Borough_Name and
# Borough_code as one period's crime count.
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# Replace missing values with 0 (a missing cell is counted as zero crimes)
df_new.fillna(0, inplace=True)

# Container to hold forecasted data: one column per borough, one row per step
forecast_df = pd.DataFrame()

# Forecast for each borough and store the results
for borough in df_new["Borough_Name"].unique():
    # Selecting the borough
    borough_data = df_new[df_new["Borough_Name"] == borough]

    # Reshape wide -> long: one row per period with its crime count
    crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                     var_name="Year-Month",
                                     value_name="Crime_number")
    # The year is taken as the last four characters of the original column header
    crime_counts["Year"] = crime_counts["Year-Month"].str[-4:]
    # Yearly totals, indexed by year. Only Crime_number is aggregated: summing
    # the whole frame would also "sum" the non-count columns (Borough_Name,
    # Borough_code, Year-Month), which is wasted work at best and a TypeError
    # if one of them holds mixed types (e.g. a 0 introduced by fillna above).
    crime_counts = crime_counts.groupby("Year")[["Crime_number"]].sum()

    # Creating the ARIMA(1,1,0) model (no seasonal order is passed)
    model = SARIMAX(crime_counts["Crime_number"], order=(1,1,0))

    # Fitting the model (disp=0 silences the optimiser output)
    model_fit = model.fit(disp=0)

    # Predicting for the next 3 years
    forecast_steps = 3
    forecast = model_fit.get_forecast(steps=forecast_steps)

    # Storing the forecasted results (point forecasts only)
    forecast_df[borough] = forecast.predicted_mean.values

# Rename the index of forecast_df for clarity.
# NOTE(review): these labels assume the last year present in the CSV is 2022.
# If the file also contains 2023 columns, the three forecast steps are really
# 2024-2026 (and a possibly partial 2023 total is the last observation) - confirm.
forecast_df.index = ['2023', '2024', '2025']

# Plotting: one line per borough, each with its own hue from the "husl" palette
borough_colors = sns.color_palette("husl", len(forecast_df.columns))
fig, ax = plt.subplots(figsize=(20, 10))

for borough, base_color in zip(forecast_df.columns, borough_colors):
    predicted = forecast_df[borough]
    if predicted['2023'] < predicted['2025']:
        # Forecast rises from 2023 to 2025: use the borough's palette colour as is
        line_color = base_color
    else:
        # Flat or falling: use the second-to-last entry of seaborn's light
        # palette built from the borough's colour
        line_color = sns.light_palette(base_color)[-2]
    ax.plot(forecast_df.index, predicted, label=borough, color=line_color, lw=2.5)

# Titles, axis labels, legend anchored at the upper-left of (1.05, 1), dashed grid
ax.set_title("Predicted Crime Numbers in London Boroughs (2023-2025)", fontsize=18)
ax.set_xlabel("Year", fontsize=15)
ax.set_ylabel("Predicted Crime Numbers", fontsize=15)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# This cell's import lines only bring in pandas and matplotlib; import SARIMAX
# here so the cell does not depend on an earlier cell having been run.
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Read the data
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")

# Fill missing values with 0
df_new.fillna(0, inplace=True)

# Container for the forecast data: one column per borough
forecast_df = pd.DataFrame()

# Forecast every borough and store the result
for borough in df_new["Borough_Name"].unique():
    borough_data = df_new[df_new["Borough_Name"] == borough]
    # Reshape wide -> long: one row per period with its crime count
    crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                     var_name="Year-Month",
                                     value_name="Crime_number")
    crime_counts["Year"] = crime_counts["Year-Month"].str[-4:]  # Extract the year
    # Total crimes per year, indexed by year. Only Crime_number is aggregated:
    # summing the whole frame would also "sum" the non-count columns
    # (Borough_Name, Borough_code, Year-Month), which is wasted work at best and
    # a TypeError if one of them holds mixed types after the fillna(0) above.
    crime_counts = crime_counts.groupby("Year")[["Crime_number"]].sum()

    # Build and fit the SARIMAX model with order (1,1,0); no seasonal order is passed
    model = SARIMAX(crime_counts["Crime_number"], order=(1,1,0))
    model_fit = model.fit(disp=0)
    forecast = model_fit.get_forecast(steps=3)

    # Store the point forecasts
    forecast_df[borough] = forecast.predicted_mean.values

# Label the rows with the forecast years for clarity.
# NOTE(review): these labels assume the last year present in the CSV is 2022.
# If the file also contains 2023 columns, the three forecast steps are really
# 2024-2026 - confirm against the data.
forecast_df.index = ['2023', '2024', '2025']

# Change in the forecast between 2023 and 2025, per borough (2025 minus 2023)
difference = forecast_df.loc['2025'] - forecast_df.loc['2023']

# Colour map by sign: dark red when the 2025 forecast is higher than 2023,
# light coral when it is lower or equal.
colors = {
    borough: 'darkred' if diff > 0 else 'lightcoral'
    for borough, diff in difference.items()
}
bar_colors = [colors[name] for name in difference.index]

# Draw the bar chart
fig, ax = plt.subplots(figsize=(40,20))
difference.plot(kind='bar', color=bar_colors, ax=ax)
ax.set_title("Difference in Predicted Crime Numbers Between 2023 and 2025")
ax.set_xlabel("Boroughs")
ax.set_ylabel("Difference in Crime Numbers")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import seaborn as sns

# 1. Plot styling (the library imports are at the top of this cell)
sns.set_style("whitegrid")

# 2. Load the data; missing cells are counted as zero crimes
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")
df_new.fillna(0, inplace=True)

# 3. Aggregate to yearly totals and forecast three steps ahead for each borough
forecast_df = pd.DataFrame()

for borough in df_new["Borough_Name"].unique():
    borough_data = df_new[df_new["Borough_Name"] == borough]
    # Reshape wide -> long: one row per period with its crime count
    crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                     var_name="Year-Month",
                                     value_name="Crime_number")
    # The year is taken as the last four characters of the original column header
    crime_counts["Year"] = crime_counts["Year-Month"].str[-4:]
    # Yearly totals, indexed by year. Only Crime_number is aggregated: summing
    # the whole frame would also "sum" the non-count columns (Borough_Name,
    # Borough_code, Year-Month), which is wasted work at best and a TypeError
    # if one of them holds mixed types after the fillna(0) above.
    crime_counts = crime_counts.groupby("Year")[["Crime_number"]].sum()

    # ARIMA(1,1,0); no seasonal order is passed
    model = SARIMAX(crime_counts["Crime_number"], order=(1,1,0))
    # Fit with pandas' bottleneck/numexpr acceleration switched off for the
    # duration of the call
    with pd.option_context('compute.use_bottleneck', False, 'compute.use_numexpr', False):
        model_fit = model.fit(disp=0)
    forecast_steps = 3
    forecast = model_fit.get_forecast(steps=forecast_steps, method='innovations')
    # Keep only the point forecasts
    forecast_df[borough] = forecast.predicted_mean.values

# NOTE(review): these labels assume the last year present in the CSV is 2022.
# If the file also contains 2023 columns, the three forecast steps are really
# 2024-2026 - confirm against the data.
forecast_df.index = ['2023', '2024', '2025']

# 4. Bar chart of the 2025 forecast, one bar per borough
fig, ax = plt.subplots(figsize=(30, 15))
colors = sns.color_palette("tab20", len(forecast_df.columns))

for borough, base_color in zip(forecast_df.columns, colors):
    rising = forecast_df.loc['2023', borough] < forecast_df.loc['2025', borough]
    # Rising forecast keeps the borough's palette colour; otherwise entry 2 of
    # seaborn's light palette built from that colour is used.
    bar_color = base_color if rising else sns.light_palette(base_color)[2]
    ax.bar(borough, forecast_df.loc['2025', borough], color=bar_color, label=borough)

ax.set_title('Predicted Crime Numbers in London Boroughs for 2025')
ax.set_ylabel('Predicted Crime Numbers for 2025')
ax.legend(loc="upper right", bbox_to_anchor=(1.2, 1), title="Boroughs")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import seaborn as sns

# 1. Plot styling (the library imports are at the top of this cell)
sns.set_style("whitegrid")

# 2. Load the data; missing cells are counted as zero crimes
df_new = pd.read_csv("MonthlyCrimeNum03-23.csv")
df_new.fillna(0, inplace=True)

# 3. Aggregate to yearly totals and forecast three steps ahead for each borough
forecast_df = pd.DataFrame()

for borough in df_new["Borough_Name"].unique():
    borough_data = df_new[df_new["Borough_Name"] == borough]
    # Reshape wide -> long: one row per period with its crime count
    crime_counts = borough_data.melt(id_vars=["Borough_Name", "Borough_code"],
                                     var_name="Year-Month",
                                     value_name="Crime_number")
    # The year is taken as the last four characters of the original column header
    crime_counts["Year"] = crime_counts["Year-Month"].str[-4:]
    # Yearly totals, indexed by year. Restricting the aggregation to
    # Crime_number avoids also "summing" Borough_Name, Borough_code and
    # Year-Month, which is wasted work at best and a TypeError if one of those
    # columns holds mixed types after the fillna(0) above.
    crime_counts = crime_counts.groupby("Year")[["Crime_number"]].sum()

    # ARIMA(1,1,0); no seasonal order is passed
    model = SARIMAX(crime_counts["Crime_number"], order=(1,1,0))
    # Fit with pandas' bottleneck/numexpr acceleration switched off for the
    # duration of the call
    with pd.option_context('compute.use_bottleneck', False, 'compute.use_numexpr', False):
        model_fit = model.fit(disp=0)
    forecast_steps = 3
    forecast = model_fit.get_forecast(steps=forecast_steps, method='innovations')
    # Keep only the point forecasts
    forecast_df[borough] = forecast.predicted_mean.values

# NOTE(review): these labels assume the last year present in the CSV is 2022.
# If the file also contains 2023 columns, the three forecast steps are really
# 2024-2026 - confirm against the data.
forecast_df.index = ['2023', '2024', '2025']

# 4. Bar chart of the 2025 forecast, one bar per borough, shaded by trend
fig, ax = plt.subplots(figsize=(30, 15))
base_colors = sns.color_palette("tab20", len(forecast_df.columns))

for borough, base_color in zip(forecast_df.columns, base_colors):
    value_2025 = forecast_df.loc['2025', borough]
    if forecast_df.loc['2023', borough] < value_2025:
        # Rising forecast: entry 4 of seaborn's dark palette for this colour
        bar_color = sns.dark_palette(base_color)[4]
    else:
        # Flat or falling forecast: entry 2 of seaborn's light palette
        bar_color = sns.light_palette(base_color)[2]
    ax.bar(borough, value_2025, color=bar_color, label=borough)

ax.set_title('Predicted Crime Numbers in London Boroughs for 2025 with Color Gradient Indicating Trend')
ax.set_ylabel('Predicted Crime Numbers for 2025')
ax.legend(loc="upper right", bbox_to_anchor=(1.2, 1), title="Boroughs")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
