In [None]:
#Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

In [None]:
# Crime data: read the semicolon-separated police-recorded crime file, keep the
# ward name and the overall crime rate, then average that rate within each ward.
# Note: the "Ward Name " column label carries a trailing space in this file.
crime = (
    pd.read_csv(
        "crime-recorded-by-police-by-selected-offence-groups-in-bristol-by-ward.csv",
        sep=";",
    )
    .loc[:, ["Ward Name ", "All Crimes (rate per 1000 ward population)"]]
    .groupby(["Ward Name "])
    .mean()
)
crime

In [None]:
# Which wards have the highest crime rates? One horizontal bar per ward.
sb.set_style("whitegrid")
# figsize is given in inches (width, height).
# NOTE(review): 16.5 x 11.7 in is A3 landscape rather than A4 - confirm the intended size.
a4_dims = (16.5, 11.7)
fig, ax = plt.subplots(figsize=a4_dims)
sb.barplot(
    data=crime,
    x="All Crimes (rate per 1000 ward population)",
    y=crime.index,  # ward names live in the index after the groupby
    color="steelblue",
    ci=None,
    ax=ax,  # draw on the axes created above (same axes pyplot would pick implicitly)
)

In [None]:
# NEET data: percentage of 16-17 year olds who are NEET or whose activity is not
# known. Only the ward name and that percentage are kept.
# NOTE(review): no per-ward aggregation is applied here - presumably the file
# holds one row per ward; verify before merging.
NEETS = pd.read_csv(
    "16-17-year-olds-neet-or-whose-activity-is-not-known-in-bristol.csv",
    sep=";",
).loc[:, ["Ward Name ", "% NEET or Not Known 16-17 year olds"]]
NEETS

In [None]:
# Deprivation data: keep the ward label and the Index of Multiple Deprivation
# score, then average the score within each 2016 ward.
# NOTE(review): the averaging suggests the file is finer-grained than wards - confirm.
dep = (
    pd.read_csv("deprivation-in-bristol-2019.csv", sep=";")
    .loc[:, ["2016 Ward", "Index of Multiple Deprivation Score"]]
    .groupby(["2016 Ward"])
    .mean()
)
dep

In [None]:
# Disability data: percentage of people whose day-to-day activities are limited,
# alongside the 2016 ward name.
dis = pd.read_csv(
    "disability-2016-ward.csv",
    sep=";",
).loc[:, ["2016 Ward Name", "% People whose day-to-day activities are limited"]]
dis

In [None]:
# Ethnicity data: percentage of residents in a Black and Minority Ethnic group.
# Note: the ward column here is "2016 Ward name" (lower-case "name"), which
# differs from the "2016 Ward Name" label used by the disability and
# overcrowding files.
eth = pd.read_csv(
    "ethnicity.csv",
    sep=";",
).loc[:, ["2016 Ward name", "% Black and Minority Ethnic Group"]]
eth

In [None]:
# Overcrowding data: percentage of households that are overcrowded, by 2016 ward.
oc = pd.read_csv(
    "household-size-and-bedrooms-2011-census-by-2016-ward.csv",
    sep=";",
).loc[:, ["2016 Ward Name", "% households that are overcrowded"]]
oc

In [None]:
# Build the modelling table by joining every dataset on ward name.
# All merges use pandas' default inner join, so a ward only survives if its name
# matches exactly in every file.
# NOTE(review): compare len(data) with the number of wards to make sure none are
# silently dropped by spelling or whitespace differences.

# crime is indexed by ward; NEETS carries the ward as a column.
df1 = crime.merge(NEETS, left_index=True, right_on="Ward Name ")
# dep is indexed by "2016 Ward", which merge accepts as a key by index-level name.
df2 = df1.merge(dep, left_on="Ward Name ", right_on="2016 Ward")
# From here on "2016 Ward Name" is the join key, so the original key is dropped.
df3 = (
    df2
    .merge(dis, left_on="Ward Name ", right_on="2016 Ward Name")
    .drop(columns="Ward Name ")
)
df4 = (
    df3
    .merge(eth, left_on="2016 Ward Name", right_on="2016 Ward name")
    .drop(columns="2016 Ward name")
)
# Final table holds numeric columns only: the ward label is removed after the last join.
data = df4.merge(oc, on="2016 Ward Name").drop(columns="2016 Ward Name")
data

In [None]:
# Summary statistics for every column of the merged table
# (the crime-rate target and the ward-level indicators).
data.describe()

In [None]:
# Pairwise scatter plots of all columns, to eyeball relationships between the
# crime rate and each indicator before modelling.
sb.pairplot(data)

In [None]:
# Pairwise correlation between all columns of the merged table
# (DataFrame.corr computes Pearson correlation by default).
corr_matrix=data.corr()
corr_matrix

In [None]:
# Model 1: ordinary least-squares linear regression

# Target: overall crime rate per ward.
y = data["All Crimes (rate per 1000 ward population)"]
# Predictors: three of the merged indicators (the disability and overcrowding
# columns are not used).
X = data[
    [
        "% NEET or Not Known 16-17 year olds",
        "Index of Multiple Deprivation Score",
        "% Black and Minority Ethnic Group",
    ]
]

lr = LinearRegression()

# Shuffled 8-fold cross-validation with a fixed seed so the folds are repeatable.
kfold = KFold(n_splits=8, shuffle=True, random_state=7)
lr_results = cross_val_score(
    lr, X, y, scoring="neg_root_mean_squared_error", cv=kfold
)
# The scorer returns negated RMSE, hence the sign flip on the mean.
print(
    "Mean root mean squared error is: ",
    -np.round(lr_results.mean(), 2),
    "Standard deviation is: ",
    np.round(lr_results.std(), 2),
)


In [None]:
# Refit the linear model on all wards and show the magnitude of each coefficient,
# in the same order as the columns of X.
# NOTE(review): the predictors are not standardised, so these magnitudes depend
# on each column's units and are not directly comparable as "importance".
lr.fit(X, y)
coefficients = lr.coef_
importance = np.absolute(coefficients)
print(importance)

In [None]:
# Model 2: ridge regression, tuned and evaluated with nested cross-validation

# Inner folds: used by the grid search to choose alpha.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

ridge = linear_model.Ridge()

# Candidate regularisation strengths.
params = {"alpha": [0.1, 1, 10]}

# refit=True retrains the best alpha on the whole training split of each outer fold.
model = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    cv=cv_inner,
    scoring='neg_root_mean_squared_error',
    refit=True,
)

# Outer folds: estimate the generalisation error of the whole tuning procedure.
cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

ridge_results = cross_val_score(
    model, X, y, cv=cv_outer, scoring='neg_root_mean_squared_error'
)
# Scores are negated RMSE, hence the sign flip on the mean.
print(
    "Mean root mean squared error is: ",
    -np.round(ridge_results.mean(), 2),
    "Standard deviation is: ",
    np.round(ridge_results.std(), 2),
)



In [None]:
# Run the ridge grid search on all wards, then show the absolute coefficients of
# the refitted best model (same order as the columns of X).
model.fit(X, y)
coefficients = model.best_estimator_.coef_
importance = np.absolute(coefficients)
print(importance)

In [None]:
#Model 3 Lasso regression
# Nested cross-validation: the inner folds choose alpha, the outer folds estimate
# the error of the whole tuning procedure.

# Inner folds used by the grid search.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

lasso = linear_model.Lasso()

# Candidate regularisation strengths.
params ={"alpha":[1, 0.1, 0.01,0.001]}

# refit=True retrains the best alpha on the whole training split of each outer fold.
lasso2 = GridSearchCV(lasso, params, scoring='neg_root_mean_squared_error', cv=cv_inner, refit=True)

# Outer folds used for evaluation.
cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

# Evaluate with this cell's own cv_outer instead of a splitter defined in another
# cell, so the cell does not depend on which other cells have been run.
lasso_results = cross_val_score(lasso2, X, y, scoring="neg_root_mean_squared_error", cv=cv_outer)
# Scores are negated RMSE, hence the sign flip on the mean.
print("Mean root mean squared error is: ",-np.round(lasso_results.mean(),2),"Standard deviation is: ", np.round(lasso_results.std(),2))

In [None]:
# Run the lasso grid search on all wards, then show the absolute coefficients of
# the refitted best model (same order as the columns of X).
lasso2.fit(X, y)
coefficients = lasso2.best_estimator_.coef_
importance = np.absolute(coefficients)
print(importance)

In [None]:
#Model 4 Decision Tree
# Nested cross-validation: a randomised search over tree hyper-parameters runs on
# the inner folds, and the outer folds estimate the error of the tuned model.

# Inner folds used by the randomised search (shuffled, fixed seed).
cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

# Seeded so that splitter="random" and feature sub-sampling give repeatable trees.
DT = DecisionTreeRegressor(random_state=7)

# "auto" is not offered for max_features: scikit-learn 1.3 removed it, and for a
# regression tree it meant "use all features", which None already covers.
random_grid={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
            "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
            "min_weight_fraction_leaf":[0.0,0.5],
            "max_features":["log2","sqrt",None],
            "max_leaf_nodes":[None,10,20,30]}

# The search optimises the same metric that the outer loop reports (negated RMSE)
# and splits with cv_inner, matching the ridge/lasso/KNN cells.
model2 = RandomizedSearchCV(estimator = DT, param_distributions = random_grid, n_iter = 100,
                            scoring='neg_root_mean_squared_error', cv = cv_inner,
                            random_state=7, n_jobs = -1)

# Outer folds used for evaluation.
cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

DT_results = cross_val_score(model2, X, y, scoring='neg_root_mean_squared_error', cv=cv_outer)
# Scores are negated RMSE, hence the sign flip on the mean.
print("Mean root mean squared error is: ",-np.round(DT_results.mean(),2),"Standard deviation is: ", np.round(DT_results.std(),2))


In [None]:
# Run the decision-tree search on all wards and list the feature importances of
# the best tree; the feature number is the column position in X.
model2.fit(X, y)
coefficients = model2.best_estimator_.feature_importances_
for idx, score in enumerate(coefficients):
    print(f'Feature: {idx:d}, Score: {score:.5f}')

In [None]:
#Model 5 Random Forest Regression
# Nested cross-validation: a randomised search over forest hyper-parameters runs
# on the inner folds, and the outer folds estimate the error of the tuned model.
# This cell is expensive: 8 outer folds x 100 candidates x 3 inner folds, with up
# to 1000 trees per fit.

# Inner folds used by the randomised search (shuffled, fixed seed).
cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

# Seeded so that bootstrapping and feature sub-sampling give repeatable forests.
RF = RandomForestRegressor(random_state=7)

# max_features uses None instead of "auto": scikit-learn 1.3 removed "auto", and
# for a regression forest it meant "use all features", which None also means.
random_grid={"n_estimators":[50,100,200,400,500,1000],
             "max_features": [None, "sqrt"],
             "max_depth":[1,3,5,7,9,11,12],
             "min_samples_split":[2, 5, 10],
             "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
             "bootstrap": [True, False]}

# The search optimises the same metric that the outer loop reports (negated RMSE)
# and splits with cv_inner, matching the ridge/lasso/KNN cells.
model3 = RandomizedSearchCV(estimator = RF, param_distributions = random_grid, n_iter = 100,
                            scoring="neg_root_mean_squared_error", cv = cv_inner,
                            random_state=7, n_jobs = -1)

# Outer folds used for evaluation.
cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

RF_results = cross_val_score(model3, X, y, scoring="neg_root_mean_squared_error", cv=cv_outer)
# Scores are negated RMSE, hence the sign flip on the mean.
print("Mean root mean squared error is: ",-np.round(RF_results.mean(),2),"Standard deviation is: ", np.round(RF_results.std(),2))

In [None]:
# Run the random-forest search on all wards and list the feature importances of
# the best forest; the feature number is the column position in X.
model3.fit(X, y)
coefficients2 = model3.best_estimator_.feature_importances_
for idx, score in enumerate(coefficients2):
    print(f'Feature: {idx:d}, Score: {score:.5f}')

In [None]:
# Hyper-parameter combination selected by the random-forest search fitted on all wards.
print(f'Best Hyperparameters: {model3.best_params_}')

In [None]:
# Model 6: k-nearest-neighbours regression, tuned and evaluated with nested CV
# NOTE(review): the predictors are not scaled before the distance computation, so
# the column with the largest numeric range may dominate - confirm this is intended.

# Inner folds: used by the grid search to choose the hyper-parameters.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

KNN = KNeighborsRegressor()

# p selects the Minkowski distance: 1 = Manhattan, 2 = Euclidean.
params = {
    "leaf_size": [1, 3],
    "n_neighbors": [2, 3, 4, 5],
    "p": [1, 2],
}

# refit=True retrains the best combination on the whole training split of each outer fold.
model4 = GridSearchCV(
    estimator=KNN,
    param_grid=params,
    cv=cv_inner,
    scoring="neg_root_mean_squared_error",
    refit=True,
)

# Outer folds: estimate the generalisation error of the whole tuning procedure.
cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

KNN_results = cross_val_score(
    model4, X, y, cv=cv_outer, scoring="neg_root_mean_squared_error"
)
# Scores are negated RMSE, hence the sign flip on the mean.
print(
    "Mean root mean squared error is: ",
    -np.round(KNN_results.mean(), 2),
    "Standard deviation is: ",
    np.round(KNN_results.std(), 2),
)
