# Вступление:
# **Преступность и недвижимость**

Смысл проекта заключается в анализе уровня преступности и рынка недвижимости в районах Хартфорда. Рассмотрим корреляцию уровня криминала и количества продоваемой и строящейся недвижимостисти (цены на нее), визуализируем данные по-районно на карте Хартфорада, проанализируем "настроения" в текстовых источниках относительно этих районов на тему преступности и цен на жилье.

# 1
С помощью веб-скраппинга посмотрим, как описываются районы Хартфорда в текстовых источниках и оценим "настроение" этих текстов:

In [None]:
import requests
from bs4 import BeautifulSoup


neighborhood_urls = [
  'https://www.neighborhoodscout.com/ct/hartford/university-hartford#crime',
  'https://www.neighborhoodscout.com/ct/hartford/blue-hills-south#crime',
  'https://www.neighborhoodscout.com/ct/hartford/behind-rocks-southwest#crime',
  'https://www.neighborhoodscout.com/ct/hartford/blue-hills#crime',
  'https://www.neighborhoodscout.com/ct/hartford/cabot-st#crime',
  'https://www.neighborhoodscout.com/ct/hartford/parkville-south#crime',
  'https://www.neighborhoodscout.com/ct/hartford/keney-park#crime',
  'https://www.neighborhoodscout.com/ct/hartford/barry-square-west#crime',
  'https://www.neighborhoodscout.com/ct/hartford/southwest#crime',
  'https://www.neighborhoodscout.com/ct/hartford/asylum-hill-south#crime'
]

data = {}

for url in neighborhood_urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('title').text
    main_text = soup.find_all('p')
    alltext = ''
    for text in main_text:
        alltext+=(text.get_text())
    data[title] = alltext

In [None]:
# теперь анализируем описание этих районов с помощью NLP 
from textblob import TextBlob
polarhood = {}
for text in data:
      blob = TextBlob(data[text])
      sentiment = blob.sentiment.polarity
      polarhood[text] = sentiment

In [None]:
polarhood

{'University of Hartford Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.1690532964303456,
 'Blue Hills South Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.1641109668109668,
 'Behind the Rocks Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout': 0.11377195806947875,
 'Blue Hills Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.12190655145379554,
 'Cabot St & Albany Ave Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.1545229962715512,
 'Parkville South Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout': 0.16303183813822109,
 'Keney Park Hartford, CT 06120, Neighborhood Profile - NeighborhoodScout': 0.17257362467216478,
 'Barry Square West Hartford, CT 06114, Neighborhood Profile - NeighborhoodScout': 0.1530247828234405,
 'Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout': 0.15034483965454112,
 'Asylum Hill South Hartford, CT 06105, Neighborhood Profile - Neighborhood

In [None]:
sorted_polarhood = dict(sorted(polarhood.items(), key=lambda x: x[1], reverse=True))

for key, value in sorted_polarhood.items():
    print(f"{key}: {value}")

Asylum Hill South Hartford, CT 06105, Neighborhood Profile - NeighborhoodScout: 0.18547739969527677
Keney Park Hartford, CT 06120, Neighborhood Profile - NeighborhoodScout: 0.17257362467216478
University of Hartford Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.1690532964303456
Blue Hills South Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.1641109668109668
Parkville South Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout: 0.16303183813822109
Cabot St & Albany Ave Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.1545229962715512
Barry Square West Hartford, CT 06114, Neighborhood Profile - NeighborhoodScout: 0.1530247828234405
Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout: 0.15034483965454112
Blue Hills Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.12190655145379554
Behind the Rocks Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout: 0.11377195806947875


# 2
Мы получаем отсортированный список районов. Чем больше "положительность", тем лучше район. Тем не менее, из-за того, что эта оценка была произведена, в целом, не относительно какого-то критерия. Мы не можем исходя из нее сделать ответ относиельно, например, безопасности района.

Поэтому я нашла большую базу данных с записью всех преступлений от 2005 до 2021. Сделаем рейтинг безопасных районов.

In [None]:
import pandas as pd

In [None]:
dataa = pd.read_csv('Police_Incidents_01012005_to_05182021.csv')

In [None]:
dataa

Unnamed: 0,Case_Number,Date,Time_24HR,Address,UCR_1_Category,UCR_1_Description,UCR_1_Code,UCR_2_Category,UCR_2_Description,UCR_2_Code,Neighborhood,geom
0,21013791,05/10/2021,1641,403 GARDEN ST,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3221,,,0,CLAY-ARSENAL,"(41.780238042803745, -72.68497435174203)"
1,21014071,05/13/2021,245,59 ELLINGTON ST,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3261,24* - MOTOR VEHICLE LAWS,EVADING RESP,2401,BEHIND THE ROCKS,"(41.74625648731947, -72.70484012171347)"
2,20036741,11/29/2020,1703,267 ZION ST,31* - PERSONAL INJURY ACCIDENT,PERS INJ ACC,3124,23* - DRIVING LAWS,FOLL TOO CLOSE,2334,BEHIND THE ROCKS,"(41.74850755091766, -72.69411393999614)"
3,21013679,05/09/2021,2245,HOMESTEAD AV & WOODLAND ST,31* - PERSONAL INJURY ACCIDENT,PERS INJ ACC,3124,23* - DRIVING LAWS,TRAVELING TOO FAST,2327,UPPER ALBANY,"(41.778689832211015, -72.69776621329845)"
4,21014070,05/13/2021,240,BENTON ST & WEBSTER ST,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3221,,,0,BARRY SQUARE,"(41.74653366174123, -72.68316706252509)"
...,...,...,...,...,...,...,...,...,...,...,...,...
708980,9000978,01/09/2009,955,219 ZION ST,34* - OTHER ACCIDENT,HAZARDOUS CONDITION,3490,,,0,BEHIND THE ROCKS,"(41.747163627033345, -72.69420967955826)"
708981,13014833,05/07/2013,1139,150 WARD ST,24* - MOTOR VEHICLE LAWS,OP UNREG M/V,2414,24* - MOTOR VEHICLE LAWS,TOWED VEHICLE,2430,FROG HOLLOW,"(41.75622433710221, -72.68959981062677)"
708982,6011544,03/18/2006,1155,334 FRANKLIN AV,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3224,23* - DRIVING LAWS,IMPRP LANE CHANGE,2344,SOUTHEND,"(41.74104529114852, -72.67573171674711)"
708983,12033004,09/19/2012,1206,ASHLEY ST & SIGOURNEY ST,39* - ANIMAL COMPLAINT,ANIMAL BITE,3904,,,0,ASYLUM HILL,"(41.774504796809694, -72.69245815646983)"


In [None]:
pivot_table = dataa.groupby(['Neighborhood', 'UCR_1_Category']).size().unstack().reset_index()

# Заменяем NaN значения на 0 (предполагая, что NaN означает отсутствие преступлений данного типа)
pivot_table = pivot_table.fillna(0)

# Переименовываем столбцы
pivot_table.columns.name = None

# Переименовываем индекс
pivot_table.reset_index(drop=True, inplace=True)

In [None]:
corr_matrix = pivot_table.corr().stack()

corr_matrix = corr_matrix[corr_matrix.index.get_level_values(0) != corr_matrix.index.get_level_values(1)]

corr_matrix = corr_matrix[corr_matrix.index.get_level_values(0) < corr_matrix.index.get_level_values(1)]

# Сортируем по абсолютному значению корреляции
corr_matrix = corr_matrix.abs().sort_values(ascending=False)

# Выводим 10 самых коррелируемых категорий преступлений
print('Самые коррелируемые категории преступлений:')
print(corr_matrix.head(20))

# Выводим 10 самых не коррелируемых категорий преступлений
print('\nСамые не коррелируемые категории преступлений:')
print(corr_matrix.tail(20))


Самые коррелируемые категории преступлений:
35* - MISC. CRIMES AGAINST PROPERTY  42* - CARE FOR SICK                    0.976218
19* - CRIMES AGAINST THE PUBLIC      42* - CARE FOR SICK                    0.970195
15* - FAMILY OFFENSES                20* - RADIO SIGNAL                     0.969238
17* - LIQUOR LAWS                    44* - MISC. WANTS                      0.964995
04* - AGGRAVATED ASSAULT             08* - SIMPLE ASSAULT                   0.960264
19* - CRIMES AGAINST THE PUBLIC      35* - MISC. CRIMES AGAINST PROPERTY    0.959203
05* - BURGLARY                       07* - MOTOR VEHICLE THEFT              0.957895
03* - ROBBERY                        08* - SIMPLE ASSAULT                   0.954883
11* - STOLEN PROPERTY                24* - MOTOR VEHICLE LAWS               0.953667
03* - ROBBERY                        20* - RADIO SIGNAL                     0.952921
08* - SIMPLE ASSAULT                 29* - FOUND PERSON/PROPERTY            0.951064
04* - AGGRAVATED ASSA

  corr_matrix = pivot_table.corr().stack()


In [None]:
correlations = pivot_table.corr()

sum_of_correlations = correlations.sum()

# Сортируем категории по сумме корреляций
sorted_sum_of_correlations = sum_of_correlations.sort_values(ascending=False)

print(sorted_sum_of_correlations)


08* - SIMPLE ASSAULT                   35.477176
42* - CARE FOR SICK                    35.096257
35* - MISC. CRIMES AGAINST PROPERTY    34.706404
03* - ROBBERY                          34.505840
19* - CRIMES AGAINST THE PUBLIC        34.177491
04* - AGGRAVATED ASSAULT               33.552232
20* - RADIO SIGNAL                     33.407618
51* - MISC. MANAGEMENT INFO.           33.312521
29* - FOUND PERSON/PROPERTY            33.287536
28* - MISSING PERSON/PROPERTY          32.555622
15* - FAMILY OFFENSES                  32.262806
11* - STOLEN PROPERTY                  31.966589
31* - PERSONAL INJURY ACCIDENT         31.814613
24* - MOTOR VEHICLE LAWS               31.512476
37* - FIRE-RELATED                     31.500444
12* - WEAPONS OFFENSES                 30.525181
34* - OTHER ACCIDENT                   30.272905
43* - MENTAL CASE                      30.029932
53* - LANDLORD-TENANT                  29.589828
25* - CITY ORDINANCES                  29.355498
07* - MOTOR VEHICLE 

  correlations = pivot_table.corr()


Видно, самый коррелируемый признак "simple assault". То есть чем больше зафиксированных таких престплуний, тем больше и других видов престплений. Поэтому мы можем, ориентируясь по этому виду, отранжировать города по безопасности. То есть сделать вывод о безопасности в целом.

In [None]:
sorted_table = pivot_table.sort_values('08* - SIMPLE ASSAULT', ascending=False)

In [None]:
sorted_table

Unnamed: 0,Neighborhood,01* - HOMICIDE,03* - ROBBERY,04* - AGGRAVATED ASSAULT,05* - BURGLARY,06* - LARCENY,07* - MOTOR VEHICLE THEFT,08* - SIMPLE ASSAULT,09* - FORGERY/COUNTERFEITING,10* - FRAUD/EMBEZZLEMENT/EXTORTION,...,44* - MISC. WANTS,46* - JUVENILE-RELATED,47* - CONNECTING CASE,49* - SHOOTING,51* - MISC. MANAGEMENT INFO.,52* - SHOTS FIRED,5210 - SHOTS FIRED CONFIRMED,5211 - SHOTS FIRED UNCONFIRMED,53* - LANDLORD-TENANT,55* - REPORT-RELATED
8,NORTHEAST,80.0,935.0,2130.0,1782.0,4439.0,1748.0,4405.0,101.0,108.0,...,818.0,118.0,0.0,5.0,3541.0,2868.0,14.0,47.0,158.0,835.0
1,BARRY SQUARE,45.0,995.0,1381.0,1819.0,5833.0,2009.0,3925.0,80.0,108.0,...,602.0,633.0,0.0,2.0,2611.0,1497.0,7.0,6.0,151.0,768.0
0,ASYLUM HILL,32.0,912.0,1292.0,1267.0,6051.0,1733.0,3755.0,125.0,164.0,...,601.0,335.0,0.0,4.0,4218.0,856.0,3.0,14.0,103.0,1246.0
6,FROG HOLLOW,29.0,923.0,1267.0,1267.0,5161.0,1204.0,3233.0,114.0,114.0,...,939.0,139.0,0.0,8.0,3087.0,1435.0,6.0,14.0,107.0,861.0
4,CLAY-ARSENAL,48.0,629.0,1350.0,861.0,2967.0,906.0,3167.0,101.0,84.0,...,703.0,903.0,0.0,2.0,2995.0,1254.0,4.0,10.0,79.0,675.0
15,UPPER ALBANY,52.0,619.0,1346.0,811.0,2515.0,870.0,2820.0,117.0,114.0,...,450.0,424.0,0.0,3.0,1922.0,1267.0,14.0,15.0,120.0,571.0
5,DOWNTOWN,12.0,550.0,715.0,474.0,7093.0,763.0,2515.0,148.0,202.0,...,3625.0,196.0,0.0,1.0,2004.0,212.0,1.0,4.0,17.0,1577.0
13,SOUTHEND,26.0,752.0,847.0,1776.0,4919.0,1745.0,2434.0,154.0,204.0,...,370.0,301.0,0.0,5.0,1764.0,711.0,2.0,0.0,98.0,572.0
3,BLUE HILLS,24.0,346.0,578.0,846.0,2269.0,784.0,1964.0,77.0,69.0,...,234.0,393.0,1.0,1.0,1433.0,941.0,2.0,9.0,52.0,372.0
2,BEHIND THE ROCKS,17.0,346.0,681.0,1013.0,6087.0,1236.0,1956.0,57.0,72.0,...,450.0,53.0,0.0,1.0,1492.0,817.0,2.0,4.0,63.0,505.0


# 3
Видно, что по этому признаку самый опасный район Northeast. Объясним этот исход событий с помощью "зонирования". 

Посмотрим к какой "зоне" города относится этот район. Для этого используем новую базу данных и соотнесем данные районов с данными зон города.

In [30]:
# новая таблица данных
zones

Unnamed: 0,OBJECTID,LABEL,ACRES,GlobalID,SHAPESTArea,SHAPESTLength,geometry
0,17112,N-3-1,9.11974626,{E3EE6A28-6C36-48B3-876A-E42AD43B7410},397254.557983398,3095.85946399691,
1,17113,N-3-1,8.12185791,{559FB82C-23FB-4353-B8DB-87CB7AA5C197},353786.715332031,3157.7131618503,
2,17114,N-3-1,7.68425352,{8172198C-44A3-4671-8290-FBF8201B22CC},334724.74432373,3260.78768503537,
3,17115,N-3-1,0.75794007,{38CAD916-AE4A-4B18-88E5-4EC6A5D799DF},33015.7373657227,794.288899043288,
4,17116,N-3-1,2.9593989,{3194A4FC-B023-4D51-8E5D-2FC0E21948F5},128910.900512695,2021.78429831719,
...,...,...,...,...,...,...,...
2020,19132,MX-2,16.55334082,{8DE358B2-E695-4A96-98D7-AF606E07966D},185638.308898926,2263.79061464701,
2021,19133,MX-1,16.55334082,{D6EE9C97-178D-40FD-B3CF-781D525A913B},194756.109375,1893.41854955068,
2022,19134,MX-1,16.55334082,{7D1B39B5-194C-42F0-9117-7361FD947084},152505.217712402,1991.58948581149,
2023,19135,MX-2,16.55334082,{7410D2AC-FE2A-4075-ABF3-5E1A9D3EF18A},188161.010986328,2056.98885801392,


In [2]:
pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopandas
  Downloading geopandas-0.13.2-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fiona>=1.8.19 (from geopandas)
  Downloading Fiona-1.9.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyproj>=3.0.1 (from geopandas)
  Downloading pyproj-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
Collecting click-plugins>=1.0 (from fiona>=1.8.19->geopandas)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5 (from fiona>=1.8.19->geopanda

In [60]:
import requests
import pandas as pd

# ArcGIS
arcgis_url = "https://gis1.hartford.gov/arcgis/rest/services/OpenData_Housing_Development/MapServer/4"
params = {
    'f': 'json',
    'where': '1=1',  
    'outFields': '*',  
}
response = requests.get(arcgis_url, params = params)
data_arcgis = response.json()

# Socrata
socrata_url = "https://data.hartford.gov/resource/889t-nwfu.json"
params = {
    '$limit': 5000,
    '$$app_token': 'APP_TOKEN',  
}
response = requests.get(socrata_url)
data_socrata = response.json()

df_arcgis = pd.DataFrame(data_arcgis)
df_socrata = pd.DataFrame(data_socrata)


ValueError: ignored

In [50]:
df_socrata

Unnamed: 0,case_number,date,time_24hr,address,ucr_1_category,ucr_1_description,ucr_1_code,ucr_2_code,neighborhood,geom,:@computed_region_ugzy_ysqh,:@computed_region_35zh_8fi2,:@computed_region_2vdc_22if,:@computed_region_haf6_6xye,ucr_2_category,ucr_2_description
0,5000024,2005-01-01T00:00:00.000,0000,115 ASYLUM ST,55* - REPORT-RELATED,NO CASE INFO - UNABLE TO,5510,0,DOWNTOWN,"{'latitude': '41.766946453488366', 'longitude'...",19,10,15050,1041,,
1,9010396,2005-01-01T00:00:00.000,0000,56 VINE ST,55* - REPORT-RELATED,CASE DRAWN IN ERROR,5520,0,UPPER ALBANY,"{'latitude': '41.78097081523108', 'longitude':...",15,13,18493,1041,,
2,5001381,2005-01-01T00:00:00.000,0000,161 ENFIELD ST,29* - FOUND PERSON/PROPERTY,M-V-S-O-T-R-L,2905,0,NORTHEAST,"{'latitude': '41.78614510044552', 'longitude':...",16,16,18493,1041,,
3,5000084,2005-01-01T00:00:00.000,0001,127 IRVING ST,34* - OTHER ACCIDENT,OCC-INJ-POLICE,3440,0,UPPER ALBANY,"{'latitude': '41.78011225750916', 'longitude':...",15,13,18493,1041,,
4,5029127,2005-01-01T00:00:00.000,0001,14 GILMAN ST,19* - CRIMES AGAINST THE PUBLIC,SIMPLE TRESSPASS,1909,2418,SOUTHEND,"{'latitude': '41.737652996541435', 'longitude'...",21,2,18494,1041,24* - MOTOR VEHICLE LAWS,MISUSE OF PLATES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,5001075,2005-01-08T00:00:00.000,1450,126 SOUTH ST,35* - MISC. CRIMES AGAINST PROPERTY,CR MISCHIEF 2,3502,0,SOUTHEND,"{'latitude': '41.735411974808876', 'longitude'...",20,2,18494,1041,,
996,5001313,2005-01-08T00:00:00.000,1500,26 CONGRESS ST,19* - CRIMES AGAINST THE PUBLIC,BREACH-PEACE,1901,1904,SOUTH GREEN,"{'latitude': '41.75445662917266', 'longitude':...",8,6,18494,1041,19* - CRIMES AGAINST THE PUBLIC,DOMESTIC
997,5001076,2005-01-08T00:00:00.000,1500,1307 MAIN ST,24* - MOTOR VEHICLE LAWS,FT CARRY INSURANCE,2417,2430,DOWNTOWN,"{'latitude': '41.772245875611674', 'longitude'...",19,10,15050,1041,24* - MOTOR VEHICLE LAWS,TOWED VEHICLE
998,5001056,2005-01-08T00:00:00.000,1500,45 EVERGREEN AV,06* - LARCENY,LARC4-M/V PART-STO-PLATE,645,0,WESTEND,"{'latitude': '41.76469392621961', 'longitude':...",35,14,15051,1041,,


In [61]:
df_arcgis

Unnamed: 0,error
code,400
details,[Invalid URL]
message,Invalid URL


In [56]:
# в виде geodataframes
zones_gdf = gpd.GeoDataFrame(df_arcgis, geometry=gpd.points_from_xy(df_arcgis.longitude, df_arcgis.latitude))
divisions_gdf = gpd.GeoDataFrame(df_socrata, geometry=gpd.points_from_xy(df_socrata.longitude, df_socrata.latitude))

# Объедим инфо о зонах и адм районах
result_gdf = gpd.sjoin(zones_gdf, divisions_gdf, op='intersects')

print(result_gdf)


AttributeError: ignored

но у меня ничего не получилось........

# Вывод:


In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import urllib.request, json 
import pandas as pd
from tqdm import tqdm
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
import matplotlib.pyplot as plt
import sys

# Скачиваем огромный json с записями о полицейских ингцидентах

with urllib.request.urlopen("https://data.hartford.gov/api/views/889t-nwfu/rows.json?accessType=DOWNLOAD") as url:
    data = json.load(url)['data']

# Достаем из всех полей только номер дела, дату, широту и долготу. Все заворачиваем в список
datalist = []
for row in data:    
    d = {}
    lt = float(row[19][1])
    lg = float(row[19][2])
    d['lt'] = lt
    d['lg'] = lg
    if not 38 < lt < 45 or not -75 < lg < -70:
        continue
    d['Case_Number'] = row[8]
    d['Date'] = row[9]
    datalist.append(d)


# Получившийся список собираем в DataFrame pandas
df_police = pd.DataFrame(data=datalist)

# Аналогично - данные о разрешениях на строительство/продажу недвижимости
with urllib.request.urlopen(
    "https://data.hartford.gov/api/views/p2vw-4aab/rows.json?accessType=DOWNLOAD") as url:
    data = json.load(url)['data']

# Достаем из всех полей только номер дела, дату, широту и долготу. Все заворачиваем в список
datalist = []
for row in data:
    if float(row[24][1]) > 45:
        continue
    d = {}
    lt = float(row[24][1])
    lg = float(row[24][2])
    d['lt'] = lt
    d['lg'] = lg
    if not 38 < lt < 45 or not -75 < lg < -70:
        continue
    d['Permit_Number'] = str(row[8])
    d['Date'] = str(row[9])
    datalist.append(d)

# Получившийся список собираем в DataFrame pandas
df_permits = pd.DataFrame(data=datalist)

# Нарисуем карту, отметим данные точками для наглядности
'''
geom_permits = [Point(xy) for xy in zip(df_permits['lg'], df_permits['lt'])]
geom_police = [Point(xy) for xy in zip(df_police['lg'], df_police['lt'])]
gdf_permits = gpd.GeoDataFrame(df_permits, geometry=geom_permits, crs="EPSG:4326")
gdf_police = gpd.GeoDataFrame(df_police, geometry=geom_police, crs="EPSG:4326")
counties = gpd.read_file(
    'data/map/wgs84/townct_37800_0000_2010_s100_census_1_shp_wgs84.shp')
city = counties[counties["NAME10"] == "Hartford"]
gdf_permits.plot(
    ax = city.plot(figsize=(10, 10)),
    marker = 'o',
    color = 'white',
    markersize = 1)
gdf_police.plot(
    ax = city.plot(figsize=(10, 10)),
    marker = 'o',
    color = 'red',
    markersize = 1)
'''

geom_permits = [Point(xy) for xy in zip(df_permits['lg'], df_permits['lt'])]
geom_police = [Point(xy) for xy in zip(df_police['lg'], df_police['lt'])]
gdf_permits = gpd.GeoDataFrame(df_permits, geometry=geom_permits, crs="EPSG:4326")
gdf_police = gpd.GeoDataFrame(df_police, geometry=geom_police, crs="EPSG:4326")
counties = gpd.read_file(
    'data/map/wgs84/townct_37800_0000_2010_s100_census_1_shp_wgs84.shp')
city = counties[counties["NAME10"] == "Hartford"]

fig, ax = plt.subplots(figsize=(10, 10))
city.plot(ax=ax, color='white', edgecolor='black')
gdf_police.plot(ax=ax, marker='o', color='red', markersize=1, alpha=0.7)
gdf_permits.plot(ax=ax, marker='o', color='blue', markersize=1, alpha=0.7)
plt.show()

ERROR:fiona._env:data/map/wgs84/townct_37800_0000_2010_s100_census_1_shp_wgs84.shp: No such file or directory


DriverError: ignored

In [74]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request, json 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from tqdm import tqdm
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
from geopandas import GeoSeries
import matplotlib.pyplot as plt
import sys
from sklearn.cluster import KMeans
import time
import builtins

start = time.time()
print('Start')

# Корректируем функцию print, чтобы всегда писала время исполнения
def print(*args, **kwargs):
    if type(args) == tuple:
        builtins.print(format(int(time.time() - start), "3d") + 's: ' + ''.join(map(str, args)))
    elif type(args) == int:
        builtins.print(format(int(time.time() - start), "3d") + 's: ' + str(args))
    elif type(args) == int:
        builtins.print(format(int(time.time() - start), "3d") + 's: ' + args)
    

def plot_map(df_permits, df_police, df_permits_centers, df_police_centers):    
    # Скачаем очертания региона
    counties = gpd.read_file(
        '/content/sample_data/townct_37800_0000_2010_s100_census_1_shp_wgs84.shx')
    # Подготовим точки
    geom_permits = [Point(xy) for xy in zip(df_permits['lg'], df_permits['lt'])]
    geom_permits_centers = [Point(xy) for xy in zip(df_permits_centers['lg'],
                                                        df_permits_centers['lt'])]
    geom_police = [Point(xy) for xy in zip(df_police['lg'], df_police['lt'])]
    geom_police_centers = [Point(xy) for xy in zip(df_police_centers['lg'],
                                                       df_police_centers['lt'])]
    
    
    gdf_permits = gpd.GeoDataFrame(df_permits, geometry=geom_permits, crs="EPSG:4326")
    gdf_permits_centers = gpd.GeoDataFrame(df_permits_centers,
                                               geometry=geom_permits_centers, crs="EPSG:4326")
    gdf_police = gpd.GeoDataFrame(df_police, geometry=geom_police, crs="EPSG:4326")
    gdf_police_centers = gpd.GeoDataFrame(df_police_centers,
                                              geometry=geom_police_centers, crs="EPSG:4326")
    city = counties[counties["NAME10"] == "Hartford"]
    
    fig, ax = plt.subplots(figsize=(8, 8))
    city.plot(ax=ax, color='white', edgecolor='black')
    gdf_police.plot(ax=ax, marker='.', color='red', markersize=1)
    gdf_permits.plot(ax=ax, marker='.', color='blue', markersize=1)
    gdf_police_centers.plot(ax=ax, marker='o', color='red',
                                edgecolor='black', linewidth=2, markersize=500)
    gdf_permits_centers.plot(ax=ax, marker='o', color='blue',
                                 edgecolor='black', linewidth=2, markersize=500)
    
    plt.show()
    print('Plot - done')

def find_best_clusters(df, maximum_K):    
    clusters_centers = []
    k_values = []
    
    for k in range(1, maximum_K):        
        kmeans_model = KMeans(n_clusters = k)
        kmeans_model.fit(df)        
        clusters_centers.append(kmeans_model.inertia_)
        k_values.append(k)
    print('Best K - found')
    return clusters_centers, k_values

def generate_elbow_plot(clusters_centers, k_values):    
    figure = plt.subplots(figsize = (10, 10))
    plt.plot(k_values, clusters_centers, 'o-', color = 'orange')
    plt.xlabel("Number of Clusters (K)")
    plt.ylabel("Cluster Inertia")
    plt.title("Elbow Plot of KMeans")
    print('Elbow plot - done')
    plt.show()

# Скачиваем данные о разрешениях на строительство/продажу недвижимости
with urllib.request.urlopen(
    "https://data.hartford.gov/api/views/p2vw-4aab/rows.json?accessType=DOWNLOAD") as url:
    data = json.load(url)['data']
print('Building and Trades Permits - downloaded')

# Достаем из всех полей только номер дела, дату, широту и долготу. Все заворачиваем в список
datalist = []
for row in data:
    if float(row[24][1]) > 45:
        continue
    d = {}
    lt = float(row[24][1])
    lg = float(row[24][2])
    d['lt'] = lt
    d['lg'] = lg
    if not 38 < lt < 45 or not -75 < lg < -70:
        continue
    datalist.append(d)

# Получившийся список собираем в DataFrame pandas
df_permits = pd.DataFrame(data=datalist)
print('Building and Trades Permits dataframe - done')

# Аналогично - огромный json с записями о полицейских инцидентах
with urllib.request.urlopen("https://data.hartford.gov/api/views/889t-nwfu/rows.json?accessType=DOWNLOAD") as url:
    data = json.load(url)['data']
print('Police records - downloaded')

# Достаем из всех полей только номер дела, дату, широту и долготу. Все заворачиваем в список
datalist = []
for row in data:    
    d = {}
    lt = float(row[19][1])
    lg = float(row[19][2])
    d['lt'] = lt
    d['lg'] = lg
    if not 38 < lt < 45 or not -75 < lg < -70:
        continue
    datalist.append(d)

# Получившийся список собираем в DataFrame pandas
df_police = pd.DataFrame(data=datalist)
print('Police records dataframe - done')

# Сделаем кластеризацию k-means, чтобы найти 
# ключевые центры строительства/продажи недвижимости и криминала

# Сначала посмотрим, как от кол-ва кластеров зависит ошибка,
# чтобы выбрать этот параметр
#clusters_centers, k_values = find_best_clusters(df_permits, 12)
#generate_elbow_plot(clusters_centers, k_values)

#Видно, что почти гипербола. Пусть k = 6
kmeans_model = KMeans(n_clusters = 6)
kmeans_model.fit(df_permits)

# Припишем к каждой записи номер кластера
df_permits["clusters"] = kmeans_model.labels_
# На будущее сохраним центры кластеров
df_permits_centers = pd.DataFrame(data = kmeans_model.cluster_centers_)
df_permits_centers = df_permits_centers.rename(columns={0: "lt", 1: "lg"})
print('Permits clusters - found')

# То же самое для криминала
#clusters_centers, k_values = find_best_clusters(df_police, 12)
#generate_elbow_plot(clusters_centers, k_values)

#Видно, что почти гипербола. Пусть k = 7
kmeans_model = KMeans(n_clusters = 7)
kmeans_model.fit(df_police)

# Припишем к каждой записи номер кластера
df_police["clusters"] = kmeans_model.labels_
# На будущее сохраним центры кластеров
df_police_centers = pd.DataFrame(data = kmeans_model.cluster_centers_)
df_police_centers = df_police_centers.rename(columns={0: "lt", 1: "lg"})
print('Police records clusters - found')

# Посмотрим карту
plot_map(df_permits, df_police, df_permits_centers, df_police_centers)


  0s: Start
  5s: Building and Trades Permits - downloaded
  5s: Building and Trades Permits dataframe - done
 45s: Police records - downloaded
 47s: Police records dataframe - done
 48s: Permits clusters - found
 53s: Police records clusters - found


KeyError: ignored