# Вступление:
# **Преступность и недвижимость**

Смысл проекта заключается в анализе уровня преступности и рынка недвижимости в районах Хартфорда. Рассмотрим корреляцию между уровнем преступности и количеством продаваемой и строящейся недвижимости (а также ценами на неё), визуализируем данные по районам на карте Хартфорда и проанализируем «настроения» в текстовых источниках об этих районах на тему преступности и цен на жильё.

С помощью веб-скрейпинга посмотрим, как описываются районы Хартфорда в текстовых источниках, и оценим «настроение» этих текстов:

In [3]:
import requests
from bs4 import BeautifulSoup


# NeighborhoodScout profile pages to scrape, one per Hartford neighborhood.
# Every URL shares the same prefix and the '#crime' fragment; only the
# neighborhood slug differs.
neighborhood_urls = [
    f'https://www.neighborhoodscout.com/ct/hartford/{slug}#crime'
    for slug in (
        'university-hartford',
        'blue-hills-south',
        'behind-rocks-southwest',
        'blue-hills',
        'cabot-st',
        'parkville-south',
        'keney-park',
        'barry-square-west',
        'southwest',
        'asylum-hill-south',
    )
]

# Maps each page's <title> text to the concatenated text of all its <p> elements.
data = {}

for url in neighborhood_urls:
    # A timeout keeps one stalled request from hanging the whole cell.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of scoring an error page as if it were
    # a neighborhood description.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('title')
    # Fall back to the URL so a page without <title> still gets a unique key
    # instead of raising AttributeError on None.
    title = title_tag.text if title_tag is not None else url

    # Paragraph texts are joined with no separator, exactly as the previous
    # "+=" loop did, so downstream sentiment scores are unchanged.
    data[title] = ''.join(
        paragraph.get_text() for paragraph in soup.find_all('p')
    )

In [8]:
# Now score the scraped neighborhood descriptions with NLP (TextBlob sentiment).
from textblob import TextBlob

# Maps page title -> sentiment polarity of that page's paragraph text.
polarhood = {
    page_title: TextBlob(page_text).sentiment.polarity
    for page_title, page_text in data.items()
}

In [9]:
polarhood

{'University of Hartford Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.1690532964303456,
 'Blue Hills South Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.1641109668109668,
 'Behind the Rocks Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout': 0.11377195806947875,
 'Blue Hills Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.12190655145379554,
 'Cabot St & Albany Ave Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout': 0.1545229962715512,
 'Parkville South Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout': 0.16303183813822109,
 'Keney Park Hartford, CT 06120, Neighborhood Profile - NeighborhoodScout': 0.17257362467216478,
 'Barry Square West Hartford, CT 06114, Neighborhood Profile - NeighborhoodScout': 0.1530247828234405,
 'Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout': 0.15034483965454112,
 'Asylum Hill South Hartford, CT 06105, Neighborhood Profile - Neighborhood

In [16]:
# Re-key the scores in descending polarity order; dicts keep insertion order,
# so iterating sorted_polarhood walks from the highest to the lowest score.
sorted_polarhood = {
    name: polarhood[name]
    for name in sorted(polarhood, key=polarhood.get, reverse=True)
}

for name, score in sorted_polarhood.items():
    print(f"{name}: {score}")

Asylum Hill South Hartford, CT 06105, Neighborhood Profile - NeighborhoodScout: 0.18547739969527677
Keney Park Hartford, CT 06120, Neighborhood Profile - NeighborhoodScout: 0.17257362467216478
University of Hartford Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.1690532964303456
Blue Hills South Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.1641109668109668
Parkville South Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout: 0.16303183813822109
Cabot St & Albany Ave Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.1545229962715512
Barry Square West Hartford, CT 06114, Neighborhood Profile - NeighborhoodScout: 0.1530247828234405
Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout: 0.15034483965454112
Blue Hills Hartford, CT 06112, Neighborhood Profile - NeighborhoodScout: 0.12190655145379554
Behind the Rocks Southwest Hartford, CT 06106, Neighborhood Profile - NeighborhoodScout: 0.11377195806947875


Мы получаем отсортированный список районов: чем выше «положительность», тем позитивнее описан район. Тем не менее эта оценка отражает общий тон текста и не привязана к какому-либо конкретному критерию, поэтому на её основе нельзя сделать вывод, например, о безопасности района.

Поэтому я нашла большую базу данных с записями обо всех зарегистрированных полицией происшествиях (включая преступления) с 2005 по 2021 год. Составим рейтинг безопасности районов.

In [None]:
import pandas as pd

In [None]:
# Load the police incident export into a DataFrame. The file name indicates
# coverage from 2005-01-01 to 2021-05-18; the path is relative, so the CSV
# must sit in the notebook's working directory.
dataa = pd.read_csv('Police_Incidents_01012005_to_05182021.csv')

In [None]:
dataa

Unnamed: 0,Case_Number,Date,Time_24HR,Address,UCR_1_Category,UCR_1_Description,UCR_1_Code,UCR_2_Category,UCR_2_Description,UCR_2_Code,Neighborhood,geom
0,21013791,05/10/2021,1641,403 GARDEN ST,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3221,,,0,CLAY-ARSENAL,"(41.780238042803745, -72.68497435174203)"
1,21014071,05/13/2021,245,59 ELLINGTON ST,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3261,24* - MOTOR VEHICLE LAWS,EVADING RESP,2401,BEHIND THE ROCKS,"(41.74625648731947, -72.70484012171347)"
2,20036741,11/29/2020,1703,267 ZION ST,31* - PERSONAL INJURY ACCIDENT,PERS INJ ACC,3124,23* - DRIVING LAWS,FOLL TOO CLOSE,2334,BEHIND THE ROCKS,"(41.74850755091766, -72.69411393999614)"
3,21013679,05/09/2021,2245,HOMESTEAD AV & WOODLAND ST,31* - PERSONAL INJURY ACCIDENT,PERS INJ ACC,3124,23* - DRIVING LAWS,TRAVELING TOO FAST,2327,UPPER ALBANY,"(41.778689832211015, -72.69776621329845)"
4,21014070,05/13/2021,240,BENTON ST & WEBSTER ST,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3221,,,0,BARRY SQUARE,"(41.74653366174123, -72.68316706252509)"
...,...,...,...,...,...,...,...,...,...,...,...,...
708980,9000978,01/09/2009,955,219 ZION ST,34* - OTHER ACCIDENT,HAZARDOUS CONDITION,3490,,,0,BEHIND THE ROCKS,"(41.747163627033345, -72.69420967955826)"
708981,13014833,05/07/2013,1139,150 WARD ST,24* - MOTOR VEHICLE LAWS,OP UNREG M/V,2414,24* - MOTOR VEHICLE LAWS,TOWED VEHICLE,2430,FROG HOLLOW,"(41.75622433710221, -72.68959981062677)"
708982,6011544,03/18/2006,1155,334 FRANKLIN AV,32* - PROPERTY DAMAGE ACCIDENT,PROP DAM ACC,3224,23* - DRIVING LAWS,IMPRP LANE CHANGE,2344,SOUTHEND,"(41.74104529114852, -72.67573171674711)"
708983,12033004,09/19/2012,1206,ASHLEY ST & SIGOURNEY ST,39* - ANIMAL COMPLAINT,ANIMAL BITE,3904,,,0,ASYLUM HILL,"(41.774504796809694, -72.69245815646983)"


In [None]:
# Count rows per (Neighborhood, UCR_1_Category) and spread the categories into
# columns, giving one row per neighborhood.
pivot_table = (
    dataa.groupby(['Neighborhood', 'UCR_1_Category'])
    .size()
    .unstack()
    # Turns 'Neighborhood' back into a regular column and leaves a default
    # 0..n-1 row index, so no further index reset is needed afterwards.
    .reset_index()
    # A missing (neighborhood, category) combination means no incidents of
    # that category were recorded there, i.e. a count of 0.
    .fillna(0)
)

# Clear the label that unstack() leaves on the column axis.
pivot_table.columns.name = None

In [None]:
# Pairwise correlations between the per-neighborhood counts of each category.
# Only numeric columns are correlated: the text 'Neighborhood' column makes a
# plain DataFrame.corr() emit a numeric_only FutureWarning on pandas 1.5 and
# fail on pandas 2.x, where non-numeric columns are no longer dropped silently.
corr_matrix = pivot_table.select_dtypes(include='number').corr().stack()

# Keep each unordered pair exactly once: the strict "<" between the two index
# levels removes both the self-correlations (A, A) and the mirrored (B, A)
# duplicates, so a separate "!=" filter is unnecessary.
first_category = corr_matrix.index.get_level_values(0)
second_category = corr_matrix.index.get_level_values(1)
corr_matrix = corr_matrix[first_category < second_category]

# Sort by absolute correlation, strongest first
corr_matrix = corr_matrix.abs().sort_values(ascending=False)

# Print the 20 most correlated category pairs
print('Самые коррелируемые категории преступлений:')
print(corr_matrix.head(20))

# Print the 20 least correlated category pairs
print('\nСамые не коррелируемые категории преступлений:')
print(corr_matrix.tail(20))


Самые коррелируемые категории преступлений:
35* - MISC. CRIMES AGAINST PROPERTY  42* - CARE FOR SICK                    0.976218
19* - CRIMES AGAINST THE PUBLIC      42* - CARE FOR SICK                    0.970195
15* - FAMILY OFFENSES                20* - RADIO SIGNAL                     0.969238
17* - LIQUOR LAWS                    44* - MISC. WANTS                      0.964995
04* - AGGRAVATED ASSAULT             08* - SIMPLE ASSAULT                   0.960264
19* - CRIMES AGAINST THE PUBLIC      35* - MISC. CRIMES AGAINST PROPERTY    0.959203
05* - BURGLARY                       07* - MOTOR VEHICLE THEFT              0.957895
03* - ROBBERY                        08* - SIMPLE ASSAULT                   0.954883
11* - STOLEN PROPERTY                24* - MOTOR VEHICLE LAWS               0.953667
03* - ROBBERY                        20* - RADIO SIGNAL                     0.952921
08* - SIMPLE ASSAULT                 29* - FOUND PERSON/PROPERTY            0.951064
04* - AGGRAVATED ASSA

  corr_matrix = pivot_table.corr().stack()


In [None]:
# Correlate the numeric category-count columns only. The text 'Neighborhood'
# column makes a plain DataFrame.corr() emit a numeric_only FutureWarning on
# pandas 1.5 and fail on pandas 2.x.
correlations = pivot_table.select_dtypes(include='number').corr()

# Per-category sum of its correlations with every category. Each sum also
# includes the category's correlation with itself on the diagonal.
sum_of_correlations = correlations.sum()

# Sort the categories by their correlation sum, largest first
sorted_sum_of_correlations = sum_of_correlations.sort_values(ascending=False)

print(sorted_sum_of_correlations)


08* - SIMPLE ASSAULT                   35.477176
42* - CARE FOR SICK                    35.096257
35* - MISC. CRIMES AGAINST PROPERTY    34.706404
03* - ROBBERY                          34.505840
19* - CRIMES AGAINST THE PUBLIC        34.177491
04* - AGGRAVATED ASSAULT               33.552232
20* - RADIO SIGNAL                     33.407618
51* - MISC. MANAGEMENT INFO.           33.312521
29* - FOUND PERSON/PROPERTY            33.287536
28* - MISSING PERSON/PROPERTY          32.555622
15* - FAMILY OFFENSES                  32.262806
11* - STOLEN PROPERTY                  31.966589
31* - PERSONAL INJURY ACCIDENT         31.814613
24* - MOTOR VEHICLE LAWS               31.512476
37* - FIRE-RELATED                     31.500444
12* - WEAPONS OFFENSES                 30.525181
34* - OTHER ACCIDENT                   30.272905
43* - MENTAL CASE                      30.029932
53* - LANDLORD-TENANT                  29.589828
25* - CITY ORDINANCES                  29.355498
07* - MOTOR VEHICLE 

  correlations = pivot_table.corr()


In [None]:
# Order the neighborhood rows by their '08* - SIMPLE ASSAULT' count, highest first.
sorted_table = pivot_table.sort_values('08* - SIMPLE ASSAULT', ascending=False)

In [None]:
sorted_table

Unnamed: 0,Neighborhood,01* - HOMICIDE,03* - ROBBERY,04* - AGGRAVATED ASSAULT,05* - BURGLARY,06* - LARCENY,07* - MOTOR VEHICLE THEFT,08* - SIMPLE ASSAULT,09* - FORGERY/COUNTERFEITING,10* - FRAUD/EMBEZZLEMENT/EXTORTION,...,44* - MISC. WANTS,46* - JUVENILE-RELATED,47* - CONNECTING CASE,49* - SHOOTING,51* - MISC. MANAGEMENT INFO.,52* - SHOTS FIRED,5210 - SHOTS FIRED CONFIRMED,5211 - SHOTS FIRED UNCONFIRMED,53* - LANDLORD-TENANT,55* - REPORT-RELATED
8,NORTHEAST,80.0,935.0,2130.0,1782.0,4439.0,1748.0,4405.0,101.0,108.0,...,818.0,118.0,0.0,5.0,3541.0,2868.0,14.0,47.0,158.0,835.0
1,BARRY SQUARE,45.0,995.0,1381.0,1819.0,5833.0,2009.0,3925.0,80.0,108.0,...,602.0,633.0,0.0,2.0,2611.0,1497.0,7.0,6.0,151.0,768.0
0,ASYLUM HILL,32.0,912.0,1292.0,1267.0,6051.0,1733.0,3755.0,125.0,164.0,...,601.0,335.0,0.0,4.0,4218.0,856.0,3.0,14.0,103.0,1246.0
6,FROG HOLLOW,29.0,923.0,1267.0,1267.0,5161.0,1204.0,3233.0,114.0,114.0,...,939.0,139.0,0.0,8.0,3087.0,1435.0,6.0,14.0,107.0,861.0
4,CLAY-ARSENAL,48.0,629.0,1350.0,861.0,2967.0,906.0,3167.0,101.0,84.0,...,703.0,903.0,0.0,2.0,2995.0,1254.0,4.0,10.0,79.0,675.0
15,UPPER ALBANY,52.0,619.0,1346.0,811.0,2515.0,870.0,2820.0,117.0,114.0,...,450.0,424.0,0.0,3.0,1922.0,1267.0,14.0,15.0,120.0,571.0
5,DOWNTOWN,12.0,550.0,715.0,474.0,7093.0,763.0,2515.0,148.0,202.0,...,3625.0,196.0,0.0,1.0,2004.0,212.0,1.0,4.0,17.0,1577.0
13,SOUTHEND,26.0,752.0,847.0,1776.0,4919.0,1745.0,2434.0,154.0,204.0,...,370.0,301.0,0.0,5.0,1764.0,711.0,2.0,0.0,98.0,572.0
3,BLUE HILLS,24.0,346.0,578.0,846.0,2269.0,784.0,1964.0,77.0,69.0,...,234.0,393.0,1.0,1.0,1433.0,941.0,2.0,9.0,52.0,372.0
2,BEHIND THE ROCKS,17.0,346.0,681.0,1013.0,6087.0,1236.0,1956.0,57.0,72.0,...,450.0,53.0,0.0,1.0,1492.0,817.0,2.0,4.0,63.0,505.0


In [None]:
import altair as alt
from google.colab import autoviz
# Fetch the DataFrame that autoviz has registered under this generated id.
# NOTE(review): presumably only resolvable inside the Colab session that
# registered it — confirm before re-running this cell elsewhere.
df_2461634522618689939 = autoviz.get_registered_df('df_2461634522618689939')

def value_plot(df, y, sort_ascending=False, width=1000, height=200):
  """Return an Altair line chart of column ``y`` against row position.

  Args:
    df: DataFrame holding the column to plot.
    y: Name of the column drawn on the vertical axis; also used as the
      chart title.
    sort_ascending: When true, rows are sorted by ``y`` in ascending order
      and renumbered before plotting; otherwise the existing row order is
      kept.
    width: Chart width forwarded to ``properties``.
    height: Chart height forwarded to ``properties``.

  Returns:
    The configured ``alt.Chart`` object.
  """
  if sort_ascending:
    df = df.sort_values(y).reset_index(drop=True)
  # reset_index() below turns the row index into a column; for an unnamed
  # index that column is called 'index', which is what the x channel reads.
  x_channel = alt.X('index', title='')
  # Use the y-channel class for the vertical encoding.
  y_channel = alt.Y(y, title='value')
  return (alt.Chart(df.reset_index()).mark_line()
          .encode(x=x_channel, y=y_channel)
          .properties(width=width, height=height, title=y))

# Plot the '03* - ROBBERY' column with the default sort order and chart size.
chart = value_plot(df_2461634522618689939, '03* - ROBBERY')
chart

In [None]:
# Number of incident rows recorded per neighborhood, across all categories.
crime_count = dataa.groupby('Neighborhood').size()

In [None]:
crime_count

Neighborhood
ASYLUM HILL            71815
BARRY SQUARE           64661
BEHIND THE ROCKS       40544
BLUE HILLS             31921
CLAY-ARSENAL           51404
DOWNTOWN               61623
FROG HOLLOW            66112
NORTH MEADOWS          21375
NORTHEAST              71667
PARKVILLE              29778
SHELDON-CHARTER OAK    19518
SOUTH GREEN            33455
SOUTH MEADOWS           9645
SOUTHEND               45010
SOUTHWEST              14863
UPPER ALBANY           42899
WESTEND                32695
dtype: int64

# Вывод:
