# Exploratory Data Analysis
Takato Matsumoto  
Chicago Crime 
## Dataset


In [0]:
#ライブラリの読み込み
import os.path
import warnings
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
from tqdm import tqdm
warnings.filterwarnings('ignore')
from datetime import datetime, date, timezone, timedelta
%matplotlib inline

#グラフの設定
plt.style.use('ggplot')
#plt.rcParams['font.family'] = 'IPAexGothic'
plt.rcParams['font.size'] = 18
plt.rcParams['figure.figsize'] = (15,10)

In [0]:
url = 'https://media.githubusercontent.com/media/TakatoMatsumoto/data_policy_studies/master/chapter_2/dataset/Chicago_Crimes_2012_to_2017.csv'
df = pd.read_csv(url, low_memory=False)

In [0]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')
df = df.set_index('Date')

In [0]:
sns.heatmap(df.isnull(), cmap = 'viridis')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Date                  1048575 non-null  object 
 1   Primary Type          1048575 non-null  object 
 2   Description           1048575 non-null  object 
 3   Location Description  1047927 non-null  object 
 4   Arrest                1048575 non-null  bool   
 5   Domestic              1048575 non-null  bool   
 6   Community Area        1048535 non-null  float64
 7   Latitude              1039870 non-null  float64
 8   Longitude             1039870 non-null  float64
dtypes: bool(2), float64(3), object(4)
memory usage: 58.0+ MB


In [4]:
df

Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,Domestic,Community Area,Latitude,Longitude
0,05/03/2016 11:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,29.0,41.864073,-87.706819
1,05/03/2016 09:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,42.0,41.782922,-87.604363
2,05/03/2016 11:31:00 PM,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,False,25.0,41.894908,-87.758372
3,05/03/2016 10:10:00 PM,BATTERY,SIMPLE,SIDEWALK,False,False,25.0,41.885687,-87.749516
4,05/03/2016 10:00:00 PM,THEFT,$500 AND UNDER,RESIDENCE,False,True,25.0,41.886297,-87.761751
...,...,...,...,...,...,...,...,...,...
1048570,06/13/2015 04:12:00 PM,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,True,29.0,41.867188,-87.716440
1048571,06/13/2015 09:30:00 PM,THEFT,POCKET-PICKING,STREET,False,False,33.0,41.863197,-87.614818
1048572,06/13/2015 09:40:00 PM,BATTERY,SIMPLE,SIDEWALK,True,False,4.0,41.968625,-87.688354
1048573,06/13/2015 08:40:00 PM,ROBBERY,ARMED: OTHER DANGEROUS WEAPON,SIDEWALK,False,False,25.0,41.900424,-87.755024


In [0]:
df.Date = pd.to_datetime(df.Date)

In [0]:
df_modify = df[df['Date'] < datetime(2014,1,1)]
df_modify.to_csv('Chicago_Crimes.csv')

In [0]:
df_modify = df_modify.dropna()

In [0]:
df_modify

In [0]:
df_modify.groupby('Community Area').mean()

In [0]:
df_modify['Community Area'].unique()
sns.barplot(y=df['Community Area'].value_counts().index, x=df['Community Area'].value_counts())

In [0]:
sns.barplot(y=df['Primary Type'].value_counts().index, x=df['Primary Type'].value_counts())

In [0]:
df.Date.max()

In [18]:
df.Latitude.isnull().sum()

8705

In [8]:
#folium libraryのインストール（地図描画をするライブラリです）
!pip install folium



In [0]:
#地図の読み込み
import folium
m = folium.Map(location=[df['Latitude'].median(), df['Longitude'].median()], zoom_start=12)
#, tiles = 'CartoDB dark_matter'

In [0]:
#対数変換後の物件価格を四つの価格帯に分けて色分けします
#'skyblue', 'plum', 'tomato', 'gold'の色の順番で物件価格が高くなっていきます
df['marker_color'] = pd.cut(df['log_price'], bins=4, labels=['skyblue', 'plum', 'tomato', 'gold'])

In [49]:
#地図上に物件のマーカーを描画
from tqdm import tqdm
for index, row in tqdm(df_modify.iterrows()):
  folium.CircleMarker([row['Latitude'], row['Longitude']], radius=1).add_to(m)

637583it [03:45, 2827.01it/s]


In [0]:
from folium.plugins import HeatMap
location = []
for index, row in tqdm(df_modify.iterrows()):
  location.append([row['Latitude'], row['Longitude']])
HeatMap(location).add_to(m)

633588it [02:45, 3967.25it/s]

In [13]:
m

Output hidden; open in https://colab.research.google.com to view.

In [0]:
%%time 
#for showing the time taken to draw the location on the map


#plotting circles on map a larger and red circle represent large number of crimes while a smaller blue circle represent the less number of crimes
for i in range(500):
    
    #extracting latitude
    lat = float(new_unique_location['Location Coord'].iloc[i][0])
    
    #extracting longitude
    long = float(new_unique_location['Location Coord'].iloc[i][1])
    
    #making radius dynamic on the basis of number of crimes
    radius = float(new_unique_location['Values Count'].iloc[i]) / 40
    
    # if number of crimes are more than 1000 then circle will be red in color else blue in color 
    if float(new_unique_location["Values Count"].iloc[i]) > 1000:
        color = '#FF4500'
    else:
        color = '#008080'
        
    #creating the popup text which will show Latitude, Longitude and Number of crimes happened on place when clicked on Map
    popup_text = """Latitude : {}<br><br>
                    Longitude : {}<br><br>
                    No of Crime Incident : {}<br><br>"""
    
    popup_text = popup_text.format(lat, long, new_unique_location['Values Count'][i])
    
    
    #drawing circles on map with different color and dynamic radius
    folium.CircleMarker(location = [lat, long], radius = radius, color = color, popup = popup_text, fill = True).add_to(chicago_crime_map)

In [0]:
m

*   どんな犯罪が多いか
*   どんな場所で起こるか
*   よく起こる時間帯は
*   季節に関係あるか



