# Part 1: Getting the dataset

I've downloaded the CSV-file from SF OpenData homepage. And below I'd import it to python.

In [1]:
import pandas as pd

with open("SFPD_Incidents_-_from_1_January_2003.csv", 'rb') as f:
    df = pd.read_csv("SFPD_Incidents_-_from_1_January_2003.csv")

In [2]:
df.head(100)

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160
5,150098232,NON-CRIMINAL,AIDED CASE -PROPERTY FOR DESTRUCTION,Sunday,02/01/2015,16:21,RICHMOND,NONE,400 Block of LOCUST ST,-122.451782,37.787085,"(37.7870853907529, -122.451781767894)",15009823251041
6,150098248,SECONDARY CODES,DOMESTIC VIOLENCE,Saturday,01/31/2015,21:00,BAYVIEW,NONE,700 Block of KIRKWOOD AV,-122.374019,37.729203,"(37.729203356539, -122.374019331833)",15009824815200
7,150098248,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM",Saturday,01/31/2015,21:00,BAYVIEW,NONE,700 Block of KIRKWOOD AV,-122.374019,37.729203,"(37.729203356539, -122.374019331833)",15009824828150
8,150098254,BURGLARY,"BURGLARY OF STORE, UNLAWFUL ENTRY",Saturday,01/31/2015,16:09,CENTRAL,NONE,200 Block of STOCKTON ST,-122.406568,37.787809,"(37.7878092959561, -122.40656817787)",15009825405053
9,150098260,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,Saturday,01/31/2015,17:00,CENTRAL,NONE,800 Block of GEARY ST,-122.417295,37.786258,"(37.7862578545865, -122.417295322526)",15009826006362


# Part 2: Working with data

**Exercise: The types of crime and their popularity over time. The first field we'll dig into is the column "Category".**

In [3]:
#Finding the number of different crime types
SetCrimeType = set(df['Category'])
NumCrimeType = len(SetCrimeType)
print "There are %d different crime categories." % NumCrimeType



There are 39 different crime categories.


In [4]:
#Counting each crime incident per crime type
CrimeDict = {}
for Type in SetCrimeType:
    counter = 0
    for incident in df['Category']:
        if incident == Type:
            counter += 1
    CrimeDict[Type] = counter
    
#Creating a pandas dataframe from the dictionary
df2 = pd.DataFrame.from_dict(CrimeDict, orient = 'index')

#Finding the most and least frequent crime
print "The most frequent crime is:" 
print df2.nlargest(1, 0)

print "The least frequent crime is:" 
print df2.nsmallest(1, 0)

The most frequent crime is:
                    0
LARCENY/THEFT  422170
The least frequent crime is:
       0
TREA  13


In [22]:
import matplotlib.pyplot as plt
#Sorting the dataframe by value
df2 = pd.DataFrame.sort_values(df2, 0)

bar_width = 0.5
index = range(len(df2[0]))
plt.bar(index, df2[0], bar_width)


plt.xticks([x+(bar_width/2) for x in index], df2.index, rotation='vertical')
plt.xlabel('x')
plt.ylabel('Count')
plt.title('Number of incidents per crime')

plt.show()


In [6]:
#Create a dictionary with year (key) and number of crimes (values)
YearList = []
for i in df['Date']:
    YearList.append(int( i.split("/")[-1]))

YearDict = {}
                        
for i in set(YearList):
    counter = 0
    for x in YearList:
        if i == x:
            counter += 1
    YearDict[i] = counter

In [7]:
#Creating a dataframe for the counted crimes
df3 = pd.DataFrame.from_dict(YearDict, orient = 'index')

df3 = pd.DataFrame.sort_index(df3)
df3 = df3.drop(2017)
df3.tail()

Unnamed: 0,0
2012,140854
2013,152806
2014,150131
2015,156543
2016,149502


In [8]:
#Calculating the average number of crimes per year
print "Average crimes per year:"
print sum(df3[0])/len(df3[0])

Average crimes per year:
143730


In [9]:
#Creating bar charts for focus crimes:
focuscrimes = set(['WEAPON LAWS', 'PROSTITUTION', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'BURGLARY', 'ASSAULT', 'DRUNKENNESS', 'DRUG/NARCOTIC', 'TRESPASS', 'LARCENY/THEFT', 'VANDALISM', 'VEHICLE THEFT', 'STOLEN PROPERTY', 'DISORDERLY CONDUCT'])
                  
print focuscrimes

set(['VEHICLE THEFT', 'DISORDERLY CONDUCT', 'WEAPON LAWS', 'VANDALISM', 'PROSTITUTION', 'DRUG/NARCOTIC', 'TRESPASS', 'ASSAULT', 'DRIVING UNDER THE INFLUENCE', 'DRUNKENNESS', 'ROBBERY', 'BURGLARY', 'STOLEN PROPERTY', 'LARCENY/THEFT'])


In [10]:
df3.columns = ['Total number of crimes']

In [11]:
#Adding the years to the original dataframe
df['Year'] = YearList
df.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId,Year
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000,2015
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074,2015
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014,2015
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200,2015
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160,2015


In [12]:
CrimeYearDict = {}
for i in focuscrimes:
    CrimeYearDict[i]= {}
    for year in range(2003, 2017, 1):
        CrimeYearDict[i][year] = 0
    
for index in range(len(df)):
    c = df.get_value(index, 'Category')
    y = df.get_value(index, 'Year')
    if c in focuscrimes and y < 2017:
        CrimeYearDict[c][y] += 1
    
print CrimeYearDict


{'WEAPON LAWS': {2016: 1651, 2003: 1281, 2004: 1252, 2005: 1341, 2006: 1324, 2007: 1318, 2008: 1419, 2009: 1433, 2010: 1349, 2011: 1329, 2012: 1460, 2013: 1535, 2014: 1580, 2015: 1655}, 'ROBBERY': {2016: 3293, 2003: 3204, 2004: 3380, 2005: 3566, 2006: 4131, 2007: 4027, 2008: 4229, 2009: 3578, 2010: 3324, 2011: 3376, 2012: 3955, 2013: 4196, 2014: 3420, 2015: 3759}, 'PROSTITUTION': {2016: 640, 2003: 1952, 2004: 1527, 2005: 1103, 2006: 1290, 2007: 1873, 2008: 1673, 2009: 1468, 2010: 1299, 2011: 1094, 2012: 690, 2013: 692, 2014: 449, 2015: 374}, 'DRUG/NARCOTIC': {2016: 4228, 2003: 9917, 2004: 9897, 2005: 8533, 2006: 9069, 2007: 10560, 2008: 11648, 2009: 11950, 2010: 9205, 2011: 6935, 2012: 6444, 2013: 6775, 2014: 5408, 2015: 4254}, 'TRESPASS': {2016: 1807, 2003: 1434, 2004: 1191, 2005: 1034, 2006: 1102, 2007: 1198, 2008: 1151, 2009: 1232, 2010: 1150, 2011: 1072, 2012: 1288, 2013: 1129, 2014: 1125, 2015: 1403}, 'ASSAULT': {2016: 13522, 2003: 13461, 2004: 12899, 2005: 11601, 2006: 12449, 200

In [13]:
#Adding the CrimeYearDict to the df3
for crime in focuscrimes:
    temp = []
    for year in range(2003, 2017, 1):
        temp.append(CrimeYearDict[crime][year])
    
    df3[crime] = temp
    
    
df3.head()

Unnamed: 0,Total number of crimes,VEHICLE THEFT,DISORDERLY CONDUCT,WEAPON LAWS,VANDALISM,PROSTITUTION,DRUG/NARCOTIC,TRESPASS,ASSAULT,DRIVING UNDER THE INFLUENCE,DRUNKENNESS,ROBBERY,BURGLARY,STOLEN PROPERTY,LARCENY/THEFT
2003,149176,15325,886,1281,6448,1952,9917,1434,13461,289,662,3204,6047,800,26393
2004,148148,17884,814,1252,6496,1527,9897,1191,12899,244,600,3380,6753,641,24505
2005,142186,18194,687,1341,7013,1103,8533,1034,11601,196,636,3566,7071,540,25319
2006,137853,7291,521,1324,7688,1290,9069,1102,12449,266,703,4131,7004,575,27352
2007,137639,6460,581,1318,7566,1873,10560,1198,12518,313,671,4027,5454,527,25770


In [14]:
#Transposing the df for easier plotting
df4 = df3.transpose()

#plotting the evolution of the focuscrimes
counter = 0
plt.figure(1)
for crime in focuscrimes:
    counter += 1
    plt.subplot(2, 7, counter)
    df4.ix[crime].plot(kind='bar'); plt.axhline(0, color='k')
plt.show()


**Now I want to look at focuscrimes compared to the time of day**

In [15]:
#Creating a dictionary for the crime time
CrimeTimeDict = {}
for crime in focuscrimes:
    CrimeTimeDict[crime]= {}
    for time in range(0, 24):
        CrimeTimeDict[crime][time] = 0

for index in range(len(df)):
    crime = df.get_value(index, 'Category')
    time = df.get_value(index, 'Time')
    hr = int( time.split(":")[0] )
    if crime in focuscrimes:
        CrimeTimeDict[crime][hr] += 1
    
print CrimeTimeDict

{'WEAPON LAWS': {0: 1032, 1: 749, 2: 669, 3: 393, 4: 290, 5: 157, 6: 230, 7: 399, 8: 548, 9: 718, 10: 731, 11: 862, 12: 991, 13: 982, 14: 994, 15: 1127, 16: 1197, 17: 1158, 18: 1257, 19: 1157, 20: 1084, 21: 1084, 22: 1159, 23: 1097}, 'ROBBERY': {0: 2803, 1: 2840, 2: 2998, 3: 1809, 4: 1173, 5: 999, 6: 961, 7: 870, 8: 990, 9: 1168, 10: 1326, 11: 1502, 12: 1781, 13: 1924, 14: 2033, 15: 2306, 16: 2546, 17: 2492, 18: 2643, 19: 2832, 20: 3158, 21: 3607, 22: 3570, 23: 3341}, 'PROSTITUTION': {0: 1314, 1: 1020, 2: 750, 3: 517, 4: 454, 5: 571, 6: 514, 7: 265, 8: 148, 9: 158, 10: 189, 11: 208, 12: 443, 13: 325, 14: 286, 15: 209, 16: 270, 17: 575, 18: 1155, 19: 1338, 20: 1067, 21: 1262, 22: 1557, 23: 1568}, 'DRUG/NARCOTIC': {0: 3626, 1: 2195, 2: 1777, 3: 1197, 4: 880, 5: 445, 6: 1286, 7: 2909, 8: 3878, 9: 4555, 10: 5116, 11: 5674, 12: 5891, 13: 7651, 14: 8701, 15: 7986, 16: 8403, 17: 8628, 18: 7696, 19: 6550, 20: 5016, 21: 4722, 22: 5579, 23: 4770}, 'TRESPASS': {0: 517, 1: 372, 2: 404, 3: 331, 4: 

In [16]:
df5 = pd.DataFrame.from_dict(CrimeTimeDict)
df5.head()

Unnamed: 0,ASSAULT,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,LARCENY/THEFT,PROSTITUTION,ROBBERY,STOLEN PROPERTY,TRESPASS,VANDALISM,VEHICLE THEFT,WEAPON LAWS
0,9996,3148,504,684,3626,804,16917,1314,2803,515,517,5994,4503,1032
1,8587,2037,374,623,2195,761,10413,1020,2840,370,372,4214,3109,749
2,7808,2156,330,617,1777,635,6868,750,2998,307,404,3980,2462,669
3,3939,2160,214,271,1197,250,4230,517,1809,236,331,2604,1511,393
4,2466,1930,151,103,880,128,2750,454,1173,210,235,1723,1164,290


In [17]:
#Transposing the df for easier plotting
df6 = df5.transpose()

#plotting the evolution of the focuscrimes
counter = 0
plt.figure(1)
for crime in focuscrimes:
    counter += 1
    plt.subplot(7, 2, counter)
    df6.ix[crime].plot(kind='bar'); plt.axhline(0, color='k')
    plt.title(crime)
    
plt.subplots_adjust(hspace=.5)
plt.show()

**I forgot to create a histogram for all the crimes pr hour, so it comes here:**

In [18]:
TotalCrimeTimeDict = {}
for hr in range(0, 24):
    TotalCrimeTimeDict[hr] = 0

for index in range(len(df)):
    time = df.get_value(index, 'Time')
    hr = int( time.split(":")[0] )
    TotalCrimeTimeDict[hr] += 1
    
print TotalCrimeTimeDict

{0: 103719, 1: 60049, 2: 50352, 3: 32404, 4: 22894, 5: 20339, 6: 30587, 7: 51007, 8: 75672, 9: 81771, 10: 86931, 11: 88618, 12: 120631, 13: 99044, 14: 102423, 15: 109948, 16: 114815, 17: 123971, 18: 128199, 19: 114542, 20: 104345, 21: 100098, 22: 104650, 23: 96856}


In [19]:
plt.figure(1)
plt.bar(range(len(TotalCrimeTimeDict)), TotalCrimeTimeDict.values(), align='center')
plt.xticks(range(len(TotalCrimeTimeDict)), TotalCrimeTimeDict.keys())
plt.title('Total # of crimes per hour')
plt.show()

**Exercises: The types of crime and how they take place across San Francisco's police districts.**

* So now we'll be combining information about PdDistrict and Category to explore differences between SF's >neighborhoods. First, simply list the names of SF's 10 police districts.

In [20]:
#finding the unique district names
districts_temp = df['PdDistrict'].unique()
print districts_temp

#Removing the nan element and converting to list instead of numpy array
districts = []
for i in range(len(districts_temp)-1):
    districts.append(districts_temp[i])
    
print districts
    
    

['MISSION' 'TENDERLOIN' 'NORTHERN' 'RICHMOND' 'BAYVIEW' 'CENTRAL' 'PARK'
 'TARAVAL' 'SOUTHERN' 'INGLESIDE' nan]
['MISSION', 'TENDERLOIN', 'NORTHERN', 'RICHMOND', 'BAYVIEW', 'CENTRAL', 'PARK', 'TARAVAL', 'SOUTHERN', 'INGLESIDE']


* Which has the most crimes? Which has the most focus crimes?

In [21]:
TotalCrimeDistrictDict = {}
for district in districts:
    TotalCrimeDistrictDict[district] = 0
    
for district in df['PdDistrict']:
    if district in districts:
        TotalCrimeDistrictDict[district] += 1
    

MaxCrimeDistrict = max(TotalCrimeDistrictDict, key=TotalCrimeDistrictDict.get)
print 'The district with most crimes is: %s' % MaxCrimeDistrict
print 'With %d as the total number of crimes since 2003. \n' % TotalCrimeDistrictDict[MaxCrimeDistrict]

TotalFocusCrimeDistrictDict = {}
for district in districts:
    TotalFocusCrimeDistrictDict[district] = 0
    
for index in range(len(df)):
    if df.get_value(index, 'Category') in focuscrimes:
        if df.get_value(index, 'PdDistrict') in districts:
            TotalFocusCrimeDistrictDict[df.get_value(index, 'PdDistrict')] += 1

MaxFocusCrimeDistrict = max(TotalFocusCrimeDistrictDict, key=TotalFocusCrimeDistrictDict.get)
print 'The district with most focus crimes is: %s' % MaxFocusCrimeDistrict
print 'With %d as the total number of focus crimes since 2003.' % TotalFocusCrimeDistrictDict[MaxFocusCrimeDistrict]

The district with most crimes is: SOUTHERN
With 364516 as the total number of crimes since 2003. 

The district with most focus crimes is: SOUTHERN
With 208888 as the total number of focus crimes since 2003.


**Exercises: The types of crime and how they take place across San Francisco's police districts.**

First, we need to calculate the relative probabilities of seeing each type of crime in the dataset as a whole. > That's simply a normalized version of this plot. Let's call it P(crime).

In [43]:
CrimeTotal = float(sum(df2[0]))
CrimeNormalized = []

for i in df2[0]:
    CrimeNormalized.append(i/CrimeTotal)
    
df2['Normalized'] = CrimeNormalized

bar_width = 0.5
index = range(len(df2[0]))
plt.bar(index, df2['Normalized'], bar_width)


plt.xticks([x+(bar_width/2) for x in index], df2.index, rotation='vertical')
plt.xlabel('x')
plt.ylabel('Ratio')
plt.title('Ratio of incidents per crime')

plt.show()

Next, we calculate that same probability distribution but for each PD district, let's call that P(crime|district).

In [45]:
#Creating an empty dictionary
DistrictFocusCountDict = {}
for district in districts:
    DistrictFocusCountDict[district]={}
    for crime in focuscrimes:
        DistrictFocusCountDict[district][crime] = 0
        
#counting each instance of the crime
for index in df[]

NameError: name 'CrimeDistrictDict' is not defined

# Part 4: A little bit of plotting


In [None]:
#Setting bounding box
max_lat = 37.833972
min_lat = 37.713972
max_lon = -122.461297
min_lon = -122.401297

bbox = BoundingBox(north=max_lat, west=min_lon, south=min_lat, east=max_lon)
geoplotlib.set_bbox(bbox)





In [53]:
lon = []
lat = []
for index in range(len(df)):
    lon.append(int(df.get_value(index, 'X')))
    lat.append(int(df.get_value(index, 'Y')))


    


In [54]:
import geoplotlib

geoplotlib.kde(lon, lat)
geoplotlib.inline(width=900)
geoplotlib.show()

Traceback (most recent call last):
  File "C:\Users\Simon\Anaconda2\lib\site-packages\geoplotlib\__init__.py", line 32, in _runapp
    app.start()
  File "C:\Users\Simon\Anaconda2\lib\site-packages\geoplotlib\core.py", line 367, in start
    l.invalidate(self.proj)
  File "C:\Users\Simon\Anaconda2\lib\site-packages\geoplotlib\layers.py", line 734, in invalidate
    xv, yv = proj.lonlat_to_screen(self.values['lon'], self.values['lat'])
TypeError: list indices must be integers, not str
