# Chapter 13 -- Putting it Together

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.charts import Bar, output_file, show
from bokeh.io import output_notebook
from bokeh.charts.attributes import CatAttr
from collections import OrderedDict
import calendar

Read the accidents .csv file

In [2]:
file_loc = "C:\Data\\DftRoad_Safety_2015.csv"
df = pd.read_csv(file_loc,low_memory=False)

In [3]:
print(df.shape, df.ndim)

(140086, 32) 2


Examine the DataFrame attributes

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140086 entries, 0 to 140085
Data columns (total 32 columns):
Accident_Index                                 140086 non-null object
Location_Easting_OSGR                          140058 non-null float64
Location_Northing_OSGR                         140058 non-null float64
Longitude                                      140058 non-null float64
Latitude                                       140058 non-null float64
Police_Force                                   140086 non-null int64
Accident_Severity                              140086 non-null int64
Number_of_Vehicles                             140086 non-null int64
Number_of_Casualties                           140086 non-null int64
Date                                           140086 non-null object
Day_of_Week                                    140086 non-null int64
Time                                           140068 non-null object
Local_Authority_(District)                     140

We need just a few columns.

In [5]:
df = df[['Accident_Index', 'Accident_Severity', 'Number_of_Vehicles', 
'Number_of_Casualties', 'Date', 'Day_of_Week', 'Time', 'Weather_Conditions']]

Shorten column names to reduce the amount of typing needed.

In [6]:
df = df.rename(columns = {
        'Accident_Index':'index',
        'Accident_Severity':'severity',
        'Number_of_Vehicles':'vehicles',
        'Number_of_Casualties':'casualties',
        'Day_of_Week':'day',
        'Weather_Conditions':'weather'
    })

For consistency, make all of the column names lower-case

In [7]:
# Lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)

Review the columns and names we have so far.

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140086 entries, 0 to 140085
Data columns (total 8 columns):
index         140086 non-null object
severity      140086 non-null int64
vehicles      140086 non-null int64
casualties    140086 non-null int64
date          140086 non-null object
day           140086 non-null int64
time          140068 non-null object
weather       140086 non-null int64
dtypes: int64(5), object(3)
memory usage: 8.6+ MB


Check for missing values

In [9]:
df.isnull().any()

index         False
severity      False
vehicles      False
casualties    False
date          False
day           False
time           True
weather       False
dtype: bool

Inspect the values for time

In [10]:
df[df['time'].isnull()].head()

Unnamed: 0,index,severity,vehicles,casualties,date,day,time,weather
44671,2.02E+12,3,2,1,5/1/2015,2,,1
45170,20151322B0546,3,2,2,11/2/2015,4,,1
45290,20151322L0759,2,2,1,21/02/2015,7,,1
45302,20151322M0629,3,2,1,22/02/2015,1,,1
46078,20151324N1112,2,2,1,23/04/2015,5,,1


In [11]:
# Creates a DataFrame and filters day = 6 and uses the len function to 'count'
#NEED BETTER WAY TO DO THIS--SEE NEXT
accidents_sat = df.loc[df.day == 6]
print("Accidents which happened on Friday: {0}".format(
    len(accidents_sat)))
print(type(accidents_sat))


Accidents which happened on Friday: 22380
<class 'pandas.core.frame.DataFrame'>


In [12]:
df_sat = df.query("day == 6")
print("Accidents which happened on Friday: {0}".format(
    len(df_sat)))

Accidents which happened on Friday: 22380


In [13]:
#We want to find the date ranges in out table, but the date column is type 'object' and not datetime
# dtype of 'O' is an object aka string
df['date'].dtype

dtype('O')

In [14]:
# Since this is a timeseries, we set the date column as the index
#df2.set_index('date',inplace=True)

In [15]:
# dataframe where day = Friday
df_sat2 = df[(df.day == 6)]
df_sat2.shape

(22380, 8)

In [17]:
# returns rows between 15Dec2015 and 31DecJan2015 and the associated 'Number_of_Vehicles' and 'Time' values

df.loc['2015-12-25':'2015/12/31', ['vehicles', 'time']].head()

Unnamed: 0,vehicles,time


In [18]:
# How many accidents occured on Christman Day?

print('There were {} accidents on Christmas 2015.'.format(len(df.loc['2015-12-25'])))

KeyError: 'the label [2015-12-25] is not in the [index]'

In [None]:
# selecting all columns by name

df1 = df[['Age_of_Driver', 'Sex_of_Driver', 'Time']]
df1.shape

The analog SAS program is below.  Notice how the KEEP list is associated with the SET statement which directs SAS to read just the 3 columns from the input data set.  The KEEP list on the input data set returns the same results but instead reads all of the columns.

In [None]:
52       data df1;
53          set uk_accidents(keep = age_of_driver sex_of_driver date);

NOTE: 266776 observations were read from "WORK.uk_accidents"
NOTE: Data set "WORK.df1" has 266776 observation(s) and 3 variable(s)

In [None]:
# dataframe where day = Friday and weather = raining
df_sat3 = df[(df.day == 6) & (df.weather == 2)]
df_sat3.shape

The example below illustrates the use of column as an attribute.  You utilize the square brackets to create a slice.  This creates a DataFrame selecting the rows expressed by the boolean condition.  Observe how the name of the DataFrame appears before the square brackets [ ].  Without this object name, the syntax produces a list as shown in the example two cells below.

What is needed, is both distinct values and a count.  The value_counts() method is analogus to PROC FREQ.  Later in this chapter, we will see additional DataFrame methods like crosstab() to render results in a less spartan fashion.

In [None]:
# similar to PROC SQL; select distinct, the unique() method finds unique values for a variable

df.Sex_of_Driver.unique()

In [None]:
50       proc freq data = uk_accidents;
51       tables sex_of_driver / missing missprint nocum nopercent;

In [None]:
# From the SAS example above, output from PROC PRINT
#C:\Users\randy\Anaconda3\output

Image(filename='Anaconda3\\output\\freq_sex_of_driver.JPG')

In [None]:
#Correct syntax
df2 = df[(df.Sex_of_Driver == 2) & (df.Age_of_Driver >= 70)]
print(type(df2))
print(len(df2))
print(df2.shape)

In [19]:
df.describe(percentiles=None)

Unnamed: 0,severity,vehicles,casualties,day,weather
count,140086.0,140086.0,140086.0,140086.0,140086.0
mean,2.833895,1.841005,1.329248,4.095077,1.5117
std,0.402014,0.710215,0.795125,1.911234,1.520103
min,1.0,1.0,1.0,1.0,1.0
25%,3.0,1.0,1.0,2.0,1.0
50%,3.0,2.0,1.0,4.0,1.0
75%,3.0,2.0,1.0,6.0,1.0
max,3.0,37.0,38.0,7.0,9.0


In [20]:
equiv={1:'Sunday', 
       2:'Monday', 
       3:'Tuesday', 
       4:'Wednesday',
       5:'Thursday',
       6:'Friday',
       7:'Saturday'
      }

df["day_fmt"] = df["day"].map(equiv)


In [21]:
df.set_index('day_fmt', drop=False, inplace=True)

In [22]:
hist = df.groupby('day_fmt')['casualties'].aggregate(['count']).sort_values('count')
print(hist)
type(hist)
type(hist.info())

           count
day_fmt         
Sunday     15259
Saturday   18117
Monday     20036
Wednesday  21371
Tuesday    21433
Thursday   21490
Friday     22380
<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Sunday to Friday
Data columns (total 1 columns):
count    7 non-null int64
dtypes: int64(1)
memory usage: 112.0+ bytes


NoneType

if you prefer graphs

In [23]:
output_notebook()

In [24]:
x = [calendar.day_name[i-1] for i in range(7)]
x

['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

In [25]:
df['dow'] = [calendar.day_name[i-1] for i in range(7)]
p = Bar(hist, label=CatAttr(columns=['dow'], sort=False), 
        values='count', title='Casualties by Day of Week', xlabel="Day", ylabel="Value")
show(p)

ValueError: Length of values does not match length of index

In [None]:

p = Bar(df, label='day',
            values='day', color='wheat', 
            agg='count', title="Casualties by Day of Week")

show(p)

In [None]:
df['dow'] = [calendar.day_name[i] for i in range(7)]
df['dow']

In [None]:

hist.plot(kind='bar', title='Casualties by Day of Week')

In [None]:
# think of these in terms first coined by Hadley Wickham of Rstats fame: Split, Apply, Combine
df.groupby('day_fmt')['casualties'].mean()


In [None]:
# Alternatively using the aggregate() method
df.groupby('day_fmt')['casualties'].aggregate(['mean'])

In [None]:
df.groupby('day_fmt')['casualties'].describe().unstack()

In [None]:
df.groupby('day_fmt')['casualties'].aggregate(['max', 'std', 'count'])

In [None]:
# Pivot_table example
df.pivot_table(index='day', columns='severity', aggfunc={'casualties' : 'mean', 'vehicles' : 'sum'})

In [None]:
print(type(df.pivot_table))

In [None]:
df.pivot_table('casualties', index='severity', columns='day', margins=True)

In [None]:
#NOT WORKING
#daily_cnt = df2.set_index('date').df2.groupby([df2['date'],pd.TimeGrouper(freq='Day')])

In [None]:
df['date_num'] = pd.to_datetime(df['date'])


In [None]:
# Range for rows with dates between 25Dec15 and 31Dec15
df.loc['12/25/2015':'12/31/2015']

print('There were {} accidents between 25Dec2015 and 31Dec2015:'.format(len(df.loc['2015-12-25'-'2015-12-31'])))

You can use a range for both row and column selection.  In the example below, rows are from the data range (start and end dates) inclusive.  This is equivalent to SAS' WHERE clause:

    (where=(date between "25Dec2015"d and "31Dec2015"d))
    
The SAS example is shown below.

In [None]:
61       data _null_ (keep=date);
62          set uk_accidents(where=(date between "25Dec2015"d and "31Dec2015"d)) end=end;
63       retain count 0;
64       
65       count+1;
66          if end then put 'There were ' count' accidents between 25Dec15 and 31Dec15';

There were 53373 accidents between 25Dec15 and 31Dec15

The next two examples use a boolean mask with .loc() method.  The mask is a Series of True/False values the .loc() indexer uses to select those rows where the condition(s) evaluate True.  Notice we are not relying on the indexed column Date, however, this column is returned since it is remains set as the index for the DataFrame.

In [None]:
# selection with boolean conditions using .loc() method 
# All rows for Saturday, speed limit > 70 and the associated 'Time' values

df.loc[(df['Day_of_Week'] == 6) & (df['Speed_limit'] >= 70)].head(2)

In [None]:
# A boolean mask used to select the records for Saturday, speed limit > 70, and the associated column 'Time'

df.loc[(df['Day_of_Week'] == 6) & (df['Speed_limit'] >= 70), ['Time']].head()

In [None]:
###NOT WORKING
# what are the dates for the worst accident (Largest number of vehicles involved, casualties > 10)
print(df.casualties.max())
def filter_func(x):
    return x['casualties'].max() == df.casualties.max()
out=df.groupby('date').filter(filter_func)
out.head()

In [None]:
# FROM: http://stackoverflow.com/questions/25071937/filter-pandas-dataframe-based-on-max-values-in-a-column
#df.iloc[df.groupby('date')['casualties'].idxmax().values.ravel()]

In [None]:
print(type(df.date_num))

In [None]:
df.info()

In [None]:
resamp = df.set_index('date_num').groupby('severity').resample('M').sum()

In [None]:
resamp.head(30)

In [None]:
resamp.index

In [None]:
start = df.date_num.min()
print(start)
end = df.date_num.max()
print(end)

In [None]:
df['date_num'].dtype

In [None]:
df.info()

In [None]:
del df['date']

In [None]:
df = df.rename(columns = {'date_num':'date'})

In [None]:
df.info()

In [None]:
#NOT SURE WHEN THIS NEEDED******************************
#Time Series for plotting
#ts = df2.set_index(['date'])

In [None]:
# crosstab for number of accidents by day of week

pd.crosstab(df.casualties.sum(), df.day, rownames=['Casualties'], colnames=['Day of Week'])

In [None]:
equiv={1:'1=Sunday', 
       2:'2=Monday', 
       3:'3=Tuesday', 
       4:'4=Wednesday',
       5:'5=Thursday',
       6:'6=Friday',
       7:'7=Saturday'
      }

df["day_str"] = df["day"].map(equiv)

In [None]:
# crosstab for number of accidents by day of week
pd.crosstab(df.casualties.sum(), df.day_str, rownames=['Casualties'], colnames=['Day of Week'])


In [None]:
equiv={1:'1=Sunny', 
       2:'2=Raining', 
       3:'3=Snowing', 
       4:'4=Sunny + windy',
       5:'5=Raining + windy',
       6:'6=Snowing + windy',
       7:'7=Foggy',
       8:'8=Other',
       9:'9=Unknown'
      }

df["weather_str"] = df["weather"].map(equiv)

In [None]:
# crosstab for number of accidents by day of week with weather conditions

pd.crosstab(df.weather_str, df.day_str, df.casualties, aggfunc=[len],
            rownames=['Weather'], colnames=['Day of Week'])

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
fig, ax = plt.subplots(figsize=(10, 8))

df.pivot_table('casualties', index='day_str',
                    columns='severity', aggfunc='mean').plot(ax=ax)
plt.gca().set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean casualties by day');

In [None]:
df.index

In [None]:
# Create a new data frame by selecting where day = Friday and weather = Sunny

print(len(df.index))

In [None]:
# Create a new data frame by selecting where day = Friday and weather = Sunny


friday = df.day == 6 # day is the numeric variable
sunny = df.weather == 1 #weather is a numeric variable

#friday and sunny are boolean variables (True or False)

df_sub1 = df[friday & sunny]
len(df_sub1.index)

type(df_sub1)
df_sub1.info()


In [None]:
# Create a new data frame by selecting where day = Friday and weather = Sunny
# Method 2 Using Variables attributes
#df.loc[(df['A'] == 'foo') & (df['B'] == 'one')]

rows= df.loc[(df['day'] == 6) & (df['weather'] == 1)]

In [None]:
print(df.size, df.shape, df.ndim)

In [None]:
print(data)

In [None]:
'a' in data

In [None]:
0.25 in data

In [None]:
data.keys()

In [None]:
df.set_index(df.date)

In [None]:
df.loc[0]

In [None]:
#from chapter 9 timeseries
counts = default_srt['index_nsa'].value_counts() 
counts.head(20)