# Chapter 7 -- Pandas, Part 2
#### Slicing, Dicing, & Subsetting

The usual preamble to get packages loaded into the namespace.

In [None]:
import numpy as np
import pandas as pd
from numpy.random import randn
from pandas import Series, DataFrame, Index

Read the .csv file with the pandas reader `read_csv()`.

In [2]:
# Create a DataFrame from the UK accidents CSV with the read_csv() reader.

# A raw string keeps every backslash in the Windows path literal. The original
# mixed an unescaped "\D" (an invalid escape sequence) with an escaped "\\u";
# the resulting path value is unchanged.
file_loc = r"C:\Data\uk_accidents.csv"
# low_memory=False is forwarded to read_csv exactly as before.
df = pd.read_csv(file_loc, low_memory=False)

Drop rows with missing values.

In [3]:
# Drop every row of the accidents DataFrame that contains a missing value,
# printing the (rows, columns) shape before and after the drop.

shape_before = df.shape
df = df.dropna()
print(shape_before)
print(df.shape)

(266776, 27)
(266752, 27)


#### Slicing, Dicing, and Subsetting

In [4]:
#df.sort_values(by='Date').tail()

In [5]:
# Use 'Date' as the row index; drop=False keeps 'Date' available as a regular column too.
# Rebinding the result (instead of inplace=True) keeps the statement chainable.
df = df.set_index('Date', drop=False)

In [None]:
# Index as displayed once the rows are sorted by 'Date'
# NOTE(review): the sort_values call earlier in the notebook is commented out — confirm the sort is actually applied.
df.index

In [6]:
# Index as displayed when the rows are NOT sorted
df.index

Index(['1/9/2015', '1/9/2015', '2/23/2015', '2/23/2015', '2/23/2015',
       '2/11/2015', '2/11/2015', '2/23/2015', '2/23/2015', '4/18/2015',
       ...
       '8/30/2015', '11/29/2015', '11/29/2015', '11/29/2015', '7/26/2015',
       '7/26/2015', '12/31/2015', '7/28/2015', '7/28/2015', '7/15/2015'],
      dtype='object', name='Date', length=266752)

In [None]:
#df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Small example frame: two text columns and four numeric columns with some
# missing values. The row label 'f' appears twice in the index.
toy_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6']
toy_index = ['f', 'b', 'c', 'd', 'e', 'f']
toy_rows = [
    ['cold', 'slow', np.nan, 2., 6., 3.],
    ['warm', 'medium', 4, 5, 7, 9],
    ['hot', 'fast', 9, 4, np.nan, 6],
    ['cool', None, np.nan, np.nan, 17, 89],
    ['cool', 'medium', 16, 44, 21, 13],
    ['cold', 'slow', np.nan, 29, 33, 17],
]
df2 = pd.DataFrame(toy_rows, index=toy_index, columns=toy_columns)

In [8]:
# True only when each index label is >= the one before it; a False result means the index is not sorted ascending.
df.index.is_monotonic_increasing

False

In [None]:
# Range for rows with dates between 25Dec15 and 31Dec15, selected by slicing the index labels.
# NOTE(review): label slicing needs a sorted index whose labels compare as dates.
# The index printed earlier holds unsorted 'M/D/YYYY' strings — confirm 'Date' is
# converted with pd.to_datetime and the frame is sorted before this cell runs.
xmas_week = df.loc['12/25/2015':'12/31/2015']

# The original passed '2015-12-25'-'2015-12-31' (a string minus a string, which
# raises TypeError) where a slice was intended. Count the rows of the slice above.
print('There were {} accidents between 25Dec2015 and 31Dec2015:'.format(len(xmas_week)))

In [None]:
# Add an 'id' column numbering the rows 0, 1, 2, ... in their current order.

row_count = len(df)
df['id'] = list(range(row_count))

In [None]:
# Set 'Date' as the row index again (same call as earlier in the notebook);
# drop=False keeps 'Date' as a regular column. Rebinding replaces inplace=True.
df = df.set_index('Date', drop=False)

In [None]:
# First two rows whose index label matches '2015-12-25'.
# NOTE(review): the index printed earlier holds 'M/D/YYYY' strings, so this ISO-style key presumably relies on 'Date' having been converted to datetime — confirm.
df.loc['2015-12-25'].head(2)

In [None]:
# Show the type of the object returned by the single-label .loc lookup.
print(type(df.loc['2015-12-25']))

In [None]:
The built-in function len() returns the length of the row slice.

In [None]:
# How many accidents occurred on Christmas Day?

christmas_rows = df.loc['2015-12-25']
print('There were {} accidents on Christmas 2015.'.format(len(christmas_rows)))

In [None]:
# returns the first rows from 25Dec2015 through 31Dec2015 with only the 'Number_of_Vehicles' and 'Time' columns
# NOTE(review): the slice bounds use two date formats ('2015-12-25' and '2015/12/31'); presumably both are parsed by a datetime index — confirm.

df.loc['2015-12-25':'2015/12/31', ['Number_of_Vehicles', 'Time']].head()

In [None]:
# returns the first rows from 25Dec2015 through 31Dec2015 with only the 'Number_of_Vehicles' and 'Time' columns
# (this statement is identical to the one in the preceding cell)

df.loc['2015-12-25':'2015/12/31', ['Number_of_Vehicles', 'Time']].head()

In [None]:
# Selection with boolean conditions using .loc
# First two rows where Day_of_Week == 6 and Speed_limit >= 70 (all columns)

day_is_6 = df['Day_of_Week'] == 6
limit_70_or_more = df['Speed_limit'] >= 70
df.loc[day_is_6 & limit_70_or_more].head(2)

In [None]:
# A boolean mask selects the rows where Day_of_Week == 6 and Speed_limit >= 70, keeping only the 'Time' column
# NOTE(review): the original comment described day 6 as Saturday — confirm against the dataset's Day_of_Week coding.

df.loc[(df['Day_of_Week'] == 6) & (df['Speed_limit'] >= 70), ['Time']].head()

In [None]:
# Summary statistics from the DataFrame describe() method, using its default percentiles

df.describe()

In [None]:
# Like SELECT DISTINCT in PROC SQL: unique() returns the distinct values found in a column

df['Sex_of_Driver'].unique()

In [None]:
# Similar to SELECT DISTINCT in PROC SQL: the unique() method returns the distinct values of a column
# (this statement is identical to the one in the preceding cell)

df.Sex_of_Driver.unique()

In [None]:
# Count how many rows carry each distinct Sex_of_Driver value.
df.Sex_of_Driver.value_counts()

In [None]:
# From the SAS example above, output from PROC PRINT
#C:\Users\randy\Anaconda3\output

# NOTE(review): Image is not imported in the cells shown here; presumably IPython.display.Image — confirm an import exists.
# The filename is a relative path, so it is resolved against the kernel's working directory.
Image(filename='Anaconda3\\output\\freq_sex_of_driver.JPG')

Square brackets [ ] are used to subset a Python object.  This syntax is illustrated in the cell below.  df1 is a new DataFrame which contains three columns and all rows from DataFrame df.

In [None]:
# Select three columns by name (all rows are kept), then show the resulting shape

wanted_columns = ['Age_of_Driver', 'Sex_of_Driver', 'Time']
df1 = df[wanted_columns]
df1.shape

The analogous SAS program is below.  Notice how the KEEP list is associated with the SET statement, which directs SAS to read just the 3 columns from the input data set.  A KEEP list on the output data set (the DATA statement) returns the same results but instead reads all of the columns from the input data set.

In [None]:
52       data df1;
53          set uk_accidents(keep = age_of_driver sex_of_driver date);

NOTE: 266776 observations were read from "WORK.uk_accidents"
NOTE: Data set "WORK.df1" has 266776 observation(s) and 3 variable(s)

In [None]:
# Wrong syntax example (kept as a deliberate failure): the conditions compare the
# column-name strings themselves rather than the DataFrame columns.
# 'Age_of_Driver' >= 70 compares a str with an int, which raises TypeError in Python 3.
df2 = df[[('Sex_of_Driver' == 2) & ('Age_of_Driver' >= 70)]]

In [None]:
# Correct syntax: build each condition from a DataFrame column, combine the two
# boolean Series with &, and use the combined mask to subset the rows.
is_sex_2 = df['Sex_of_Driver'] == 2
is_70_or_older = df['Age_of_Driver'] >= 70
df2 = df[is_sex_2 & is_70_or_older]
print(type(df2))
print(len(df2))
print(df2.shape)

In [None]:
# Without df in front of the brackets, the expression builds a plain Python list
# whose only element is the combined boolean mask; no rows are selected.
df3 = [(df.Sex_of_Driver == 2) & (df.Age_of_Driver >= 70)]
type(df3)

In [None]:
# Length of the list itself: it holds one element, the boolean mask.
len(df3)

In [None]:
# Take the mask stored as the list's first element and slice it with 1:10.
df3[0][1:10]

In [None]:
# Report how many rows the df2 subset holds.
female_70_plus_count = len(df2)
print("Female Drivers with age >= 70: {0}".format(female_70_plus_count))