In [1]:
# vectorized arithmetic: NumPy applies the operation to every element of the array at once
import numpy as np
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [2]:
# Vectorization of operations simplifies the syntax of operating on arrays of data.
# NumPy does not offer such simple access for string data, so a plain Python
# loop (here a list comprehension) is used to capitalize each string.
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [3]:
# the list comprehension breaks if there are any missing values: None has no .capitalize() method, so it raises AttributeError
# data = ['peter', 'Paul', None, 'MARY', 'gUIDO'] 
# [s.capitalize() for s in data]

In [4]:
# pandas supports vectorized string operations and handles missing values
import pandas as pd
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)  # the None entry is kept in the Series as a missing value
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [5]:
# capitalize every entry through the .str accessor; the missing value is skipped instead of raising
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [6]:
# Tables of Pandas String Methods
# sample Series of names used to demonstrate the .str methods
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [7]:
# lower-case every name
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [8]:
# length of each string; the result is an integer Series
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [9]:
# boolean mask: does each name start with 'T'?
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [10]:
# split each name on whitespace; every element becomes a list of words
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [11]:
# methods using regular expressions
# pull the first run of ASCII letters (the first name) out of each element;
# expand=True is the default and returns the capture group as a DataFrame column
first_name_pattern = r'([A-Za-z]+)'
monte.str.extract(first_name_pattern, expand=True)

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


In [12]:
# start-of-string (^) and end-of-string ($) regular expression characters:
# match names whose first character is not an uppercase vowel and whose last
# character is not a lowercase vowel; non-matching names give an empty list
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [13]:
# miscellaneous methods
# slice(0, 3) keeps the first three characters of each string
monte.str.slice(0,3)

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [14]:
# the same slice written with Python's indexing syntax on the .str accessor
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [15]:
# df.str.get(i) and df.str[i] are similar: both pick element i from each entry
# here: the last word of each split name, i.e. the last name
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [16]:
# get_dummies() is useful when a column contains some sort of coded indicator;
# build a frame with the names plus a '|'-separated code string per person
full_monte = monte.to_frame('name').assign(
    info=['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D'],
)
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [17]:
# split the 'info' codes on '|' and expand them into one 0/1 indicator column per code
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [18]:
# working with time series
# Python's basic date/time tools: the built-in datetime module and the third-party dateutil module
from datetime import datetime
datetime(year = 2015, month = 7, day = 4)

datetime.datetime(2015, 7, 4, 0, 0)

In [19]:
# use dateutil to parse dates from a variety of string formats;
# the result is a standard datetime.datetime object
from dateutil import parser
date = parser.parse('4th of July, 2015')
date

datetime.datetime(2015, 7, 4, 0, 0)

In [20]:
# standard strftime format code: %A gives the full weekday name
date.strftime('%A')

'Saturday'

In [21]:
# NumPy's native time series data type is datetime64
date = np.array('2015-07-04', dtype=np.datetime64)
date

array('2015-07-04', dtype='datetime64[D]')

In [22]:
# once the date is stored as datetime64, vectorized operations work on it:
# adding the integers 0..9 yields ten consecutive days
date + np.arange(10)

array(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
       '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
       '2015-07-12', '2015-07-13'], dtype='datetime64[D]')

In [23]:
np.datetime64('2015-07-04')  # day-based datetime: the unit is inferred from the input string

numpy.datetime64('2015-07-04')

In [24]:
np.datetime64('2015-07-04 12:00')  # minute-based datetime

numpy.datetime64('2015-07-04T12:00')

In [25]:
# you can force any desired fundamental unit using one of many format codes;
# here the time is stored with nanosecond ('ns') resolution
np.datetime64('2015-07-04 12:59:59.50', 'ns')

numpy.datetime64('2015-07-04T12:59:59.500000000')

In [26]:
# datetime64 unit codes:
# Y for year, M for month, W for week, D for day, h for hour, m for minute, s for second,
# ms for millisecond, us for microsecond, ns for nanosecond, ps for picosecond,
# fs for femtosecond, as for attosecond

In [27]:
# use pandas to parse a flexibly formatted string date;
# passing a single date string yields a pandas Timestamp
# (the ordinal is written '4th', without a stray space, like the other date strings in this notebook)
date = pd.to_datetime('4th of July, 2015')
date

Timestamp('2015-07-04 00:00:00')

In [28]:
# use a strftime format code to output the day of the week
date.strftime('%A')

'Saturday'

In [29]:
# NumPy-style vectorized operation directly on the Timestamp:
# adding timedeltas of 0..9 days yields a DatetimeIndex of ten consecutive days
date + pd.to_timedelta(np.arange(10), 'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13'],
              dtype='datetime64[ns]', freq=None)

In [30]:
# indexing by time: build a Series whose index is a DatetimeIndex of timestamps
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',
                          '2015-07-04', '2015-08-04'])
data = pd.Series([0,1,2,3], index = index)
data

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [31]:
# the usual Series indexing now works with date strings;
# slicing by date includes both endpoints
data['2014-07-04':'2015-07-04']

2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

In [32]:
# date-only shortcut: passing a year returns a slice of all data from that year
data['2015']

2015-07-04    2
2015-08-04    3
dtype: int64

In [33]:
# pandas time series data structures and their associated index structures:
#   time stamps  -> DatetimeIndex
#   time periods -> PeriodIndex
#   time deltas  -> TimedeltaIndex
# Passing a single date to pd.to_datetime() yields a Timestamp;
# passing a series of dates yields a DatetimeIndex by default.
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [34]:
# a DatetimeIndex can be converted to a PeriodIndex with to_period() and a frequency code;
# 'D' requests daily periods
dates.to_period('D')

PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]', freq='D')

In [35]:
# a TimedeltaIndex is created when one date is subtracted from another;
# here every date is measured relative to the first one
dates-dates[0]

TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

In [36]:
# regular date sequences:
#   pd.date_range() for timestamps
#   pd.period_range() for periods
#   pd.timedelta_range() for time deltas
# given a start date and an end date, date_range steps by one day by default
pd.date_range('2015-07-03', '2015-07-10')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [37]:
# alternative: give a start point and a number of periods instead of an end date
pd.date_range('2015-07-03', periods=8)

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [38]:
# the default frequency is D (daily), but freq can request another spacing,
# here a range of hourly timestamps; the lowercase alias 'h' is used because
# the uppercase 'H' alias is deprecated since pandas 2.2
pd.date_range('2015-07-03', periods=8, freq='h')

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')

In [39]:
# a regular sequence of periods: eight monthly periods starting at 2015-07
pd.period_range('2015-07', periods=8, freq='M')

PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
             '2016-01', '2016-02'],
            dtype='period[M]', freq='M')

In [40]:
# a sequence of durations increasing by one hour;
# lowercase 'h' replaces the 'H' alias, which is deprecated since pandas 2.2
pd.timedelta_range(0, periods=10, freq='h')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',
                '0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
                '0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
                '0 days 09:00:00'],
               dtype='timedelta64[ns]', freq='H')

In [41]:
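# frequency codes: D calendar day, B business day, W weekly, M month end, BM business month end,
# Q quarter end, BQ business quarter end, A year end, BA business year end,
# H hours, BH business hours, T minutes, S seconds
# adding an S suffix to M, BM, Q, etc. marks the beginning instead of the end, e.g. MS month start, QS quarter start
# you can change the month used to mark a quarterly or annual code by adding a three-letter month code as a suffix:
# Q-JAN, BQ-FEB, QS-MAR, BQS-APR
# A-JAN, BA-FEB, AS-MAR, BAS-APR
# the split point of a weekly frequency can be changed the same way with a three-letter weekday code:
# W-SUN, W-MON, W-TUE, W-WED
# note: pandas >= 2.2 deprecates several of these offset aliases in favour of new spellings
# (e.g. H -> h, T -> min, S -> s, M -> ME, Q -> QE, A -> YE)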

In [42]:
# codes can be combined with numbers to specify other frequencies:
# 2 hours + 30 minutes gives a step of 2.5 hours
# ('h' and 'min' replace the 'H' and 'T' aliases, which are deprecated since pandas 2.2)
pd.timedelta_range(0, periods=9, freq="2h30min")

TimedeltaIndex(['0 days 00:00:00', '0 days 02:30:00', '0 days 05:00:00',
                '0 days 07:30:00', '0 days 10:00:00', '0 days 12:30:00',
                '0 days 15:00:00', '0 days 17:30:00', '0 days 20:00:00'],
               dtype='timedelta64[ns]', freq='150T')

In [43]:
# the pd.tseries.offsets module holds the pandas time series offset classes;
# BDay() is the business-day offset, so the weekend of 2015-07-04/05 is skipped
from pandas.tseries.offsets import BDay
pd.date_range('2015-07-01', periods = 5, freq = BDay())

DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-06',
               '2015-07-07'],
              dtype='datetime64[ns]', freq='B')