In [35]:
import pandas as pd
import numpy as np

#### String to datetime

In [6]:
# pd.to_datetime also accepts an array as its argument!
help(pd.to_datetime)

Help on function to_datetime in module pandas.core.tools.datetimes:

to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', cache=False)
    Convert argument to datetime.
    
    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    
        .. versionadded:: 0.18.1
    
           or DataFrame/dict-like
    
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
    
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input
    dayfirst : boolean, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        with day first (this is a known bug, based on dateutil

In [7]:
# dateutil's parse() reads a date/time string; display its documentation.
from dateutil.parser import parse
help(parse)

Help on function parse in module dateutil.parser._parser:

parse(timestr, parserinfo=None, **kwargs)
    Parse a string in one of the supported formats, using the
    ``parserinfo`` parameters.
    
    :param timestr:
        A string containing a date/time stamp.
    
    :param parserinfo:
        A :class:`parserinfo` object containing parameters for the parser.
        If ``None``, the default arguments to the :class:`parserinfo`
        constructor are used.
    
    The ``**kwargs`` parameter takes the following keyword arguments:
    
    :param default:
        The default datetime object, if this is a datetime object and not
        ``None``, elements specified in ``timestr`` replace elements in the
        default object.
    
    :param ignoretz:
        If set ``True``, time zones in parsed strings are ignored and a naive
        :class:`datetime` object is returned.
    
    :param tzinfos:
        Additional time zone names / aliases which may be present in the
        s

#### Time series basics

In [16]:
# Weekly index anchored to Wednesdays (freq='W-WED'), 100 periods from 2000-01-01.
dates = pd.date_range('2000/1/1', freq='W-WED', periods=100)

# One row of standard-normal noise per date, one column per state.
long_df = pd.DataFrame(
    data=np.random.randn(len(dates), 4),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
    index=dates,
)

In [18]:
# You can select directly by a year-month string; every row with a matching date is returned.
long_df.loc['2001-5']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.302907,0.588677,-1.640485,0.273731
2001-05-09,-0.74385,-1.718399,0.481137,-0.452213
2001-05-16,1.919083,1.506956,-0.492534,-0.362583
2001-05-23,0.940581,-0.305602,1.168225,0.228228
2001-05-30,-0.232412,-0.494634,-0.097365,-2.724059


In [20]:
# truncate() is a useful shorthand for boolean indexing based on index values
# above or below certain thresholds; display its documentation.
help(long_df.truncate)

Help on method truncate in module pandas.core.generic:

truncate(before=None, after=None, axis=None, copy=True) method of pandas.core.frame.DataFrame instance
    Truncate a Series or DataFrame before and after some index value.
    
    This is a useful shorthand for boolean indexing based on index
    values above or below certain thresholds.
    
    Parameters
    ----------
    before : date, string, int
        Truncate all rows before this index value.
    after : date, string, int
        Truncate all rows after this index value.
    axis : {0 or 'index', 1 or 'columns'}, optional
        Axis to truncate. Truncates the index (rows) by default.
    copy : boolean, default is True,
        Return a copy of the truncated section.
    
    Returns
    -------
    type of caller
        The truncated Series or DataFrame.
    
    See Also
    --------
    DataFrame.loc : Select a subset of a DataFrame by label.
    DataFrame.iloc : Select a subset of a DataFrame by position.
    
 

In [21]:
# How to handle duplicate dates: build a series whose index contains
# the date 1/2/2000 three times.
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])
dup_ts = pd.Series(data=np.arange(5), index=dates)

In [24]:
# Check whether the index contains duplicates (is_unique is False when it does).
dup_ts.index.is_unique

False

In [26]:
# This date is duplicated, so indexing by it returns all matching entries.
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [27]:
# One way to de-duplicate: group by the index with groupby(level=0) and aggregate.
dup_ts.groupby(level=0).mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int64

#### Date Ranges, Frequencies, and Shifting

In [50]:
# Generate a date range.
# NOTE: you can specify at most three of the four parameters: start, end, freq, periods.
pd.date_range(start='2012-04-01', end='2012-06-01', freq='5D')

DatetimeIndex(['2012-04-01', '2012-04-06', '2012-04-11', '2012-04-16',
               '2012-04-21', '2012-04-26', '2012-05-01', '2012-05-06',
               '2012-05-11', '2012-05-16', '2012-05-21', '2012-05-26',
               '2012-05-31'],
              dtype='datetime64[ns]', freq='5D')

In [30]:
# N one-minute timestamps starting at 2017-05-20 00:00, paired with the
# integers 0..N-1 and indexed by time.
N = 150
times = pd.date_range('2017-05-20 00:00', periods=N, freq='1min')
df = pd.DataFrame(dict(time=times, value=np.arange(N))).set_index('time')

In [51]:
# resample method
# Resample into 5-minute intervals and take the mean of each interval.
df.resample('5min').mean().head()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,2
2017-05-20 00:05:00,7
2017-05-20 00:10:00,12
2017-05-20 00:15:00,17
2017-05-20 00:20:00,22


In [38]:
# Long-format frame: each timestamp appears three times, with the keys
# 'a', 'b', 'c' cycling and a running float value.
df2 = pd.DataFrame(
    dict(
        time=times.repeat(3),
        key=np.tile(['a', 'b', 'c'], N),
        value=np.arange(N * 3.),
    )
).set_index('time')

In [45]:
# Resample per 'key' by grouping on the key column together with a pd.Grouper(freq=...) object.
# Remember: your index must be the time index!
df2.groupby(['key', pd.Grouper(freq='5min')]).sum().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30.0
a,2017-05-20 00:05:00,105.0
a,2017-05-20 00:10:00,180.0
a,2017-05-20 00:15:00,255.0
a,2017-05-20 00:20:00,330.0
