In [1]:
#Series as one-dim array
import pandas as pd
import numpy as np

In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])
data['a':'c'],data[0:2],data[(data>0.3)&(data<0.8)]

(a    0.25
 b    0.50
 c    0.75
 dtype: float64, a    0.25
 b    0.50
 dtype: float64, b    0.50
 c    0.75
 dtype: float64)

In [4]:
#fancy indexing
data[['a','d']]

a    0.25
d    1.00
dtype: float64

# indexers:loc,iloc,ix

In [5]:
#some confusions
data = pd.Series(['a','b','c'],index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [6]:
#explicit index when indexing
data[1]

'a'

In [7]:
#implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

因为整数索引可能造成这样的迷惑，pandas提供了特殊的indexers属性的索引

In [8]:
#loc属性关联到显式的索引
data.loc[1]

'a'

In [9]:
data.loc[1:3]

1    a
3    b
dtype: object

In [10]:
#iloc属性关联到隐式的索引
data.iloc[1]

'b'

In [11]:
data.iloc[1:3]

3    b
5    c
dtype: object

# Data Selection in DataFrame

记住可以从两种角度看待dataframe，一种是二维数组，另一种是由Series构成的字典

In [13]:
#DataFrame as a dictionary
area = pd.Series({'California': 423967, 'Texas': 695662,
                'New York': 141297, 'Florida': 170312,
                'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                'New York': 19651127, 'Florida': 19552860,
                'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [15]:
data['area'],data.area

(California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64, California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64)

In [16]:
data['density'] = data['pop']/data['area']

In [17]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [18]:
#DataFrame as two-dimensional array

In [19]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [20]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [21]:
#隐式索引 position based indexing
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [22]:
#显式索引 label based indexing
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [23]:
#The ix indexer allows a hybrid of these two approaches:
data.ix[:3,:'pop']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [24]:
data.loc[data.density>100,['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


# Operating on Data in Pandas

# Ufuncs:index preservation

NumPy ufunc will work on Pandas Series and DataFrame objects
我们可以对Series和DataFrame执行numpy的函数，返回的是另外的对象但是索引被保存下来

In [26]:
import pandas as pd
import numpy as np
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))

In [27]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns=['a','b','c','d'])

In [28]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [29]:
np.sin(df+4)

Unnamed: 0,a,b,c,d
0,-0.544021,0.420167,-0.279415,-0.544021
1,-0.99999,0.989358,0.656987,-0.99999
2,-0.99999,-0.279415,0.412118,0.989358


## UFuncs: Index Alignment

对二元操作来说，pandas会将索引自动对齐，这在数据量不全的时候很有帮助。

In [30]:
#UFuncs: Index Alignment
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                    'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                    'New York': 19651127}, name='population')
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [31]:
A = pd.Series([2, 4, 6], index=[0, 1, 2]) 
B = pd.Series([1, 3, 5], index=[1, 2, 3]) 
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

如果想要填充空值，可以使用对象方法来替代操作符

In [32]:
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [33]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                columns=list('BAC'))
A+B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [34]:
fill = A.stack().mean()
fill

4.5

In [35]:
A.add(B,fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


# python操作符和对应的pandas对象方法

## Ufuncs: Operations Between DataFrame and Series

In [47]:
#根据broadcasting rules,subtraction between a two-dimensional array and one of its rows is applied row-wise
A = rng.randint(10, size=(3, 4))
A-A[0]

array([[2, 2, 0, 4],
       [9, 6, 9, 8],
       [6, 8, 7, 1]])

In [45]:
#In Pandas, the convention similarly operates row-wise by default
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]
#if you would like to operate column-wise,axis = 0
df.sub(df['R'],axis=0)

Unnamed: 0,Q,R,S,T
0,8,0,8,6
1,1,0,-7,0
2,5,0,-2,5


In [48]:
df

Unnamed: 0,Q,R,S,T
0,8,0,8,6
1,8,7,0,7
2,7,2,0,7


In [50]:
half = df.iloc[0,::2]
half

Q    8
S    8
Name: 0, dtype: int64

In [51]:
df -half

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,-8.0,
2,-1.0,,-8.0,


# Handling Missing Data

## None: Pythonic missing data

In [2]:
#None is a object 如果在数组中有none,那么它是object类型
vals = np.array([1,None,3,4])

In [3]:
#所以不能执行聚合操作
vals.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

# NaN:Missing numerical data

In [8]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [4]:
#it is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation

In [6]:
#和nan值进行的算数运算返回的结果都是nan
1+np.nan

nan

In [10]:
#可以执行聚合操作，但是得不到想要的结果
vals2.sum()

nan

In [12]:
#如果想要忽略nan值
np.nansum(vals2)

8.0

In [14]:
#自动类型转换float64 
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [16]:
#会将None值转化为NaN
x = pd.Series(range(2),dtype=int)
x[0] = None
x

0    NaN
1    1.0
dtype: float64

In [18]:
#detecting null values
data = pd.Series([1,np.nan,'hello',None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [20]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [21]:
#dropping null values
data.dropna()
#for dataframe 我们不能删除单个null值，我们只能删除整行或者整列
df = pd.DataFrame([[1,      np.nan, 2],
                  [2,      3,      5],
                [np.nan,4,6]])
#how默认参数是any,也就是某行或者某列只要含有Null值就删除
#axis默认参数是删除行，参数axis=1 or axis = 'columns'删除列
df.dropna()
#how=all 整行或整列全是null值才删除
#thresh参数可以控制某一行或者某一列最少的非空值数量

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [23]:
#filling null value
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data.fillna(0)#返回的是一个新的对象

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [25]:
data.fillna(method='ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [26]:
data.fillna(method='bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

# Combining Datasets:Contat and Append

In [29]:
def make_df(cols,ind):
    data = {c:[str(c)+str(i) for i in ind]for c in cols}
    return pd.DataFrame(data,ind)

In [32]:
#recall concatenation of numpy arrays
x= [1,2,3]
y= [4,5,6]
z= [7,8,9]
np.concatenate([x,y,z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [35]:
df = make_df('AB',[0,1])
df2 = make_df('CD',[0,1])
pd.concat([df,df2],axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [43]:
pd.concat([df,df2])

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
0,,,C0,D0
1,,,C1,D1


In [36]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index
pd.concat([x,y])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [38]:
try:
    pd.concat([x,y],verify_integrity=True)
except ValueError as e:
    print(e)

Indexes have overlapping values: [0, 1]


In [39]:
pd.concat([x,y],ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [41]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
pd.concat([df5,df6],join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [42]:
pd.concat([df5,df6],join_axes=[df5.columns])

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


In [48]:
df5.append(df6)

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


# Combining Datasets: Merge and Join

In [44]:
#one to one
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                        'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [45]:
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [46]:
df3 = pd.merge(df1,df2)

In [47]:
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [49]:
#many to one joins
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
pd.merge(df3,df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [50]:
#many to many

In [51]:
#on
pd.merge(df1,df2,on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [53]:
#left_on and right_on
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]}) 
print(df1); print(df3);print(pd.merge(df1, df3, left_on="employee", right_on="name"))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000


In [54]:
#the left_index and right_index 

In [58]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
pd.merge(df1a,df2a,left_index=True,right_index=True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [59]:
# join performs a merge that defaults to joining on indices
df1a.join(df2a)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [60]:
print(df1a),print(df3)

(                group
 employee             
 Bob        Accounting
 Jake      Engineering
 Lisa      Engineering
 Sue                HR,    name  salary
 0   Bob   70000
 1  Jake   80000
 2  Lisa  120000
 3   Sue   90000)

In [61]:
pd.merge(df1a,df3,left_index=True,right_on='name')

Unnamed: 0,group,name,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,120000
3,HR,Sue,90000


In [62]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},
columns=['name', 'drink'])

In [63]:
pd.merge(df6,df7,how='outer')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [64]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                                'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
pd.merge(df8,df9,on='name',suffixes=['_L','_R'])

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


# Aggregation and Grouping

In [66]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [67]:
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [68]:
#count(),first(),last(),mean(),median(),min(),max(),std(),var(),mad(),product(),sum()

## GroupBy: Split, Apply, Combine

In [69]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],'data':range(6)},columns=['key','data'])
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [70]:
planets.groupby('method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [72]:
# Iteration over groups
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method,group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


## Aggregation

In [74]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],'data1': range(6),'data2': rng.randint(0, 10, 6)},columns = ['key', 'data1', 'data2'])

In [82]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [75]:
# GroupBy aggregations with sum(), median(), and the like, but the aggregate() method allows for even more flexibility

In [76]:
df.groupby('key').aggregate(['min',np.median,max])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [77]:
df.groupby('key').aggregate({'data1':'min','data2':'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


## Filtering

In [78]:
def filter_func(x):
    return x['data2'].std()> 4
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [79]:
df.groupby('key').std()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641


In [80]:
df.groupby('key').filter(filter_func)

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


## Transforation

In [81]:
df.groupby('key').transform(lambda x:x-x.mean())

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


## apply

In [83]:
df.groupby('key').apply(sum)

Unnamed: 0_level_0,key,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,AA,3,8
B,BB,5,7
C,CC,7,12


In [85]:
#A list, array, series, or index providing the grouping keys
L = [0,1,0,1,2,0]
df.groupby(L).sum()

Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [86]:
#A dictionary or series mapping index to group
df2 = df.set_index('key')
mapping = {'A':'vowel','B':'consonant','C':'consonant'}
df2.groupby(mapping).sum()

Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


In [87]:
#Any Python functio
df2.groupby(str.lower).mean()

Unnamed: 0,data1,data2
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


In [88]:
# A list of valid keys
df2.groupby([str.lower,mapping]).mean()

Unnamed: 0,Unnamed: 1,data1,data2
a,vowel,1.5,4.0
b,consonant,2.5,3.5
c,consonant,3.5,6.0


In [89]:
#vectorized string operations
#numpy 不提供字符串的向量化，并且不能包含none
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [90]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                              'Eric Idle', 'Terry Jones', 'Michael Palin'])


In [91]:
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [92]:
full_monte = pd.DataFrame({'name': monte,'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C','B|C|D']})
full_monte

Unnamed: 0,info,name
0,B|C|D,Graham Chapman
1,B|D,John Cleese
2,A|C,Terry Gilliam
3,B|D,Eric Idle
4,B|C,Terry Jones
5,B|C|D,Michael Palin


In [95]:
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


# time

In [97]:
#python原生的时间日期：datetime and dateutil
#Typed arrays of times:Numpy's datetime64
#pandas
date = pd.to_datetime("4th of july,2015")

In [98]:
date

Timestamp('2015-07-04 00:00:00')

In [99]:
date.strftime('%A')

'Saturday'

In [100]:
date+pd.to_timedelta(np.arange(12),'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

In [103]:
from datetime import datetime
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                                   '2015-Jul-6', '07-07-2015', '20150708'])
dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [104]:
dates.to_period('D')

PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]', freq='D')

In [105]:
dates-dates[0]

TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

In [106]:
pd.date_range('2018-07-01','2018-08-01')

DatetimeIndex(['2018-07-01', '2018-07-02', '2018-07-03', '2018-07-04',
               '2018-07-05', '2018-07-06', '2018-07-07', '2018-07-08',
               '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12',
               '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16',
               '2018-07-17', '2018-07-18', '2018-07-19', '2018-07-20',
               '2018-07-21', '2018-07-22', '2018-07-23', '2018-07-24',
               '2018-07-25', '2018-07-26', '2018-07-27', '2018-07-28',
               '2018-07-29', '2018-07-30', '2018-07-31', '2018-08-01'],
              dtype='datetime64[ns]', freq='D')

In [107]:
pd.date_range('2018-07-01',periods=8)

DatetimeIndex(['2018-07-01', '2018-07-02', '2018-07-03', '2018-07-04',
               '2018-07-05', '2018-07-06', '2018-07-07', '2018-07-08'],
              dtype='datetime64[ns]', freq='D')

In [108]:
pd.date_range('2018-07-01',periods=8,freq='H')

DatetimeIndex(['2018-07-01 00:00:00', '2018-07-01 01:00:00',
               '2018-07-01 02:00:00', '2018-07-01 03:00:00',
               '2018-07-01 04:00:00', '2018-07-01 05:00:00',
               '2018-07-01 06:00:00', '2018-07-01 07:00:00'],
              dtype='datetime64[ns]', freq='H')

In [109]:
pd.period_range('2015-08',periods=8,freq='H')

PeriodIndex(['2015-08-01 00:00', '2015-08-01 01:00', '2015-08-01 02:00',
             '2015-08-01 03:00', '2015-08-01 04:00', '2015-08-01 05:00',
             '2015-08-01 06:00', '2015-08-01 07:00'],
            dtype='period[H]', freq='H')

In [110]:
pd.timedelta_range(0, periods=10, freq='H')

TimedeltaIndex(['00:00:00', '01:00:00', '02:00:00', '03:00:00', '04:00:00',
                '05:00:00', '06:00:00', '07:00:00', '08:00:00', '09:00:00'],
               dtype='timedelta64[ns]', freq='H')