# introducing pandas objects
有三种基本的数据结构：Series,DataFrame,Index

## Series对象

In [2]:
import pandas as pd
import numpy as np

In [3]:
# A Series is a one-dimensional array of data paired with an index
data = pd.Series(
    [0.25, 0.5, 0.75, 2.0]
)

In [5]:
# The data and the labels of a Series are exposed through its
# `values` and `index` attributes.
# `values` is the underlying data as a NumPy array
# (not displayed here: only the last expression of a cell is shown)
data.values
# `index` is an array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

#### 是通用的numpy数组，本质差异是索引，numpy数组是隐式定义的整数索引，而pandas的索引是显式定义的。

In [8]:
# More flexible than a NumPy array: the index labels can be strings
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
# The index labels do not have to be contiguous or sequential integers
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 3, 5, 7])

#### 是特殊的字典

In [6]:
# A Series can be used like a specialized dictionary:
# build one directly from a plain dict that maps state name -> population.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
# Unlike a dictionary, a Series also supports array-style operations
# such as slicing (result not displayed: it is not the last expression)
population['California':'Illinois']
# The index holds the dictionary keys, in the order they appear in the dict
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

#### 创建Series的方法 pd.Series(data, index=index)

In [7]:
# data given as a list: the index defaults to an integer sequence starting at 0
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [8]:
# data given as a scalar: the value is repeated for every label of the given index
pd.Series(5,index=[10,23,24])

10    5
23    5
24    5
dtype: int64

In [9]:
# data given as a dictionary: the dictionary keys become the index labels
pd.Series({3:'s',9:'q',8:'a'})

3    s
9    q
8    a
dtype: object

## DataFrame

#### 作为通用型的numpy数组

In [10]:
# A DataFrame viewed as a generalized NumPy array
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
# Combine Series objects that share the same row index into one DataFrame
states = pd.DataFrame({'population': population, 'area': area})
# A DataFrame has an `index` attribute (the row labels, as on a Series)
# and a `columns` attribute (an Index holding the column labels)
states.index, states.columns
# Next: the DataFrame viewed as a specialized dictionary

(Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object'),
 Index(['population', 'area'], dtype='object'))

#### 作为特殊的字典 ，一列映射一个Series的对象

In [11]:
# Indexing a DataFrame with a column name returns that column as a Series
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

#### 创建dataframe的方法

In [12]:
# Ways to construct a DataFrame.
# Only the last expression of the cell is displayed.

# 1. From a single Series object
pd.DataFrame(population, columns=['population'])

# 2. From a list of dictionaries (built here with a list comprehension)
data = [{'a': n, 'b': n * 2} for n in range(3)]
pd.DataFrame(data)
# The dictionaries do not need to share the same keys
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

# 3. From a dictionary of Series objects, just like before
pd.DataFrame({'population': population, 'area': area})

# 4. Directly from a two-dimensional NumPy array,
#    with explicit column names and row labels
pd.DataFrame(
    np.random.rand(3, 2),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c'],
)

Unnamed: 0,foo,bar
a,0.721953,0.064576
b,0.628724,0.429052
c,0.450269,0.752368


## Index

In [13]:
# Build an Index explicitly from a list of integers, then display it
ind = pd.Index(
    [2, 3, 5, 6, 7]
)
ind

Int64Index([2, 3, 5, 6, 7], dtype='int64')

#### 看作不可变的数组以及有序集合

In [14]:
# An Index behaves like an immutable array: it supports slicing and indexing
ind[::2]
# ind[1] = 0  # would raise TypeError, because an Index cannot be modified in place
# An Index also behaves like an ordered set. pandas relies on set arithmetic
# (intersections, unions, ...) when combining datasets, and Index supports it.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([1, 2, 5, 7, 9])
# Use the explicit set methods rather than the &, | and ^ operators:
# in pandas 2.x those operators act element-wise on an Index instead of
# performing intersection / union / symmetric difference.
indA.intersection(indB), indA.union(indB), indA.symmetric_difference(indB)

(Int64Index([1, 5, 7, 9], dtype='int64'),
 Int64Index([1, 2, 3, 5, 7, 9], dtype='int64'),
 Int64Index([2, 3], dtype='int64'))