# DataFrame基础知识

In [1]:
import pandas as pd

## 创建空白DataFrame

In [2]:
# Create an empty DataFrame (no rows) whose four columns are named a, b, c and d
empty_df = pd.DataFrame(columns=list('abcd'))
empty_df

Unnamed: 0,a,b,c,d


## 常见函数

### df.shape

In [3]:
# Query the (number of rows, number of columns) tuple
empty_df.shape

(0, 4)

In [5]:
# Dictionary sample data: every key maps to a list of ten values
rowdata = {
    '颜色深度': [14.13, 13.2, 13.16, 14.27, 13.24, 12.07, 12.43, 11.79, 12.37, 12.04],
    '酒精浓度': [5.64, 4.28, 5.68, 4.8, 4.22, 2.76, 3.94, 3.1, 2.12, 2.6],
    '品种': [0] * 5 + [1] * 5,
}

# Nested-list sample data: three inner lists of two values each
listdata = [
    [1, 2],
    [3, 4],
    [5, 6],
]

In [6]:
# Convert the dictionary into a DataFrame: each dictionary key becomes a column
wine_data = pd.DataFrame(rowdata)

In [7]:
# Convert the nested list into a DataFrame: each inner list becomes a row,
# and the columns receive default integer labels
new_list_data = pd.DataFrame(listdata)

In [8]:
# Display the DataFrame that was built from the dictionary
wine_data

Unnamed: 0,颜色深度,酒精浓度,品种
0,14.13,5.64,0
1,13.2,4.28,0
2,13.16,5.68,0
3,14.27,4.8,0
4,13.24,4.22,0
5,12.07,2.76,1
6,12.43,3.94,1
7,11.79,3.1,1
8,12.37,2.12,1
9,12.04,2.6,1


In [9]:
# Display the DataFrame that was built from the nested list
new_list_data

Unnamed: 0,0,1
0,1,2
1,3,4
2,5,6


In [16]:
# Group the rows by the values of one column and count the entries in each group.
# NOTE(review): '酒精浓度' holds ten distinct float values, so every group contains exactly
# one row and all counts are 1; grouping by the categorical column '品种' would presumably
# give a more informative summary - confirm the intent.
wine_data.groupby('酒精浓度').count()

Unnamed: 0_level_0,颜色深度,品种
酒精浓度,Unnamed: 1_level_1,Unnamed: 2_level_1
2.12,1,1
2.6,1,1
2.76,1,1
3.1,1,1
3.94,1,1
4.22,1,1
4.28,1,1
4.8,1,1
5.64,1,1
5.68,1,1


### pd.merge( ) 

In [22]:
# Two small frames that share the column name 'a' but have no values of 'a' in common
a = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
b = pd.DataFrame({'a': [11, 22, 33], 'c': [22, 33, 44]})

# With default arguments the merge is an inner join on the shared column(s),
# so with no matching keys the result has the columns a, b, c and zero rows
c = a.merge(b)
c

Unnamed: 0,a,b,c


In [23]:
# Outer join on column 'a': keep the keys from both frames and leave the
# columns that have no counterpart in the other frame as NaN
c = a.merge(b, on='a', how='outer')
c

Unnamed: 0,a,b,c
0,1,2.0,
1,2,3.0,
2,3,4.0,
3,11,,22.0
4,22,,33.0
5,33,,44.0


### pd.concat( )

In [24]:
# pd.concat expects its first argument to be an iterable (for example a list) of pandas
# objects. Passing the DataFrames as separate positional arguments, as below, is the
# deliberate mistake being demonstrated.
# The TypeError is caught and printed so the message is still shown while the notebook
# keeps running from top to bottom.
try:
    pd.concat(a, b)
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [25]:
# Horizontal concatenation (axis=1): the columns of b are placed to the right of a's columns
pd.concat([a,b],axis=1)

Unnamed: 0,a,b,a.1,c
0,1,2,11,22
1,2,3,22,33
2,3,4,33,44


In [26]:
# Vertical concatenation (axis=0): the rows of b are stacked under a's rows;
# a column that exists in only one frame is filled with NaN for the other frame's rows
pd.concat([a,b],axis=0)

Unnamed: 0,a,b,c
0,1,2.0,
1,2,3.0,
2,3,4.0,
0,11,,22.0
1,22,,33.0
2,33,,44.0


In [27]:
# join='inner' keeps only the columns present in both frames (here just 'a');
# the rows are still stacked vertically and keep their original index labels
pd.concat([a,b],join='inner')

Unnamed: 0,a
0,1
1,2
2,3
0,11
1,22
2,33


In [28]:
# Renumber the rows of the concatenated table.
# ignore_index=True discards the original row labels (0-2 from each input frame) and
# assigns a fresh 0..n-1 index, so the total row count no longer has to be hard-coded.
d = pd.concat([a, b], ignore_index=True)
d

Unnamed: 0,a,b,c
0,1,2.0,
1,2,3.0,
2,3,4.0,
3,11,,22.0
4,22,,33.0
5,33,,44.0


### pd.Series( )

In [29]:
# An assignment statement can attach a single column of data to a DataFrame
e = pd.Series(list('abc'))
a['c'] = e  # adds (or overwrites) column 'c' of `a` in place
a

Unnamed: 0,a,b,c
0,1,2,a
1,2,3,b
2,3,4,c


In [30]:
# `f` holds six values while `a` has only three rows: column assignment aligns on the
# index, so only the values at labels 0-2 are kept and the rest are dropped without an error
f = pd.Series(list('123456'))
a['d'] = f
a

Unnamed: 0,a,b,c,d
0,1,2,a,1
1,2,3,b,2
2,3,4,c,3


### DataFrame.join( )

In [31]:
# DataFrame.join refuses to combine frames whose column names overlap (here 'a' and 'c')
# unless lsuffix/rsuffix are supplied to disambiguate them.
# The ValueError is the point of this demonstration, so it is caught and printed
# rather than left to abort a top-to-bottom run of the notebook.
try:
    a.join(b)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: columns overlap but no suffix specified: Index(['a', 'c'], dtype='object')

## 遍历DataFrame

In [None]:
# Quick reference: three ways to iterate over a DataFrame `df`.
# Kept as comments because `df` is only created in a later cell; runnable examples follow.
#
# df.iterrows()
#   Row-wise iteration; yields (index, Series) pairs.
#   A field is accessed with row[column_name].
#
# df.itertuples()
#   Row-wise iteration; yields one namedtuple per row.
#   A field is accessed with row.column_name or getattr(row, column_name).
#   More efficient than iterrows().
#
# df.items()
#   Column-wise iteration; yields (column_name, Series) pairs.
#   A value is accessed with column[row_label].
#   Older pandas versions also offered this as df.iteritems(); that alias was removed in pandas 2.0.

In [35]:
# Three records, each a dict mapping a column name to a value; one dict becomes one row
inp = [
    {'c1': 10, 'c2': 100},
    {'c1': 11, 'c2': 110},
    {'c1': 12, 'c2': 120},
]
df = pd.DataFrame(inp)
df

Unnamed: 0,c1,c2
0,10,100
1,11,110
2,12,120


In [36]:
# Walk the frame row by row; each step yields the row label and the row as a Series
for label, record in df.iterrows():
    c1_value = record['c1']
    c2_value = record['c2']
    print(label, c1_value, c2_value)

0 10 100
1 11 110
2 12 120


In [37]:
# Walk the frame row by row as namedtuples; columns are read as attributes
for record in df.itertuples():
    print(record.c1, record.c2)

10 100
11 110
12 120


In [38]:
# Walk the frame column by column; each step yields the column name and the column as a Series.
# DataFrame.items() is used because DataFrame.iteritems() was removed in pandas 2.0.
for column_name, column in df.items():
    # Unpacking the Series prints every value, whatever the number of rows
    print(column_name, *column)

c1 10 11 12
c2 100 110 120


## DataFrame的索引

In [39]:
# Display the current contents of `a`, the frame used in the indexing examples that follow
a

Unnamed: 0,a,b,c,d
0,1,2,a,1
1,2,3,b,2
2,3,4,c,3


In [40]:
# Label-based selection with .loc: the row labelled 1, columns 'b' and 'c'
a.loc[1,['b','c']]

b    3
c    b
Name: 1, dtype: object

In [41]:
# Position-based selection with .iloc: rows at positions 0-1 (the slice end is exclusive)
# and the column at position 1
a.iloc[0:2,1]

0    2
1    3
Name: b, dtype: int64

In [42]:
# Rows at positions 0-2 and the column at position 1
a.iloc[0:3,1]

0    2
1    3
2    4
Name: b, dtype: int64

In [44]:
# Rows at positions 0-2 and columns at positions 0-3, which here is the whole frame
a.iloc[0:3,0:4]

Unnamed: 0,a,b,c,d
0,1,2,a,1
1,2,3,b,2
2,3,4,c,3
