## 基本操作

In [1]:
import pandas as pd

In [2]:
# Show the installed pandas version
pd.__version__

'0.23.4'

In [3]:
# Build a Series from a plain Python list
arr = list(range(5))
# No index is passed, so pandas assigns the default integer index starting at 0
df = pd.Series(data=arr)
df

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [4]:
# Build a Series from a dict: the keys become the index labels
d = dict(zip('abcde', range(1, 6)))
df1 = pd.Series(data=d)
df1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
# Build a DataFrame from a NumPy array
import numpy as np
# Six consecutive daily timestamps starting from the current time, used as the row index
dates = pd.date_range('today', periods=6)
# Draw the 6x4 standard-normal values from a locally seeded generator so the
# numbers are identical on every run without touching NumPy's global random state
num_arr = np.random.RandomState(42).randn(6, 4)
# Column labels
columns = ['A', 'B', 'C', 'D']
df2 = pd.DataFrame(num_arr, index=dates, columns=columns)
df2

Unnamed: 0,A,B,C,D
2020-09-05 21:58:48.434900,0.1925,-0.631671,-0.73004,-0.445153
2020-09-06 21:58:48.434900,-1.08261,0.475459,0.425029,2.201877
2020-09-07 21:58:48.434900,-0.978402,0.971373,1.622302,-0.741734
2020-09-08 21:58:48.434900,0.082219,-0.083596,1.427181,1.725164
2020-09-09 21:58:48.434900,-1.878338,0.591857,-0.196788,0.388725
2020-09-10 21:58:48.434900,1.184844,0.533369,0.620893,-1.806449


In [6]:
# Create a DataFrame from a CSV file that uses ';' as the separator and gbk encoding.
# NOTE(review): left commented out, presumably because '1.csv' is not shipped with the notebook — confirm.
# df3 = pd.read_csv('1.csv',encoding='gbk',sep=';')

In [7]:
# Build a DataFrame from the dict `data`, using `labels` as the row index.
# Row labels, one per record
labels = list('abcdefghij')
# Column name -> column values; np.nan marks a missing age
data = {
    'animal': [
        'cat', 'cat', 'snake', 'dog', 'dog',
        'cat', 'snake', 'cat', 'dog', 'dog',
    ],
    'age': [
        2.5, 3, 0.5, np.nan, 5,
        2, 4.5, np.nan, 7, 3,
    ],
    'visits': [
        1, 3, 2, 3, 2,
        3, 1, 1, 2, 1,
    ],
    'priority': [
        'yes', 'yes', 'no', 'yes', 'no',
        'no', 'no', 'yes', 'no', 'no',
    ],
}
df = pd.DataFrame(data=data, index=labels)
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [8]:
# Show basic information about the DataFrame: number of rows, column names,
# the non-null count of each column, and the dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
animal      10 non-null object
age         8 non-null float64
visits      10 non-null int64
priority    10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [10]:
# Show the first 3 rows of df (a plain [:3] slice on a DataFrame slices rows)
df[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [11]:
# First 3 rows, selected by integer position
df.iloc[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [12]:
# First 3 rows via head()
df.head(3)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [13]:
# Select the animal and age columns of df
df[['animal','age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [14]:
# Select the animal and age columns for the rows at positions 3, 4 and 8.
# df.index[[3,4,8]] turns the positions into row labels so they can be used with .loc
df.loc[df.index[[3,4,8]], ['animal','age']]

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


In [15]:
# Select the rows where age is greater than 3
df[df['age']>3]

Unnamed: 0,animal,age,visits,priority
e,dog,5.0,2,no
g,snake,4.5,1,no
i,dog,7.0,2,no


In [16]:
# Select the rows where age is missing
df[df['age'].isnull()]

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [17]:
# Rows whose age is strictly greater than 2 and strictly less than 4
df[(df['age'] > 2) & (df['age'] < 4)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no


In [18]:
# Set the age in row 'f' to 1.5, then read the value back to confirm the change
df.loc['f','age']= 1.5
df.loc['f','age']

1.5

In [19]:
# Select the rows whose age is between 2 and 4.
# Note: Series.between includes both endpoints by default, so an age of exactly
# 2 or 4 would also be selected.
df[df['age'].between(2,4)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no


In [20]:
# Sum of the visits column
df['visits'].sum()

19

In [21]:
# Mean age for each animal type
df.groupby('animal')['age'].mean()

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

In [24]:
# Mean, minimum, maximum and standard deviation of age for each animal type
df.groupby('animal')['age'].agg(['mean', 'min', 'max', 'std'])

Unnamed: 0_level_0,mean,min,max,std
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cat,2.333333,1.5,3.0,0.763763
dog,5.0,3.0,7.0,2.0
snake,2.5,0.5,4.5,2.828427


In [25]:
# Count how many rows there are for each animal type
df['animal'].value_counts()

cat      4
dog      4
snake    2
Name: animal, dtype: int64

In [26]:
# Sort by age in descending order, breaking ties by visits in ascending order
df.sort_values(by=['age','visits'],ascending=[False,True])

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,1.5,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


In [27]:
# Replace 'yes'/'no' in the priority column with True/False.
# Caution: Series.map turns every value that is not a key of the dict into NaN,
# so running this cell again after the column already holds booleans would
# replace the whole column with NaN.
df['priority'] = df['priority'].map({'yes':True,'no':False})
df['priority']

a     True
b     True
c    False
d     True
e    False
f    False
g    False
h     True
i    False
j    False
Name: priority, dtype: bool

In [28]:
# Show df after the priority column was converted
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,snake,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,1.5,3,False
g,snake,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [29]:
# Replace 'snake' with 'python' in the animal column
df['animal'] = df['animal'].replace('snake','python')
df['animal']

a       cat
b       cat
c    python
d       dog
e       dog
f       cat
g    python
h       cat
i       dog
j       dog
Name: animal, dtype: object

In [30]:
# Show df after the animal values were replaced
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,1.5,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [33]:
# Task: for each animal type and each distinct number of visits, compute the mean age.
# This cell only inspects the column dtypes; the pivot table itself is built in a later cell.
df.dtypes

animal       object
age         float64
visits        int64
priority       bool
dtype: object

In [36]:
# Work on an independent copy of df. A plain `df_cp = df` only binds a second
# name to the same DataFrame object, so changes made through df_cp (such as
# adding a row) would also appear in df.
df_cp = df.copy()
# Ensure age is float before aggregating (no effect when the column is already float64)
df_cp['age'] = df_cp['age'].astype(float)

In [37]:
# Check the column dtypes of df_cp
df_cp.dtypes

animal       object
age         float64
visits        int64
priority       bool
dtype: object

In [38]:
# For each animal type (rows) and each distinct number of visits (columns),
# compute the mean age
df_cp.pivot_table(index='animal',columns='visits',values='age',aggfunc='mean')

visits,1,2,3
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,2.5,,2.25
dog,3.0,6.0,
python,4.5,0.5,


In [39]:
# Insert a new row labelled 'k' (it is deleted again in a later cell).
# The priority column was converted from 'yes'/'no' to booleans earlier, so the
# new row uses False instead of the string 'no' to avoid mixing strings with
# booleans in that column.
df_cp.loc['k'] = ['dog', 5.5, 2, False]

In [40]:
# Display df_cp to inspect it after the row insertion
df_cp

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,1.5,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [41]:
# Delete row 'k' again; the result of drop is assigned back to df_cp
df_cp = df_cp.drop('k')

In [42]:
# Display df_cp after row 'k' was removed
df_cp

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,1.5,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


### 进阶操作