# Series

## 构造Series

### 用列表或者数组构造Series

In [1]:
import numpy as np
import pandas as pd

from pandas import Series

In [3]:
# Build a Series from a 1-D NumPy array; the default 0..n-1 integer index is used.
arr = np.asarray([1, 2, 3])
pd.Series(data=arr)

0    1
1    2
2    3
dtype: int32

In [4]:
# Build a Series from a plain Python list.
list_1 = list(range(1, 4))
pd.Series(data=list_1)

0    1
1    2
2    3
dtype: int64

区别：数组构造Series是引用，列表是创建副本

### 用字典构造Series（或手动指定index）

In [5]:
# Supply the index labels explicitly instead of relying on the default 0..n-1 index.
name_list = ['aa', 'bb', 'cc']
pd.Series(data=name_list, index=pd.Index(['k1', 'k2', 'k3']))


k1    aa
k2    bb
k3    cc
dtype: object

In [8]:
# Build a Series straight from a dict: the keys become the index labels and
# the values become the data.
dic = dict(zip(
    ['语文', '数学', '英语', '理综'],
    ['150', '150', '150', '300'],
))
pd.Series(data=dic)

语文    150
数学    150
英语    150
理综    300
dtype: object

## 访问Series

In [1]:
import numpy as np
import pandas as pd
from pandas import Series
arr = np.random.randint(0,101,5)
arr

array([14, 33, 22, 65, 18])

In [2]:
s =Series(data=arr , index=['lucy', 'tony', 'andy', 'tom', 'mario'])
s

lucy     14
tony     33
andy     22
tom      65
mario    18
dtype: int32

### 1.loc访问显式索引

In [3]:
s.loc['lucy']

14

In [4]:
list_1 = ['lucy','andy']
s.loc[list_1]

lucy    14
andy    22
dtype: int32

In [5]:
s.loc[['lucy','andy']]

lucy    14
andy    22
dtype: int32

### 2.iloc访问隐式索引


In [6]:
s.iloc[0]

14

In [7]:
s.iloc[[0,1]]

lucy    14
tony    33
dtype: int32

In [8]:
list_1 = [0,1]
s.iloc[list_1]

lucy    14
tony    33
dtype: int32

### 3.利用bool访问

In [9]:
s[s>70]

Series([], dtype: int32)

## Series的属性和方法

### 属性 shape size index values name

In [10]:
s =Series(data=arr , index=['lucy', 'tony', 'andy', 'tom', 'mario'])
s

lucy     14
tony     33
andy     22
tom      65
mario    18
dtype: int32

In [11]:
s.shape

(5,)

In [12]:
s.size

5

In [13]:
s.values

array([14, 33, 22, 65, 18])

In [14]:
s.index

Index(['lucy', 'tony', 'andy', 'tom', 'mario'], dtype='object')

In [15]:
s.name='score'
s

lucy     14
tony     33
andy     22
tom      65
mario    18
Name: score, dtype: int32

### 应用


In [16]:
# Membership test on the index labels: is there an entry labelled 'amy'?
# `in` on an Index checks the labels directly, so no element-wise comparison
# plus .any() reduction is needed.
'amy' in s.index

False

In [17]:
s2 = Series(data=np.random.randint(1,100,5), index=s.index)
s2

lucy     46
tony     98
andy     53
tom      88
mario    25
dtype: int32

### 方法head（）和tail（）

相当于切片

In [18]:
s=Series(data=np.random.randint(1,100,3),index=['andy','tony','mario'])
s

andy     83
tony     10
mario    71
dtype: int32

In [19]:
s.head(2)

andy    83
tony    10
dtype: int32

In [20]:
s.tail(2)

tony     10
mario    71
dtype: int32

### 方法 isnull（）和 notnull（）检测缺失数据

配合any（）和all（）使用

In [21]:
# Passing `index=` selects the dict entries by label; a label that is missing
# from the dict ("lucy") gets NaN, which is why the result is float64.
dic = dict(amy=1, tony=2, mario=3)
s = Series(dic, index=[*dic, "lucy"])
s

amy      1.0
tony     2.0
mario    3.0
lucy     NaN
dtype: float64

In [22]:
s.isnull()

amy      False
tony     False
mario    False
lucy      True
dtype: bool

In [23]:
s.isnull().any() # 至少有一个NaN

True

### 排序

值排序

In [25]:
# Five random integers in [1, 100) under a deliberately unsorted index, so the
# sorting calls that follow have something to reorder.
abcde = ('b', 'd', 'e', 'a', 'c')
s = Series(np.random.randint(1, 100, size=5), index=[*abcde])
s

b    60
d    58
e    46
a    38
c    13
dtype: int32

In [29]:
s.sort_values()

c    13
a    38
e    46
d    58
b    60
dtype: int32

根据索引排序

In [31]:
s.sort_index(ascending=False) #降序排序

e    46
d    58
c    13
b    60
a    38
dtype: int32

### 统计次数

In [34]:
# Count how many purchases each customer made (one entry per purchase).
user_id = Series(["tom"] * 3 + ["lucy"] * 2)
user_id.value_counts()

tom     3
lucy    2
Name: count, dtype: int64

In [44]:
# 100 purchase records, each attributed to a random customer id in 1..100.
# np.random.randint excludes the upper bound, so it has to be 101 for id 100
# to be drawn at all.
user_id = Series(data=np.random.randint(1, 101, 100))
# Counts are sorted in descending order; head(5) keeps the five customers
# with the most purchases.
user_id.value_counts(ascending=False).head(5)


4     5
55    4
44    4
13    3
22    3
Name: count, dtype: int64

## Series运算

In [1]:
import numpy as np
import pandas as pd
from pandas import Series 

### 1. Series 和 一个数运算（类似于 广播机制）

In [3]:
arr_1 = Series(data=np.random.randint(1,100,size=5),index=list("ABCDE"))
arr_1

A    48
B    84
C    69
D    80
E    15
dtype: int32

In [4]:
arr_1 +5

A    53
B    89
C    74
D    85
E    20
dtype: int32

### 2.Series 和np数组运算(隐式索引对齐）
不遵循广播机制  
**尽量不用**

In [7]:
arr_2 = np.ones(shape=5)
arr_2

array([1., 1., 1., 1., 1.])

In [8]:
arr_1 + arr_2 

A    49.0
B    85.0
C    70.0
D    81.0
E    16.0
dtype: float64

In [9]:
# arr_1 是一个Series
# arr_1.values 是一个np.array
arr_1.values

array([48, 84, 69, 80, 15])

arr_1.values 与 arr_2 运算遵循广播机制（np.array 与 np.array 的运算）

### Series和Series 相加（显式索引对齐）


In [10]:
arr_1 = Series(data=np.random.randint(1,10,5),index=list("ABCDE"))
arr_2 = Series(data=np.random.randint(1,10,5),index=list("BCDAE"))
display(arr_1,arr_2)

A    8
B    1
C    4
D    9
E    6
dtype: int32

B    9
C    2
D    4
A    1
E    9
dtype: int32

In [11]:
arr_1 + arr_2

A     9
B    10
C     6
D    13
E    15
dtype: int32

In [13]:
arr_1 = Series(data=np.random.randint(1,10,5),index=list("ABCDE"))
arr_2 = Series(data=np.random.randint(1,10,3),index=list("BAE"))
display(arr_1,arr_2)

A    2
B    7
C    8
D    8
E    5
dtype: int32

B    7
A    7
E    2
dtype: int32

In [15]:
arr_1 + arr_2 # 一个null与其他值做运算都为null

A     9.0
B    14.0
C     NaN
D     NaN
E     7.0
dtype: float64

**解决方法**：使用下面的运算方法，并通过 `fill_value` 参数为缺失的一方指定填充值  
相加：add()  
相减：sub()  
相乘：mul()  
相除：div()

In [17]:
import numpy as np
import pandas as pd
from pandas import Series 

In [21]:
index_1 = ["语文","数学","英语","理综"]
index_2 = ["语文","数学","英语","文综"]
score_1 = Series(data=np.random.randint(1,150,4),index=index_1)
score_2 = Series(data=np.random.randint(1,150,4),index=index_2)
display(score_1,score_2)

语文     74
数学     85
英语     12
理综    133
dtype: int32

语文    120
数学    104
英语     46
文综     96
dtype: int32

In [23]:
score_1.add(score_2)

数学    189.0
文综      NaN
理综      NaN
英语     58.0
语文    194.0
dtype: float64

In [25]:
score_1.add(score_2,fill_value=0)

数学    189.0
文综     96.0
理综    133.0
英语     58.0
语文    194.0
dtype: float64

Series 支持聚合操作


In [48]:
score_1.mean(), score_1.values.mean()

(76.0, 76.0)

### 例子 

In [35]:
import numpy as np
import pandas as pd
from pandas import Series

In [36]:
# Two score Series built on the same student index, so arithmetic between
# them lines up student by student.
index_1 = ['mary','jack','luicy','tony']
py_score = Series(data=np.random.randint(1,100,4),index=index_1,name='python score')
# NOTE(review): 'jave score' looks like a typo for 'java score'; it is left
# unchanged here because the saved output displays this exact name.
ja_score = Series(data=np.random.randint(1,100,4),index=index_1,name='jave score')
display(py_score,ja_score)

mary     22
jack     10
luicy    54
tony      8
Name: python score, dtype: int32

mary     77
jack     35
luicy    50
tony     48
Name: jave score, dtype: int32

In [37]:
# 求平均
(py_score+ja_score)/2

mary     49.5
jack     22.5
luicy    52.0
tony     28.0
dtype: float64

In [45]:
#　找出python未及格的学生的名字
py_score[py_score<60]

mary     22
jack     10
luicy    54
tony      8
Name: python score, dtype: int32

In [46]:
py_score[py_score<60].index

Index(['mary', 'jack', 'luicy', 'tony'], dtype='object')

# DataFrame 

行索引：index  
列索引：columns    
值：values

In [50]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

## DataFrame 的创建

### 1.用DataFrame函数

In [53]:
DataFrame(data=np.random.randint(1,10,size=(3,5)),index=list("abc"),columns=list("ABCDE"))

Unnamed: 0,A,B,C,D,E
a,8,1,7,4,3
b,7,3,2,9,7
c,5,7,5,7,2


### 2.用字典构造
这种方式更好，每一列的数据类型都可以不一样

In [58]:
# Column-oriented construction: every dict key is a column label and every
# value holds that column's data (three random integers in [1, 10) each).
dic = {col: np.random.randint(1, 10, 3) for col in "ABCDE"}
DataFrame(dic, index=[*"abc"])

Unnamed: 0,A,B,C,D,E
a,9,4,8,5,7
b,5,3,3,6,4
c,7,3,5,2,3


### 3.从文件中读取DataFrame对象

常用：           
pd.read_csv()     
pd.read_excel()    
pd.read_table()    

header：指定哪一行（或哪几行）作为列标签，默认是第一行（header=0）   
index_col：指定哪一列（或哪几列）作为行标签，默认为 None（不使用任何列，自动生成从0开始的整数索引）   
sheet_name：读取第几个表格(可以指定索引：从0开始；也可以指定表名称)，仅用于 pd.read_excel()       
 

In [72]:
# Raw string: the Windows path is full of backslashes, and sequences such as
# "\P", "\p" and "\e" are invalid escapes that Python warns about in a normal
# string literal. The resulting path text is unchanged.
filename = r'D:\Program Files\python exercise\数据库学习\example.xlsx'
# sheet_name=0 reads the first worksheet.
# NOTE(review): the saved output shows an extra "Unnamed: 0" column, which
# suggests the sheet stores an exported index; index_col=0 would turn it into
# the row index — confirm against the file.
pd.read_excel(filename, sheet_name=0)

Unnamed: 0.1,Unnamed: 0,经销商,发货地区,手机型号
0,0,dancer,beijing,iPhone
1,1,lucy,beijing,Android
2,2,tom,guangzhou,iPhone
3,3,petter,shenzhen,windowsPhone
4,4,mery,guangzhou,Android


### 4.将Series转为DataFrame
目的：两种类型支持的函数不一样

In [75]:
s = Series(data=np.random.randint(1,10,size=5),index=list("abcde"))
s

a    4
b    4
c    1
d    4
e    4
dtype: int32

In [77]:
df = DataFrame(data=s)
df

Unnamed: 0,0
a,4
b,4
c,1
d,4
e,4


In [78]:
s.shape, df.shape

((5,), (5, 1))

## DataFrame索引

In [2]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [3]:
df = DataFrame(data=np.random.randint(1,100,size=(3,5)),index=list("abc"),columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
b,95,10,55,97,90
c,60,93,71,92,74


### 访问

1.访问列

    （1）用类似字典方式访问

In [12]:
df["A"]

a    90
b    95
c    60
Name: A, dtype: int32

In [13]:
df[["A","B"]]

Unnamed: 0,A,B
a,90,12
b,95,10
c,60,93


    (2)属性访问

In [14]:
df.A

a    90
b    95
c    60
Name: A, dtype: int32

2.访问行

        （1）显式索引loc

In [15]:
df.loc["a"]

A    90
B    12
C    44
D    40
E    33
Name: a, dtype: int32

In [16]:
df.loc[["a","b"]]

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
b,95,10,55,97,90


    (2)隐式索引iloc

In [17]:
df.iloc[0]

A    90
B    12
C    44
D    40
E    33
Name: a, dtype: int32

In [18]:
df.iloc[[0,1]]

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
b,95,10,55,97,90


3.访问元素

    （1）直接访问（官方推荐）适合读写操作
        先行后列

In [34]:
df

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
b,95,10,55,97,90
c,60,93,71,92,74


In [35]:
# loc
df.loc["a","A"]

90

In [36]:
# iloc
df.iloc[0,0]

90

可以放入列表

In [58]:
df.loc[["a","b"],["A","B"]] 

Unnamed: 0,A,B
a,90,12
b,95,10


    （2）间接访问 适合只读操作
    

In [37]:
# Indirect (chained) access: select column "A" first, then pick one element.
a = df["A"]
# The column is labelled "a", "b", "c", so a positional lookup must go through
# .iloc; a bare integer key on a label-indexed Series is deprecated in pandas 2.x
# because integer keys are going to be treated as labels.
b = df["A"].iloc[0]
display(a, b)


a    90
b    95
c    60
Name: A, dtype: int32

90

In [38]:
a=df.loc["a"]
b=df.loc["a"]["A"]
display(a,b)

A    90
B    12
C    44
D    40
E    33
Name: a, dtype: int32

90

In [39]:
a=df.loc["a"].values
b=df.loc["a"].values[0]
display(a,b)

array([90, 12, 44, 40, 33])

90

## DataFrame切片

### 行切片

In [40]:
df

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
b,95,10,55,97,90
c,60,93,71,92,74


In [51]:
# Row slice by explicit (label) index: with .loc both end labels are included.
df.loc["a":"b"]

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
b,95,10,55,97,90


In [50]:
# Row slice by implicit (positional) index: with .iloc the stop position is excluded.
df.iloc[0:1]

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33


### 列切片

In [49]:
# Column slice by explicit (label) index: columns "A" through "C", end label included.
df.loc[:,"A":"C"]

Unnamed: 0,A,B,C
a,90,12,44
b,95,10,55
c,60,93,71


In [48]:
# Column slice by implicit (positional) index: positions 0 and 1, stop position excluded.
df.iloc[:,0:2]

Unnamed: 0,A,B
a,90,12
b,95,10
c,60,93


### 用bool列表访问


In [55]:
# 访问行
bool1=[True,False,True]
df.loc[bool1]

Unnamed: 0,A,B,C,D,E
a,90,12,44,40,33
c,60,93,71,92,74


In [56]:
# 访问列
bool_2=[True,False,True,False,True]
df.loc[:,bool_2]

Unnamed: 0,A,C,E
a,90,44,33
b,95,55,90
c,60,71,74


## DataFrame运算

### 1.聚合运算

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
df = DataFrame(data=np.random.randint(1,100,size=(3,5)), index=('lucy','mary','candy'),columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
lucy,56,33,36,29,24
mary,55,92,21,3,8
candy,80,10,66,7,23


In [3]:
# With the default axis (axis=0) the aggregation runs down the rows and
# yields one value per column.
a = df.sum()
b = df.mean() # can be read as the average score of each subject (column)
display(a,b) 

A    191
B    135
C    123
D     39
E     55
dtype: int64

A    63.666667
B    45.000000
C    41.000000
D    13.000000
E    18.333333
dtype: float64

In [4]:
# 通过修改axis的值可改变聚合方向
a = df.sum(axis=1)  # 可以理解为求每个人的总分
b = df.mean(axis=1)  # 可以理解为求每个人的平均分
display(a,b)

lucy     178
mary     179
candy    186
dtype: int64

lucy     35.6
mary     35.8
candy    37.2
dtype: float64

### 运算
1.DataFrame 和 一个数、numpy运算 （遵循广播机制）             
2.DataFrame 和 Series 运算 （显式索引对齐，也会进行广播，通过控制轴的方向进行运算）    
3.DataFrame 和 DataFrame 运算（显式索引对齐）

In [None]:
# 3x5 frame of random integers in [1, 100), rows "a".."c" and columns "A".."E".
df = DataFrame(
    np.random.randint(1, 100, size=(3, 5)),
    index=[*"abc"],
    columns=[*"ABCDE"],
)
df

Unnamed: 0,A,B,C,D,E
a,23,99,42,7,3
b,18,37,49,17,80
c,80,73,45,34,94


In [7]:
arr = np.random.randint(1,10,size=5)
arr

array([7, 1, 7, 5, 7])

In [22]:
s1 = Series(np.random.randint(1,10,size=3),index=list("abc"))
s2 = Series(np.random.randint(1,10,size=5),index=list("ABCDE"))
display(s1,s2)

a    8
b    7
c    8
dtype: int32

A    9
B    9
C    2
D    7
E    6
dtype: int32

#### 1.DataFrame 和一个数、numpy运算

In [9]:
df

Unnamed: 0,A,B,C,D,E
a,23,99,42,7,3
b,18,37,49,17,80
c,80,73,45,34,94


In [10]:
# 和一个数运算（广播）
df + 10

Unnamed: 0,A,B,C,D,E
a,33,109,52,17,13
b,28,47,59,27,90
c,90,83,55,44,104


In [15]:
display(df,arr)

Unnamed: 0,A,B,C,D,E
a,23,99,42,7,3
b,18,37,49,17,80
c,80,73,45,34,94


array([7, 1, 7, 5, 7])

In [16]:
# Add a 1-D array: its five values line up with the five columns and are
# added to every row (broadcast along the rows).
df + arr

Unnamed: 0,A,B,C,D,E
a,30,100,49,12,10
b,25,38,56,22,87
c,87,74,52,39,101


#### 2.DataFrame 和 Series运算

In [23]:
display(df,s1,s2)

Unnamed: 0,A,B,C,D,E
a,23,99,42,7,3
b,18,37,49,17,80
c,80,73,45,34,94


a    8
b    7
c    8
dtype: int32

A    9
B    9
C    2
D    7
E    6
dtype: int32

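DataFrame 和 Series 相加要注意轴的方向：默认是把 Series 的索引与 DataFrame 的列标签对齐，再沿着行广播；  
如果 Series 的索引对应的是行标签，需要使用 add() 并指定 axis=0，否则标签对不上，结果全部为 NaN。要注意广播机制以及索引对齐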

In [24]:
df + s1

Unnamed: 0,A,B,C,D,E,a,b,c
a,,,,,,,,
b,,,,,,,,
c,,,,,,,,


In [25]:
df.add(s1)

Unnamed: 0,A,B,C,D,E,a,b,c
a,,,,,,,,
b,,,,,,,,
c,,,,,,,,


In [27]:
df.add(s1,axis=0)

Unnamed: 0,A,B,C,D,E
a,31,107,50,15,11
b,25,44,56,24,87
c,88,81,53,42,102


In [29]:
display(df,s2)

Unnamed: 0,A,B,C,D,E
a,23,99,42,7,3
b,18,37,49,17,80
c,80,73,45,34,94


A    9
B    9
C    2
D    7
E    6
dtype: int32

In [28]:
df.add(s2)

Unnamed: 0,A,B,C,D,E
a,32,108,44,14,9
b,27,46,51,24,86
c,89,82,47,41,100


#### 3.DataFrame 和 DataFrame 相加

In [37]:

index_1 = list("abc")
columns_1 = list("ABCDE")
df_1 = DataFrame(data=np.random.randint(1,10,size=(3,5)),index=index_1,columns=columns_1)
df_2 = DataFrame(data=np.random.randint(1,10,size=(3,5)),index=index_1,columns=columns_1)
df_3 = DataFrame(data=np.random.randint(1,10,size=(3,4)),index=index_1,columns=list("ABCD"))
display(df_1,df_2,df_3)


Unnamed: 0,A,B,C,D,E
a,3,2,9,9,4
b,1,7,9,8,9
c,3,2,3,9,7


Unnamed: 0,A,B,C,D,E
a,1,8,8,1,7
b,4,3,4,9,1
c,2,1,9,3,6


Unnamed: 0,A,B,C,D
a,7,1,8,6
b,3,9,8,3
c,9,5,9,3


In [35]:
df_1.add(df_2)

Unnamed: 0,A,B,C,D,E
a,14,5,12,14,16
b,14,17,14,10,9
c,10,9,11,12,8


In [38]:
df_1.add(df_3)

Unnamed: 0,A,B,C,D,E
a,10,3,17,15,
b,4,16,17,11,
c,12,7,12,12,
