In [4]:
import seaborn as sns
import pandas as pd
sns.set()
tips_df = sns.load_dataset("tips")

In [5]:
tips_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


## pandas 对比 hive 操作
**此处的对比仅包括读操作 hive大部分情况下是只读的**

#### limit

In [6]:
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


#### select where 
```SQL
select 
    tip,
    sex
from tips_df
where smoker != 'No' and (size > 2 or sex = 'Female')
```

In [7]:
# 使用bool列加位运算符的方式 dataframe.<colname>的方式来选取列 
# size 列因为和内置函数size冲突所以使用了 dataframe[<colname>]的方式 
# 注意运算符的优先级 位运算优先级较低需要辅助括号
res_df = tips_df[(tips_df.smoker != 'No') & ((tips_df["size"] > 2) | (tips_df.sex == 'Female'))][["tip", "sex"]]
res_df.head(3)

Unnamed: 0,tip,sex
56,3.0,Male
63,3.76,Male
67,1.0,Female


In [8]:
# 使用query方式 表达式类比于hive 但是要用== 不应该用=  字符串的常量数据要加单引号
res_df = tips_df.query("smoker != 'No' and (size > 2 or sex == 'Female')")[["tip", "sex"]]
res_df.head(3)

Unnamed: 0,tip,sex
56,3.0,Male
63,3.76,Male
67,1.0,Female


#### sum count avg distinct
```SQL
select
    count(sex) as sex_cnt,
    count(day) as day_cnt,
    count(distinct sex) as sex_distinct_cnt,
    sum(tip) as tip_sum,
    sum(size) as size_sum
from tips_df
```
对于pandas 不同的聚合操作得分别计算 不能用一个表达式计算   
相同的地方在于 pandas 一个dataFrame聚合后的结果是一行 Series hive也是一行

In [9]:
# sum
sumed = tips_df[["tip", "size"]].sum()
sumed

tip     731.58
size    627.00
dtype: float64

In [10]:
# count
counted = tips_df[["sex", "day"]].count()
counted

sex    244
day    244
dtype: int64

In [11]:
# count distinct 
# series unique 相当于distinct
# dataFrame nunique 相当于count distinct
counted = tips_df[["sex", "smoker"]].nunique()
counted

sex       2
smoker    2
dtype: int64

#### group by
```SQL
select
    sex,
    smoker,
    sum(tip) as tip_sum,
    sum(size) as size_sum,
    count(day) as day_count
from tips_df
group by sex, smoker
```

In [12]:
# group by ["sex", "smoker"] -> sum(tip) sum(size)
# group by 后是一个grouped dataFrame对象 以group的列组成笛卡尔积元组 对groupedDataFrame进行列聚合操作 结果又变为DataFrame 行索引变为多级索引
res = tips_df.groupby(["sex", "smoker"])[["tip", "size"]].sum()
print(type(res))
res

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,183.07,150
Male,No,302.0,263
Female,Yes,96.74,74
Female,No,149.77,140


#### order by
```SQL
select
    sex,
    smoker, 
    tips,
    size
from tips_df
order by tips, size desc
```
order by 中的列要在select 中出现 order by 是最后执行的语句

In [13]:
res = tips_df[["sex", "smoker", "tip", "size"]].sort_values(by=["tip", "size"], ascending=[True, False])
res.head(3)

Unnamed: 0,sex,smoker,tip,size
92,Female,Yes,1.0,2
236,Male,Yes,1.0,2
67,Female,Yes,1.0,1


### join union 测试

In [14]:
from sklearn.utils import shuffle
# 构建左右数据集
left_df = shuffle(tips_df[["total_bill", "tip", "sex"]])
right_df = shuffle(tips_df[["smoker", "day", "time", "size"]])
left_df["left_id"] = left_df.index
right_df["right_id"] = right_df.index
left_df.reset_index(inplace=True)
right_df.reset_index(inplace=True)


shuffled_df = shuffle(tips_df)
shuffled_df["id"] = shuffled_df.index
up_df = shuffled_df.query("sex=='Male'")
down_df = shuffled_df.query("sex!='Male'")
up_df.reset_index(inplace=True)
down_df.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [15]:
up_df.head(3)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size,id
0,236,12.6,1.0,Male,Yes,Sat,Dinner,2,236
1,97,12.03,1.5,Male,Yes,Fri,Dinner,2,97
2,80,19.44,3.0,Male,Yes,Thur,Lunch,2,80


In [16]:
down_df.head(3)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size,id
0,205,16.47,3.23,Female,Yes,Thur,Lunch,3,205
1,131,20.27,2.83,Female,No,Thur,Lunch,2,131
2,94,22.75,3.25,Female,No,Fri,Dinner,2,94


In [17]:
# row 顺序已经使用shuffle打乱
left_df.head(3)

Unnamed: 0,index,total_bill,tip,sex,left_id
0,97,12.03,1.5,Male,97
1,4,24.59,3.61,Female,4
2,7,26.88,3.12,Male,7


In [18]:
# row 顺序已经使用shuffle打乱
right_df.head(3)

Unnamed: 0,index,smoker,day,time,size,right_id
0,230,Yes,Sat,Dinner,4,230
1,42,No,Sun,Dinner,2,42
2,186,Yes,Sun,Dinner,3,186


#### join
```SQL
select
    left_df.left_id as id,
    total_bill,
    smoker
from left_df join right_df on left_df.left_id = right_df.right_id
where size = 2
```

In [19]:
# 使用pd.merge 或者 DataFrame.merge 来完成join  如果 公共列的列名在两边都存在 直接用关键字参数on就行 
df_merged = left_df.merge(right_df, how='inner', left_on = "left_id", right_on = "right_id").query("size == 2")
df_merged.head(3)

Unnamed: 0,index_x,total_bill,tip,sex,left_id,index_y,smoker,day,time,size,right_id
0,97,12.03,1.5,Male,97,97,Yes,Fri,Dinner,2,97
3,0,16.99,1.01,Female,0,0,No,Sun,Dinner,2,0
4,20,17.92,4.08,Male,20,20,No,Sat,Dinner,2,20


In [20]:
# concat 是直接把两个df做合并 concate 只会使用index去join 所以 
# 可以看到left_id 和right_id 并没有按照预期相等 
# 所以最好不要用concat完成join的功能 更适合做union
df_concated = pd.concat([left_df, right_df], join='inner', axis='columns')
df_concated.head(3)

Unnamed: 0,index,total_bill,tip,sex,left_id,index.1,smoker,day,time,size,right_id
0,97,12.03,1.5,Male,97,230,Yes,Sat,Dinner,4,230
1,4,24.59,3.61,Female,4,42,No,Sun,Dinner,2,42
2,7,26.88,3.12,Male,7,186,Yes,Sun,Dinner,3,186


#### union
union all 不去重 union去重
```SQL
select
    *
from (
    select
        *
    from up_df
    union all
    select 
        *
    from down_df
) t
```

In [21]:
# concat 是直接把两个df做合并 concate 只会使用index去join 所以 
# 可以看到left_id 和right_id 并没有按照预期相等 
# 所以最好不要用concat完成join的功能 更适合做union
df_unioned = pd.concat([up_df, down_df], join='inner', axis='index')
df_unioned.query("day == 'Sun' and time == 'Dinner' and size >= 3 and tip >= 5.00") 
# 可以看到结果中包含了 male 和female 说明union 成功 

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size,id
10,156,48.17,5.0,Male,No,Sun,Dinner,6,156
13,47,32.4,6.0,Male,No,Sun,Dinner,4,47
37,116,29.93,5.07,Male,No,Sun,Dinner,4,116
47,185,20.69,5.0,Male,No,Sun,Dinner,5,185
78,44,30.4,5.6,Male,No,Sun,Dinner,4,44
145,183,23.17,6.5,Male,Yes,Sun,Dinner,4,183
73,11,35.26,5.0,Female,No,Sun,Dinner,4,11
81,52,34.81,5.2,Female,No,Sun,Dinner,4,52
84,155,29.85,5.14,Female,No,Sun,Dinner,5,155


#### udf
- 此处的udf指按行操作 即每一行产生一个结果 此处以 size * total_bill + 1 为例

```SQL
select
    size * total_bill + 1  as res_col1
    size + total_bill + 2  as res_col2
from tips_df
```

In [22]:
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [23]:
# udf 演示
df_applied = tips_df.apply(lambda r: pd.Series({"res_col1": r["size"] * r["total_bill"] + 1, "res_col2": r["size"] + r["total_bill"] + 2}), axis=1)
df_applied.head(2)

Unnamed: 0,res_col1,res_col2
0,34.98,20.99
1,32.02,15.34


#### udaf
```SQL
select
    avg(tip) as tip_avg,
    avg(size) as size_avg
from tips_df
```

- 此处的udaf指对列进行操作 每列产生一个结果 此处以 sum(size) + 1 为例

In [24]:
# udaf 演示 对每列计算平均数
# 对列使用apply
import numpy as np
df_applied = tips_df[["tip", "size"]].apply(lambda r: sum(r) / len(r), axis=0)
df_applied.head(2)

tip     2.998279
size    2.569672
dtype: float64

In [25]:
# 使用agg
tips_df[["tip", "size"]].agg(np.mean)

tip     2.998279
size    2.569672
dtype: float64

### 透视表
- 本质上就是一个sql的方便版groupby
1. index 使用index首先groupby 该index会作为结果的index
1. 其次使用columns 再进行gropby 该column会作title的分割标准
1. values 是要使用 aggfunc 聚合统计的值
1. aggfunc 是需要的聚合函数 除了单个聚合函数对象外还可以是 list 或者dict 可以实现对不同的value 使用不同的聚合函数

In [26]:
tips_df = sns.load_dataset("tips")
tips_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [27]:
tips_df.pivot_table(index=["day", "time"], values=["size", "total_bill", "tip"], columns=["sex"], aggfunc=np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip,tip,total_bill,total_bill
Unnamed: 0_level_1,sex,Male,Female,Male,Female,Male,Female
day,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Thur,Lunch,73.0,77.0,89.41,79.42,561.44,516.11
Thur,Dinner,,2.0,,3.0,,18.78
Fri,Lunch,5.0,9.0,5.7,10.98,34.16,55.76
Fri,Dinner,16.0,10.0,21.23,14.05,164.41,71.55
Sat,Dinner,156.0,63.0,181.95,78.45,1227.35,551.05
Sun,Dinner,163.0,53.0,186.78,60.61,1269.46,357.7


In [28]:
# 使用group by 实现 透视表功能 就是看着比较丑
tips_df.groupby(["day", "time"])[["size", "total_bill", "tip"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,size,total_bill,tip
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Lunch,150.0,1077.55,168.83
Thur,Dinner,2.0,18.78,3.0
Fri,Lunch,14.0,89.92,16.68
Fri,Dinner,26.0,235.96,35.28
Sat,Lunch,,,
Sat,Dinner,219.0,1778.4,260.4
Sun,Lunch,,,
Sun,Dinner,216.0,1627.16,247.39


#### 总结
- SQL 和 pandas 各有优劣 pandas 面向单机应用 所以每行每列相对之间是有顺序的 可以看做是内存中行和列都有有意义名字的矩阵
- hive SQL 面向分布式大数据， 行和列之间没有绝对的顺序
- pandas 适合更灵巧的操作 hive 更适合于求和求平均等一般性的操作
- pandas 可以原地修改数据 hive 不能 只能生成新的数据