In [1]:
import seaborn as sns
import pandas as pd
sns.set()
tips_df = sns.load_dataset("tips")

In [2]:
tips_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


## pandas 对比 hive 操作
**此处的对比仅包括读操作 hive大部分情况下是只读的**

#### limit

In [3]:
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


#### select where 
```SQL
select 
    tip,
    sex
from tips_df
where smoker != 'No' and (size > 2 or sex = 'Female')
```

In [4]:
# 使用bool列加位运算符的方式 dataframe.<colname>的方式来选取列 
# size 列因为和内置函数size冲突所以使用了 dataframe[<colname>]的方式 
# 注意运算符的优先级 位运算优先级较低需要辅助括号
res_df = tips_df[(tips_df.smoker != 'No') & ((tips_df["size"] > 2) | (tips_df.sex == 'Female'))][["tip", "sex"]]
res_df.head(3)

Unnamed: 0,tip,sex
56,3.0,Male
63,3.76,Male
67,1.0,Female


In [5]:
# 使用query方式 表达式类比于hive 但是要用== 不应该用=  字符串的常量数据要加单引号
res_df = tips_df.query("smoker != 'No' and (size > 2 or sex == 'Female')")[["tip", "sex"]]
res_df.head(3)

Unnamed: 0,tip,sex
56,3.0,Male
63,3.76,Male
67,1.0,Female


#### sum count avg distinct
```SQL
select
    count(sex) as sex_cnt,
    count(day) as day_cnt,
    count(distinct sex) as sex_distinct_cnt,
    sum(tip) as tip_sum,
    sum(size) as size_sum
from tips_df
```
对于pandas 不同的聚合操作得分别计算 不能用一个表达式计算   
相同的地方在于 pandas 一个dataFrame聚合后的结果是一行 Series hive也是一行

In [6]:
# sum
sumed = tips_df[["tip", "size"]].sum()
sumed

tip     731.58
size    627.00
dtype: float64

In [7]:
# count
counted = tips_df[["sex", "day"]].count()
counted

sex    244
day    244
dtype: int64

In [8]:
# count distinct 
# series unique 相当于distinct
# dataFrame nunique 相当于count distinct
counted = tips_df[["sex", "smoker"]].nunique()
counted

sex       2
smoker    2
dtype: int64

#### group by
```SQL
select
    sex,
    smoker,
    sum(tip) as tip_sum,
    sum(size) as size_sum,
    count(day) as day_count
from tips_df
group by sex, smoker
```

In [9]:
# group by ["sex", "smoker"] -> sum(tip) sum(size)
# group by 后是一个grouped dataFrame对象 以group的列组成笛卡尔积元组 对groupedDataFrame进行列聚合操作 结果又变为DataFrame 行索引变为多级索引
res = tips_df.groupby(["sex", "smoker"])[["tip", "size"]].sum()
print(type(res))
res

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,183.07,150
Male,No,302.0,263
Female,Yes,96.74,74
Female,No,149.77,140


#### order by
```SQL
select
    sex,
    smoker, 
    tips,
    size
from tips_df
order by tips, size desc
```
order by 中的列要在select 中出现 order by 是最后执行的语句

In [10]:
res = tips_df[["sex", "smoker", "tip", "size"]].sort_values(by=["tip", "size"], ascending=[True, False])
res.head(3)

Unnamed: 0,sex,smoker,tip,size
92,Female,Yes,1.0,2
236,Male,Yes,1.0,2
67,Female,Yes,1.0,1


### join union 测试

In [11]:
from sklearn.utils import shuffle
# 构建左右数据集
left_df = shuffle(tips_df[["total_bill", "tip", "sex"]])
right_df = shuffle(tips_df[["smoker", "day", "time", "size"]])
left_df["left_id"] = left_df.index
right_df["right_id"] = right_df.index
left_df.reset_index(inplace=True)
right_df.reset_index(inplace=True)


shuffled_df = shuffle(tips_df)
shuffled_df["id"] = shuffled_df.index
up_df = shuffled_df.query("sex=='Male'")
down_df = shuffled_df.query("sex!='Male'")
up_df.reset_index(inplace=True)
down_df.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [12]:
up_df.head(3)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size,id
0,232,11.61,3.39,Male,No,Sat,Dinner,2,232
1,176,17.89,2.0,Male,Yes,Sun,Dinner,2,176
2,34,17.78,3.27,Male,No,Sat,Dinner,2,34


In [13]:
down_df.head(3)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size,id
0,4,24.59,3.61,Female,No,Sun,Dinner,4,4
1,74,14.73,2.2,Female,No,Sat,Dinner,2,74
2,21,20.29,2.75,Female,No,Sat,Dinner,2,21


In [14]:
# row 顺序已经使用shuffle打乱
left_df.head(3)

Unnamed: 0,index,total_bill,tip,sex,left_id
0,105,15.36,1.64,Male,105
1,156,48.17,5.0,Male,156
2,226,10.09,2.0,Female,226


In [15]:
# row 顺序已经使用shuffle打乱
right_df.head(3)

Unnamed: 0,index,smoker,day,time,size,right_id
0,63,Yes,Sat,Dinner,4,63
1,30,No,Sat,Dinner,2,30
2,180,Yes,Sun,Dinner,4,180


#### join
```SQL
select
    left_df.left_id as id,
    total_bill,
    smoker
from left_df join right_df on left_df.left_id = right_df.right_id
where size = 2
```

In [16]:
# 使用pd.merge 或者 DataFrame.merge 来完成join  如果 公共列的列名在两边都存在 直接用关键字参数on就行 
df_merged = left_df.merge(right_df, how='inner', left_on = "left_id", right_on = "right_id").query("size == 2")
df_merged.head(3)

Unnamed: 0,index_x,total_bill,tip,sex,left_id,index_y,smoker,day,time,size,right_id
0,105,15.36,1.64,Male,105,105,Yes,Sat,Dinner,2,105
2,226,10.09,2.0,Female,226,226,Yes,Fri,Lunch,2,226
3,235,10.07,1.25,Male,235,235,No,Sat,Dinner,2,235


In [17]:
# concat 是直接把两个df做合并 concate 只会使用index去join 所以 
# 可以看到left_id 和right_id 并没有按照预期相等 
# 所以最好不要用concat完成join的功能 更适合做union
df_concated = pd.concat([left_df, right_df], join='inner', axis='columns')
df_concated.head(3)

Unnamed: 0,index,total_bill,tip,sex,left_id,index.1,smoker,day,time,size,right_id
0,105,15.36,1.64,Male,105,63,Yes,Sat,Dinner,4,63
1,156,48.17,5.0,Male,156,30,No,Sat,Dinner,2,30
2,226,10.09,2.0,Female,226,180,Yes,Sun,Dinner,4,180


#### union
union all 不去重 union去重
```SQL
select
    *
from (
    select
        *
    from up_df
    union all
    select 
        *
    from down_df
) t
```

In [18]:
# concat 是直接把两个df做合并 concate 只会使用index去join 所以 
# 可以看到left_id 和right_id 并没有按照预期相等 
# 所以最好不要用concat完成join的功能 更适合做union
df_unioned = pd.concat([up_df, down_df], join='inner', axis='index')
df_unioned.query("day == 'Sun' and time == 'Dinner' and size >= 3 and tip >= 5.00") 
# 可以看到结果中包含了 male 和female 说明union 成功 

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size,id
10,185,20.69,5.0,Male,No,Sun,Dinner,5,185
44,116,29.93,5.07,Male,No,Sun,Dinner,4,116
51,47,32.4,6.0,Male,No,Sun,Dinner,4,47
105,44,30.4,5.6,Male,No,Sun,Dinner,4,44
134,183,23.17,6.5,Male,Yes,Sun,Dinner,4,183
136,156,48.17,5.0,Male,No,Sun,Dinner,6,156
5,11,35.26,5.0,Female,No,Sun,Dinner,4,11
53,155,29.85,5.14,Female,No,Sun,Dinner,5,155
56,52,34.81,5.2,Female,No,Sun,Dinner,4,52


#### udf
- 此处的udf指按行操作 即每一行产生一个结果 此处以 size * total_bill + 1 为例

```SQL
select
    size * total_bill + 1  as res_col1
    size + total_bill + 2  as res_col2
from tips_df
```

In [19]:
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [20]:
# udf 演示
df_applied = tips_df.apply(lambda r: pd.Series({"res_col1": r["size"] * r["total_bill"] + 1, "res_col2": r["size"] + r["total_bill"] + 2}), axis=1)
df_applied.head(2)

Unnamed: 0,res_col1,res_col2
0,34.98,20.99
1,32.02,15.34


#### udaf
```SQL
select
    avg(tip) as tip_avg,
    avg(size) as size_avg
from tips_df
```

- 此处的udaf指对列进行操作 每列产生一个结果 此处以 sum(size) + 1 为例

In [21]:
# udaf 演示 对每列计算平均数
# 对列使用apply
import numpy as np
df_applied = tips_df[["tip", "size"]].apply(lambda r: sum(r) / len(r), axis=0)
df_applied.head(2)

tip     2.998279
size    2.569672
dtype: float64

In [22]:
# 使用agg
tips_df[["tip", "size"]].agg(np.mean)

tip     2.998279
size    2.569672
dtype: float64

#### 总结
- SQL 和 pandas 各有优劣 pandas 面向单机应用 所以每行每列相对之间是有顺序的 可以看做是内存中行和列都有有意义名字的矩阵
- hive SQL 面向分布式大数据， 行和列之间没有绝对的顺序
- pandas 适合更灵巧的操作 hive 更适合于求和求平均等一般性的操作
- pandas 可以原地修改数据 hive 不能 只能生成新的数据