In [6]:
import seaborn as sns
sns.set()
tips_df = sns.load_dataset("tips")

In [7]:
tips_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


## pandas 对比 hive 操作
**此处的对比仅包括读操作 hive大部分情况下是只读的**

#### limit

In [10]:
tips_df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


#### select where 
```SQL
select 
    tip,
    sex
from tips_df
where smoker != 'No' and (size > 2 or sex = 'Female')
```

In [24]:
# 使用bool列加位运算符的方式 dataframe.<colname>的方式来选取列 
# size 列因为和内置函数size冲突所以使用了 dataframe[<colname>]的方式 
# 注意运算符的优先级 位运算优先级较低需要辅助括号
res_df = tips_df[(tips_df.smoker != 'No') & ((tips_df["size"] > 2) | (tips_df.sex == 'Female'))][["tip", "sex"]]
res_df.head(3)

Unnamed: 0,tip,sex
56,3.0,Male
63,3.76,Male
67,1.0,Female


In [46]:
# 使用query方式 表达式类比于hive 但是要用== 不应该用=  字符串的常量数据要加单引号
res_df = tips_df.query("smoker != 'No' and (size > 2 or sex == 'Female')")[["tip", "sex"]]
res_df.head(3)

Unnamed: 0,tip,sex
56,3.0,Male
63,3.76,Male
67,1.0,Female


#### sum count avg distinct
```SQL
select
    count(sex) as sex_cnt,
    count(day) as day_cnt,
    count(distinct sex) as sex_distinct_cnt,
    sum(tip) as tip_sum,
    sum(size) as size_sum
from tips_df
```
对于pandas 不同的聚合操作得分别计算 不能用一个表达式计算   
相同的地方在于 pandas 一个dataFrame聚合后的结果是一行 Series hive也是一行

In [53]:
# sum
sumed = tips_df[["tip", "size"]].sum()
sumed

tip     731.58
size    627.00
dtype: float64

In [56]:
# count
counted = tips_df[["sex", "day"]].count()
counted

sex    244
day    244
dtype: int64

In [63]:
# count distinct 
# series unique 相当于distinct
# dataFrame nunique 相当于count distinct
counted = tips_df[["sex", "smoker"]].nunique()
counted

sex       2
smoker    2
dtype: int64

#### group by
```SQL
select
    sex,
    smoker,
    sum(tip) as tip_sum,
    sum(size) as size_sum,
    count(day) as day_count
from tips_df
group by sex, smoker
```

In [68]:
# group by ["sex", "smoker"] -> sum(tip) sum(size)
# group by 后是一个grouped dataFrame对象 以group的列组成笛卡尔积元组 对groupedDataFrame进行列聚合操作 结果又变为DataFrame 行索引变为多级索引
res = tips_df.groupby(["sex", "smoker"])[["tip", "size"]].sum()
print(type(res))
res

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,183.07,150
Male,No,302.0,263
Female,Yes,96.74,74
Female,No,149.77,140


#### order by
```SQL
select
    sex,
    smoker, 
    tips,
    size
from tips_df
order by tips, size desc
```
order by 中的列要在select 中出现 order by 是最后执行的语句

In [81]:
res = tips_df[["sex", "smoker", "tip", "size"]].sort_values(by=["tip", "size"], ascending=[True, False])
res.head(3)

Unnamed: 0,sex,smoker,tip,size
92,Female,Yes,1.0,2
236,Male,Yes,1.0,2
67,Female,Yes,1.0,1


#### join


In [92]:
left_table = tips_df[["total_bill", "tip", "sex"]]
right_table = tips_df[["smoker", "day", "time", "size"]]
left_table["left_id"] = left_table.index
right_table["right_id"] = right_table.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [93]:
left_table.head(3)

Unnamed: 0,total_bill,tip,sex,left_id
0,16.99,1.01,Female,0
1,10.34,1.66,Male,1
2,21.01,3.5,Male,2


In [94]:
right_table.head(3)

Unnamed: 0,smoker,day,time,size,right_id
0,No,Sun,Dinner,2,0
1,No,Sun,Dinner,3,1
2,No,Sun,Dinner,3,2
