### python 使用新技能记录

#### 1. 生成数据
+ 通过numpy生成数据
+ 通过 Dict创建
+ 通过嵌套列表（或 NumPy 数组）创建，也可以转置后创建
+ pd.Timestamp
+ to_datetime()

In [11]:
# Build a 6x4 frame of standard-normal samples from numpy, indexed by six
# consecutive days starting 2017-01-01, with columns A-D.
dates = pd.date_range(start='20170101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2017-01-01,-1.212702,-0.618336,-1.791715,-0.095061
2017-01-02,-0.81766,-0.108716,0.499001,0.65718
2017-01-03,0.996122,-1.459087,-1.667035,1.507964
2017-01-04,1.75419,1.171723,0.493786,0.713137
2017-01-05,-0.723157,0.357306,-0.084604,0.596102
2017-01-06,1.450784,1.905929,0.58278,-1.636454


In [4]:
# Build a frame from a dict of columns. The scalar entries ('A', 'B', 'F')
# are repeated on every row; the array-like entries supply four rows.
df2 = pd.DataFrame(
    data=dict(
        A=1.,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.array([3] * 4, dtype='int32'),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# Build a frame from a nested list: each inner list is one row, labelled by
# `index`, with fields named by `columns`.
# Per the original note, the data could also be supplied transposed.
data = [
    [2000, 1, 2],
    [2001, 1, 3],
]

df = pd.DataFrame(data=data, index=['one', 'two'], columns=['year', 'state', 'pop'])
df

Unnamed: 0,year,state,pop
one,2000,1,2
two,2001,1,3


In [8]:
import pandas as pd
from datetime import datetime as dt

# Three ways to construct a pd.Timestamp: from integer date components,
# from a datetime.datetime instance, and from a date-time string.
p1 = pd.Timestamp(year=2017, month=6, day=19)
p2 = pd.Timestamp(dt(2017, 6, 19, 9, 13, 45))
p3 = pd.Timestamp("2017-6-19 9:13:45")

In [10]:
import pandas as pd
from datetime import datetime as dt

# pd.to_datetime converts a date-time string or a datetime.datetime
# instance to a pandas timestamp.
p4 = pd.to_datetime(arg="2017-6-19 9:13:45")
p5 = pd.to_datetime(dt(2017, 6, 19, 9, 13, 45))

#### 2. apply
+ 定义函数，查找前n条数据
+ 分位数和桶分析
+ 定义函数，对数据进行统计
+ 定义函数，求加权平均数

In [1]:
import pandas as pd
import numpy as np

In [41]:
# Synthetic 700-row sample frame for the groupby/apply examples:
# two standard-normal float columns plus three repeating label columns.
df = pd.DataFrame(
    data=dict(
        total_bill=np.random.randn(700, 1).flatten(),
        tip=np.random.randn(700, 1).flatten(),
        smoker=['yes', 'no'] * 350,
        day=['1', '2', '3', '4', '5', '6', '7'] * 100,
        size=[1, 2, 3, 4, 7, 6, 7] * 100,
    )
)

In [42]:
# Helper: select the n rows with the largest values in a column.
def top(df, n=5, column='tip'):
    """Return the ``n`` rows of ``df`` with the largest ``column`` values.

    The rows come back sorted ascending by ``column`` (largest row last).

    Args:
        df: DataFrame to select from.
        n: Number of rows to keep; ``0`` yields an empty frame.
        column: Label of the column to sort by.

    Returns:
        A DataFrame holding the selected rows.
    """
    # Use .tail(n) rather than [-n:]: for n == 0 the slice [-0:] equals [0:]
    # and would return every row instead of none.
    return df.sort_values(by=column).tail(n)

In [43]:
# Apply `top` to each smoker group. With the default settings the group key
# appears as an outer index level of the result (see the output below).
grouped = df.groupby('smoker')
grouped.apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,size
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
no,187,0.949143,2.580631,no,6,6
no,377,-0.421504,2.588732,no,7,7
no,661,0.217543,2.787999,no,4,4
no,557,-1.350705,2.888313,no,5,7
no,45,0.0053,3.483185,no,4,4
yes,300,0.55321,2.037802,yes,7,7
yes,502,-0.525894,2.091052,yes,6,6
yes,382,1.377806,2.160574,yes,5,7
yes,176,0.330065,2.177677,yes,2,2
yes,180,0.806243,2.348313,yes,6,6


In [44]:
# The group keys combine with the original object's index to form a
# hierarchical index in the result; pass group_keys=False to turn that off.
grouped = df.groupby('smoker', group_keys=False)
grouped.apply(top)

Unnamed: 0,total_bill,tip,smoker,day,size
187,0.949143,2.580631,no,6,6
377,-0.421504,2.588732,no,7,7
661,0.217543,2.787999,no,4,4
557,-1.350705,2.888313,no,5,7
45,0.0053,3.483185,no,4,4
300,0.55321,2.037802,yes,7,7
502,-0.525894,2.091052,yes,6,6
382,1.377806,2.160574,yes,5,7
176,0.330065,2.177677,yes,2,2
180,0.806243,2.348313,yes,6,6


In [29]:
# Group by two keys and forward extra keyword arguments to `top`: for each
# (smoker, day) group keep the single row with the largest total_bill.
df.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,size
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
no,1,231,2.842382,-0.984559,no,1,1
no,2,533,1.951704,-0.1316,no,2,2
no,3,639,1.722682,-0.272726,no,3,3
no,4,283,1.826005,0.942699,no,4,4
no,5,95,2.071527,0.432614,no,5,7
no,6,621,2.286159,1.527302,no,6,6
no,7,419,3.260771,-0.823381,no,7,7
yes,1,238,2.649499,0.21674,yes,1,1
yes,2,652,2.321149,-2.511195,yes,2,2
yes,3,114,2.205318,1.015169,yes,3,3


In [32]:
# Quantile and bucket analysis.
# pd.cut splits the data into bins of equal width; the number of rows that
# land in each bin is not necessarily equal.
# NOTE(review): despite the name `quartiles`, these are four equal-width
# bins, not quantile-based ones.
quartiles = pd.cut(df.total_bill, 4)
quartiles[:10]

0    (-1.347, 0.189]
1     (0.189, 1.725]
2    (-1.347, 0.189]
3    (-1.347, 0.189]
4    (-1.347, 0.189]
5    (-1.347, 0.189]
6     (0.189, 1.725]
7    (-1.347, 0.189]
8    (-1.347, 0.189]
9     (0.189, 1.725]
Name: total_bill, dtype: category
Categories (4, interval[float64]): [(-2.89, -1.347] < (-1.347, 0.189] < (0.189, 1.725] < (1.725, 3.261]]

In [34]:
# Helper: compute summary statistics for one group.
def get_stats(group):
    """Summarise ``group`` as a dict keyed 'min', 'max', 'cnt' and 'mean'.

    Each value is the result of calling the matching method on ``group``
    (``min``, ``max``, ``count`` and ``mean`` respectively).
    """
    # (output key, method name) pairs, in the order the keys are emitted.
    stat_methods = (
        ('min', 'min'),
        ('max', 'max'),
        ('cnt', 'count'),
        ('mean', 'mean'),
    )
    return {key: getattr(group, method)() for key, method in stat_methods}
# Group the tip column by the total_bill bins and tabulate the statistics:
# unstack() pivots the per-group dict keys into columns (one row per bin).
grouped = df.tip.groupby(quartiles)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,cnt,max,mean,min
total_bill,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.89, -1.347]",53.0,2.182337,-0.168562,-2.473014
"(-1.347, 0.189]",344.0,2.677216,-0.021576,-3.949428
"(0.189, 1.725]",278.0,3.232267,0.03066,-2.535697
"(1.725, 3.261]",25.0,1.676914,-0.088116,-2.511195


In [37]:
# Sample data for the weighted-average example: 100 rows with an
# alternating yes/no category, standard-normal `data` values and
# uniform [0, 1) `weight` values.
df = pd.DataFrame(
    data=dict(
        category=['yes', 'no'] * 50,
        data=np.random.randn(100),
        weight=np.random.rand(100),
    )
)


In [38]:
# Group by category and show how many rows fall in each group.
grouped = df.groupby('category')
grouped.size()

category
no     50
yes    50
dtype: int64

In [40]:
def get_wavg(g):
    """Return the average of ``g['data']`` weighted by ``g['weight']``."""
    return np.average(g['data'], weights=g['weight'])
# Weighted average of `data` within each category group.
grouped.apply(get_wavg)

category
no     0.174674
yes   -0.114259
dtype: float64

In [None]:
def func_stand(data_one_stock_num, time_step):
    """Rolling z-score standardisation of one stock's feature columns.

    Written to be used with ``groupby(...).apply``: each call receives the
    rows of a single group as a DataFrame (the original note says the group
    label is available as ``data_one_stock_num.name`` and that the frame has
    shape ``(num, 11)``, ``num`` being the number of rows for that stock;
    the column count is not checked here).

    Every column except "gate", "stock_date" and "stock_num" is replaced by
    ``(value - rolling mean) / rolling std`` computed over a window of
    ``time_step`` rows.

    Args:
        data_one_stock_num: DataFrame holding the rows of one group.
        time_step: Rolling window size passed to ``Series.rolling``.

    Returns:
        A new DataFrame with the standardised columns; the input frame is
        left unmodified.
    """
    skipped_columns = ("gate", "stock_date", "stock_num")
    # Work on a copy: pandas does not support functions that mutate the
    # group object handed to them by groupby.apply.
    standardized = data_one_stock_num.copy()
    for colu_name in data_one_stock_num.columns:
        if colu_name in skipped_columns:
            continue
        # Standardise input columns only: (raw value - mean) / std.
        # Build the rolling window once and reuse it for both statistics.
        column = data_one_stock_num[colu_name]
        window = column.rolling(time_step)
        standardized[colu_name] = (column - window.mean()) / window.std()
    return standardized
# Standardise each stock's rows independently by applying func_stand per
# stock_num group.
# NOTE(review): `total_data` and `time_step` are not defined in this
# section; presumably they come from elsewhere in the notebook.
g_stock_num = total_data.groupby(by = ["stock_num"])
data_after_stand = g_stock_num.apply(func_stand, time_step = time_step)

# 2. Rolling-window quantiles of `x` within each (dow, hour, minute) group.
# NOTE(review): assumes `df` has columns dow, hour, minute and x; that frame
# is not defined in this section.
g = df.groupby(['dow', 'hour', 'minute'])['x']
# transform (rather than apply) keeps each result on df's original index, so
# the final column-wise concat lines up row for row whatever the group_keys
# default is. `axis` is passed by keyword because pandas 2.0 no longer
# accepts it positionally in pd.concat.
s = pd.concat(
    {
        y: g.transform(lambda x: x.rolling(5, min_periods=1).quantile(y))
        for y in [0.25, 0.30, 0.50]
    },
    axis=1,
)
yourdf = pd.concat([df, s], axis=1)

