In [48]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [49]:
# Load the per-item sales table for this product.
# Forward slashes are used as path separators so the relative path resolves
# on Windows as well as on Linux/macOS (a backslash-separated string is only
# interpreted as a nested path on Windows).
df = pd.read_csv("中间数据集/多表_按单品/白玉菇(袋).csv")

In [50]:
# Discard the descriptive / categorical columns that the pricing steps below
# do not read, then preview the remaining columns.
unused_columns = ['单品名称', '分类名称', '单品编码', '销售类型', '是否打折销售']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,日期,销量(千克),销售单价(元/千克),批发价格(元/千克),损耗率_单品
0,2020-08-07,1.0,7.9,3.75,6.57
1,2020-08-07,1.0,7.9,3.75,6.57
2,2020-08-07,1.0,7.9,3.75,6.57
3,2020-08-07,1.0,7.9,3.75,6.57
4,2020-08-08,1.0,7.9,3.75,6.57


In [51]:
# Convert the date column to datetime values.
df['日期'] = pd.to_datetime(df['日期'])

# Per-row selling price, wholesale price, and the spread between them.
selling_price = df['销售单价(元/千克)']
wholesale_price = df['批发价格(元/千克)']
price_spread = selling_price - wholesale_price

# Profit of each row: price spread multiplied by the quantity sold.
df['平均利润'] = price_spread * df['销量(千克)']

# Profit rate relative to the wholesale price.
# Negative rates are kept as-is (no clamping is applied), and a wholesale
# price of 0 is not guarded against here.
df['平均利润率'] = price_spread / wholesale_price

# The markup rate is taken directly from the cost-based profit rate above.
df['加成率'] = df['平均利润率']

# Cost-plus price = wholesale price * (1 + markup rate).
df['成本加成定价'] = wholesale_price * (1 + df['加成率'])

# Cap the cost-plus price at 1.75x the wholesale price: any row whose price is
# not <= the cap (including rows where the comparison is False because of NaN)
# is replaced by the cap.
max_allowed_price = wholesale_price * 1.75
within_cap = df['成本加成定价'] <= max_allowed_price
df['成本加成定价'] = df['成本加成定价'].mask(~within_cap, max_allowed_price)

df.head()

Unnamed: 0,日期,销量(千克),销售单价(元/千克),批发价格(元/千克),损耗率_单品,平均利润,平均利润率,加成率,成本加成定价
0,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
1,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
2,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
3,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
4,2020-08-08,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625


In [52]:
# Outlier handling: flag anomalous (selling price, cost-plus price) pairs with
# an Isolation Forest and keep only the rows labelled as normal.
outlier_features = ['销售单价(元/千克)', '成本加成定价']

# contamination is the expected proportion of outliers in the data.
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# fit_predict fits on the frame and labels the same rows:
# -1 marks an outlier, 1 marks a normal row.
df['anomaly'] = iso_forest.fit_predict(df[outlier_features])

# Keep the normal rows only and remove the temporary label column.
is_normal = df['anomaly'].eq(1)
df = df.loc[is_normal].drop(columns='anomaly')

df.head()

Unnamed: 0,日期,销量(千克),销售单价(元/千克),批发价格(元/千克),损耗率_单品,平均利润,平均利润率,加成率,成本加成定价
0,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
1,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
2,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
3,2020-08-07,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625
4,2020-08-08,1.0,7.9,3.75,6.57,4.15,1.106667,1.106667,6.5625


In [53]:
# The intermediate profit / markup columns were only needed to derive the
# cost-plus price; remove them before aggregating.
intermediate_columns = ['平均利润', '平均利润率', '加成率']
df = df.drop(columns=intermediate_columns)
df.head()

Unnamed: 0,日期,销量(千克),销售单价(元/千克),批发价格(元/千克),损耗率_单品,成本加成定价
0,2020-08-07,1.0,7.9,3.75,6.57,6.5625
1,2020-08-07,1.0,7.9,3.75,6.57,6.5625
2,2020-08-07,1.0,7.9,3.75,6.57,6.5625
3,2020-08-07,1.0,7.9,3.75,6.57,6.5625
4,2020-08-08,1.0,7.9,3.75,6.57,6.5625


In [55]:
# Aggregate the rows into one record per calendar day, using the date column
# as the resampling key: sales volume is summed, every other column is averaged.
averaged_columns = ['销售单价(元/千克)', '批发价格(元/千克)', '损耗率_单品', '成本加成定价']
daily_aggregations = {'销量(千克)': 'sum', **dict.fromkeys(averaged_columns, 'mean')}

daily_bins = df.resample('D', on='日期')
df_final = daily_bins.agg(daily_aggregations)
df_final.head()

Unnamed: 0_level_0,销量(千克),销售单价(元/千克),批发价格(元/千克),损耗率_单品,成本加成定价
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-08-07,4.0,7.9,3.75,6.57,6.5625
2020-08-08,7.0,7.9,3.75,6.57,6.5625
2020-08-09,0.0,,,,
2020-08-10,0.0,,,,
2020-08-11,0.0,,,,
