## Step 0: 生成“脏”数据

In [None]:
# 环境依赖
import pandas as pd
import numpy as np

In [None]:
# 构建原始字典数据 (对应上午的学习内容：理解字典到DataFrame的转换)
data = {
    'Transaction_ID': [101, 102, 103, 104, 102, 105, 106, 107, 108, 109],  # 注意 102 重复了
    'Branch': ['Beijing_A', 'Shanghai_B', 'Beijing_A', 'Shenzhen_C', 'Shanghai_B', 
               'Beijing_A', 'Shenzhen_C', 'Shanghai_B', 'Beijing_A', 'Shenzhen_C'],
    'Product': ['Latte', 'Espresso', 'Cappuccino', 'Latte', 'Espresso', 
                'Mocha', 'Latte', 'Espresso', 'Latte', 'Mocha'],
    'Price': [30.0, 25.0, 32.0, 30.0, 25.0, np.nan, 30.0, 25.0, 30.0, np.nan], # 有缺失值
    'Quantity': [1, 2, 1, 3, 2, 2, 1, 5, 2, 1],
    'Date': ['2023-10-01'] * 10
}

# 创建 DataFrame
df_initial = pd.DataFrame(data)

# 保存为 CSV (对应中午的学习内容：数据加载)
df_initial.to_csv('dirty_coffee_sales.csv', index=False)

print("项目数据 'dirty_coffee_sales.csv' 已生成！")

## Step 1: 基础了解

In [None]:
# 理解数据基本形态
ser_product = pd.Series(data=data['Product'])
print("任务 A: 手动创建的 Product 列 Series:")
ser_product

In [None]:
# 使用 .loc 找出标签（索引）为 3 的那行数据
print("任务 B: 使用 .loc 找出标签为 3 的那行数据:")
df_initial.loc[3]

In [7]:
# 使用 .iloc 找出前 5 行、前 2 列的数据
print("任务 B: 使用 .iloc 找出前 5 行、前 2 列的数据:")
df_initial.iloc[:5, :2]

任务 B: 使用 .iloc 找出前 5 行、前 2 列的数据:


Unnamed: 0,Transaction_ID,Branch
0,101,Beijing_A
1,102,Shanghai_B
2,103,Beijing_A
3,104,Shenzhen_C
4,102,Shanghai_B


## Step 2: 数据加载与预览

In [13]:
df = pd.read_csv('dirty_coffee_sales.csv')
print("数据预览:head():")
df.head()
print("数据预览:info():")
df.info()

df

数据预览:head():
数据预览:info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Transaction_ID  10 non-null     int64  
 1   Branch          10 non-null     object 
 2   Product         10 non-null     object 
 3   Price           8 non-null      float64
 4   Quantity        10 non-null     int64  
 5   Date            10 non-null     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 612.0+ bytes


Unnamed: 0,Transaction_ID,Branch,Product,Price,Quantity,Date
0,101,Beijing_A,Latte,30.0,1,2023-10-01
1,102,Shanghai_B,Espresso,25.0,2,2023-10-01
2,103,Beijing_A,Cappuccino,32.0,1,2023-10-01
3,104,Shenzhen_C,Latte,30.0,3,2023-10-01
4,102,Shanghai_B,Espresso,25.0,2,2023-10-01
5,105,Beijing_A,Mocha,,2,2023-10-01
6,106,Shenzhen_C,Latte,30.0,1,2023-10-01
7,107,Shanghai_B,Espresso,25.0,5,2023-10-01
8,108,Beijing_A,Latte,30.0,2,2023-10-01
9,109,Shenzhen_C,Mocha,,1,2023-10-01


## Step 3: 数据清洗与处理

In [None]:
print("任务 A: 处理重复值")
df.duplicated()
df = df.drop_duplicates().reset_index(drop=True)
df

任务 A: 处理重复值


Unnamed: 0,Transaction_ID,Branch,Product,Price,Quantity,Date
0,101,Beijing_A,Latte,30.0,1,2023-10-01
1,102,Shanghai_B,Espresso,25.0,2,2023-10-01
2,103,Beijing_A,Cappuccino,32.0,1,2023-10-01
3,104,Shenzhen_C,Latte,30.0,3,2023-10-01
4,105,Beijing_A,Mocha,,2,2023-10-01
5,106,Shenzhen_C,Latte,30.0,1,2023-10-01
6,107,Shanghai_B,Espresso,25.0,5,2023-10-01
7,108,Beijing_A,Latte,30.0,2,2023-10-01
8,109,Shenzhen_C,Mocha,,1,2023-10-01


In [None]:
# 处理缺失值
print("任务 B: 处理缺失值")
df = df.fillna(35.0) # 假设摩卡的价格为35.0
df.isna().sum() # 输出每个Series的缺失值数量

任务 B: 处理缺失值


Transaction_ID    0
Branch            0
Product           0
Price             0
Quantity          0
Date              0
Total_Price       0
dtype: int64

In [21]:
# 数据处理：获得每单总价
df['Total_Price'] = df['Price'] * df['Quantity']
df

Unnamed: 0,Transaction_ID,Branch,Product,Price,Quantity,Date,Total_Price
0,101,Beijing_A,Latte,30.0,1,2023-10-01,30.0
1,102,Shanghai_B,Espresso,25.0,2,2023-10-01,50.0
2,103,Beijing_A,Cappuccino,32.0,1,2023-10-01,32.0
3,104,Shenzhen_C,Latte,30.0,3,2023-10-01,90.0
4,105,Beijing_A,Mocha,35.0,2,2023-10-01,70.0
5,106,Shenzhen_C,Latte,30.0,1,2023-10-01,30.0
6,107,Shanghai_B,Espresso,25.0,5,2023-10-01,125.0
7,108,Beijing_A,Latte,30.0,2,2023-10-01,60.0
8,109,Shenzhen_C,Mocha,35.0,1,2023-10-01,35.0


## Step 4: 数据聚合与分析

In [27]:
print("任务 A: 数据聚合 - 按分店统计总销售额")
df.groupby('Branch')['Total_Price'].sum()

任务 A: 数据聚合 - 按分店统计总销售额


Branch
Beijing_A     192.0
Shanghai_B    175.0
Shenzhen_C    155.0
Name: Total_Price, dtype: float64

In [None]:
print("任务 B: 数据透视表 - 每个分店每种产品的总销售额")
(
    df.groupby(['Branch', 'Product'], as_index=False)
        .agg(Total_Price=('Total_Price', 'sum'))
        .sort_values(['Branch', 'Total_Price'], ascending=[True, False])
)

任务 B: 数据透视表 - 每个分店每种产品的总销售额


Unnamed: 0,Branch,Product,Total_Price
1,Beijing_A,Latte,90.0
2,Beijing_A,Mocha,70.0
0,Beijing_A,Cappuccino,32.0
3,Shanghai_B,Espresso,175.0
4,Shenzhen_C,Latte,120.0
5,Shenzhen_C,Mocha,35.0
