In [1]:
import pandas as pd
from matplotlib import pylab

# Chinese font setup for matplotlib: use SimHei as the sans-serif family and
# turn off the Unicode minus glyph (axes.unicode_minus = False).
pylab.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Display setup: do not limit the number of columns shown when printing frames.
pd.set_option("display.max_columns", None)


In [2]:
# Load the cleaned data set from the Excel workbook.
clean_data_path = r'E:\shenzhen_house_price_project\data\clean\清洗后数据.xlsx'
df = pd.read_excel(clean_data_path)
print(f"✅ 数据加载成功，共 {len(df):,} 条记录")

# Structural overview: (rows, columns), then the dtype of every column.
print("数据规模：", df.shape)
print("\n字段类型：")
print(df.dtypes)

# Descriptive statistics for the area and price columns.
numeric_cols = ["AREA", "per_price", "total_price"]
print("\n数值字段描述性统计：")
print(df[numeric_cols].describe())


✅ 数据加载成功，共 29,904 条记录
数据规模： (29904, 10)

字段类型：
district        object
roomnum          int64
hall             int64
AREA           float64
C_floor         object
school           int64
subway           int64
per_price      float64
total_price    float64
source_file     object
dtype: object

数值字段描述性统计：
               AREA     per_price   total_price
count  29904.000000  29904.000000  29904.000000
mean     103.081402      5.695858    682.099839
std       52.367763      3.555464    870.308192
min       13.870000      0.620300     37.800000
25%       76.000000      3.429800    268.000000
50%       89.000000      4.635450    407.000000
75%      117.882500      6.818100    720.000000
max      960.000000     59.146900  16899.000000


In [3]:
# 1. Per-district summary of prices and amenities
# Output column -> (source column, aggregation). The mean of the 0/1 `subway`
# and `school` flags is the share of listings in the district with that flag.
district_aggs = {
    "avg_per_price": ("per_price", "mean"),
    "avg_total_price": ("total_price", "mean"),
    "subway_ratio": ("subway", "mean"),
    "school_ratio": ("school", "mean"),
    "house_count": ("district", "count"),
}

# One row per district, most expensive (by mean unit price) first.
district_summary = (
    df.groupby("district")
      .agg(**district_aggs)
      .reset_index()
      .sort_values(by="avg_per_price", ascending=False)
)

print(district_summary)

# Persist the summary; utf-8-sig writes a BOM at the start of the file.
district_summary.to_csv(
    r'E:\shenzhen_house_price_project\data\clean\district_summary.csv',
    index=False,
    encoding="utf-8-sig"
)


  district  avg_per_price  avg_total_price  subway_ratio  school_ratio  \
1       南山      10.847225      1814.708977      0.946970      0.476584   
6       福田       8.874678      1136.691030      0.970109      0.702989   
4       宝安       5.901789       738.240032      0.842216      0.426913   
8       龙华       5.299328       554.114191      0.767010      0.455403   
7       罗湖       5.096743       410.916915      0.993508      0.600625   
5       盐田       4.059365       475.638575      0.967683      0.429673   
0       光明       4.020975       392.890196      0.738854      0.244978   
9       龙岗       3.897038       384.448074      0.875212      0.630482   
2       坪山       2.932494       272.262727      0.737052      0.392249   
3     大鹏新区       2.839108       289.428959      0.000000      0.352941   

   house_count  
1         2904  
6         3680  
4         3790  
8         3498  
7         4159  
5         2723  
0         2041  
9         4127  
2         2761  
3          221 

In [4]:
# 2. Budget tiers
# Bucket every listing by total_price. pd.cut uses right-closed intervals by
# default, so the tiers are (0, 500], (500, 1000] and (1000, inf).
budget_edges = [0, 500, 1000, float("inf")]
budget_labels = ["刚需", "改善", "高端"]
df["budget_group"] = pd.cut(
    df["total_price"], bins=budget_edges, labels=budget_labels
)

# Budget tier x room count: number of listings in each combination.
# observed=True is passed explicitly to silence the groupby FutureWarning.
tier_room_counts = df.groupby(["budget_group", "roomnum"], observed=True).size()
budget_room_summary = tier_room_counts.reset_index(name="house_count")

print(budget_room_summary)

budget_room_summary.to_csv(
    r"E:\shenzhen_house_price_project\data\clean\budget_room_summary.csv",
    index=False,
    encoding="utf-8-sig"
)

   budget_group  roomnum  house_count
0            刚需        1         2083
1            刚需        2         4880
2            刚需        3         9070
3            刚需        4         2185
4            刚需        5          170
5            刚需        6           16
6            刚需        7            2
7            改善        1           48
8            改善        2          686
9            改善        3         3191
10           改善        4         2204
11           改善        5          640
12           改善        6           86
13           改善        7           19
14           改善        8           10
15           改善        9            2
16           高端        1            8
17           高端        2          117
18           高端        3         1050
19           高端        4         1947
20           高端        5         1031
21           高端        6          273
22           高端        7          103
23           高端        8           53
24           高端        9           30


In [5]:
# 3. Unit price of subway vs. non-subway listings (exported for a box plot)
# Row-level export of the columns the comparison needs.
df[["district", "subway", "school", "per_price"]].to_csv(
    r'E:\shenzhen_house_price_project\data\clean\price_with_subway_school.csv',
    index=False,
    encoding="utf-8-sig"
)

# Subway premium rate per district.
# Mean per_price for each (district, subway) pair, pivoted to one column per
# subway flag value.
subway_premium = (
    df.groupby(["district", "subway"])
      .agg(avg_price=("per_price", "mean"))
      .reset_index()
      .pivot(index="district", columns="subway", values="avg_price")
      # Label the columns by flag value instead of by position. reindex makes
      # sure both the 0 and the 1 column exist (all-NaN if a flag value never
      # occurs) and keeps only those two, so the rename below can neither fail
      # on a length mismatch nor swap the labels.
      .reindex(columns=[0, 1])
      .rename(columns={0: "no_subway_price", 1: "subway_price"})
      # Drop the leftover "subway" name on the column axis so the printed
      # header matches a plain column list.
      .rename_axis(None, axis="columns")
      .reset_index()
)

# Relative premium: (subway - non-subway) / non-subway. A district missing
# either group has NaN in that column and therefore a NaN rate.
subway_premium["subway_premium_rate"] = (
    (subway_premium["subway_price"] - subway_premium["no_subway_price"])
    / subway_premium["no_subway_price"]
)

print(subway_premium)
subway_premium.to_csv(
    r'E:\shenzhen_house_price_project\data\clean\subway_premium_by_district.csv',
    index=False,
    encoding="utf-8-sig"
)


  district  no_subway_price  subway_price  subway_premium_rate
0       光明         3.638220      4.156260             0.142388
1       南山        11.203257     10.827287            -0.033559
2       坪山         2.630762      3.040139             0.155612
3     大鹏新区         2.839108           NaN                  NaN
4       宝安         3.685268      6.317039             0.714133
5       盐田         3.977295      4.062106             0.021324
6       福田         9.199033      8.864684            -0.036346
7       罗湖         4.491156      5.100700             0.135721
8       龙华         4.606263      5.509856             0.196166
9       龙岗         3.248366      3.989526             0.228164


In [6]:
# 4. School-district premium rate per administrative district
# Row-level export for the school vs. non-school price comparison.
df[["district", "school", "per_price"]].to_csv(
    r'E:\shenzhen_house_price_project\data\clean\price_with_school.csv',
    index=False,
    encoding="utf-8-sig"
)

# Long table: mean per_price for each (district, school) pair.
school_price_long = (
    df.groupby(["district", "school"])
      .agg(avg_price=("per_price", "mean"))
      .reset_index()
)

print(school_price_long)

# Wide table: one row per district, one price column per school flag value.
school_premium = (
    school_price_long
      .pivot(index="district", columns="school", values="avg_price")
      # Label the columns by flag value instead of by position. reindex makes
      # sure both the 0 and the 1 column exist (all-NaN if a flag value never
      # occurs) and keeps only those two, so the rename below can neither fail
      # on a length mismatch nor swap the labels.
      .reindex(columns=[0, 1])
      .rename(columns={0: "no_school_price", 1: "school_price"})
      # Drop the leftover "school" name on the column axis so the printed
      # header matches a plain column list.
      .rename_axis(None, axis="columns")
      .reset_index()
)

# Relative premium: (school - non-school) / non-school. A district missing
# either group has NaN in that column and therefore a NaN rate.
school_premium["school_premium_rate"] = (
    (school_premium["school_price"] - school_premium["no_school_price"])
    / school_premium["no_school_price"]
)
print(school_premium)
school_premium.to_csv(
    r'E:\shenzhen_house_price_project\data\clean\school_premium_by_district.csv',
    index=False,
    encoding="utf-8-sig"
)


   district  school  avg_price
0        光明       0   4.047709
1        光明       1   3.938581
2        南山       0  12.358595
3        南山       1   9.187339
4        坪山       0   2.885927
5        坪山       1   3.004644
6      大鹏新区       0   2.737613
7      大鹏新区       1   3.025182
8        宝安       0   5.724463
9        宝安       1   6.139830
10       盐田       0   4.046879
11       盐田       1   4.075938
12       福田       0   8.582507
13       福田       1   8.998119
14       罗湖       0   4.934396
15       罗湖       1   5.204692
16       龙华       0   4.944503
17       龙华       1   5.723647
18       龙岗       0   3.733186
19       龙岗       1   3.993070
  district  no_school_price  school_price  school_premium_rate
0       光明         4.047709      3.938581            -0.026961
1       南山        12.358595      9.187339            -0.256603
2       坪山         2.885927      3.004644             0.041137
3     大鹏新区         2.737613      3.025182             0.105044
4       宝安         5.724463      6

In [7]:
# 5. Market share by floor level
# Listing count and mean unit price for each C_floor category.
floor_groups = df.groupby("C_floor")
floor_summary = floor_groups.agg(
    house_count=("C_floor", "count"),
    avg_per_price=("per_price", "mean"),
).reset_index()

# Share of all listings that falls in each floor category.
total_listings = floor_summary["house_count"].sum()
floor_summary["ratio"] = floor_summary["house_count"] / total_listings

print(floor_summary)
floor_summary.to_csv(
    r'E:\shenzhen_house_price_project\data\clean\floor_summary.csv',
    index=False,
    encoding="utf-8-sig"
)


  C_floor  house_count  avg_per_price     ratio
0    high        10192       5.765741  0.340824
1     low         8942       5.887748  0.299024
2  middle        10770       5.470404  0.360152
