In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the residential-community table and print its schema / null counts.
xiaoqu = pd.read_csv(filepath_or_buffer="/home/mw/project/xiaoqu.csv")
xiaoqu.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 18 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   名称       2973 non-null   object 
 1   环线位置     1730 non-null   object 
 2   建筑年代     2793 non-null   float64
 3   房屋总数     2973 non-null   int64  
 4   楼栋总数     2973 non-null   int64  
 5   物业公司     2344 non-null   object 
 6   绿 化 率    2973 non-null   float64
 7   容 积 率    2973 non-null   float64
 8   物 业 费    2630 non-null   object 
 9   供水       2798 non-null   object 
 10  供暖       2273 non-null   object 
 11  供电       2812 non-null   object 
 12  燃气费      2564 non-null   object 
 13  供热费      1190 non-null   object 
 14  停车位      2973 non-null   float64
 15  停车费用     2973 non-null   float64
 16  coord_x  2973 non-null   float64
 17  coord_y  2973 non-null   float64
dtypes: float64(7), int64(2), object(9)
memory usage: 418.2+ KB


In [3]:
# Load the rental-listing table and print its schema / null counts.
rent = pd.read_csv(filepath_or_buffer='/home/mw/input/quant4533/ruc_Class25Q1_rent.csv')
rent.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84150 entries, 0 to 84149
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   城市      84150 non-null  int64  
 1   小区名称    84150 non-null  object 
 2   户型      84149 non-null  object 
 3   装修      29916 non-null  object 
 4   价格      84150 non-null  int64  
 5   楼层      84145 non-null  object 
 6   面积      84150 non-null  object 
 7   朝向      84145 non-null  object 
 8   交易时间    84150 non-null  object 
 9   付款方式    69127 non-null  object 
 10  租赁方式    84150 non-null  object 
 11  电梯      84148 non-null  object 
 12  车位      21377 non-null  object 
 13  用水      72771 non-null  object 
 14  用电      73206 non-null  object 
 15  燃气      81965 non-null  object 
 16  采暖      61414 non-null  object 
 17  租期      44315 non-null  object 
 18  配套设施    58921 non-null  object 
 19  lon     84150 non-null  float64
 20  lat     84150 non-null  float64
 21  年份      84150 non-null  float64
dtypes: float64(3), int64(2), object(17)

In [4]:
# Drop the trailing one-character unit from each area string and convert to float.
# Guarded on dtype so that re-running this cell after the column is already
# numeric is a no-op instead of raising on an attempt to slice a float.
if not pd.api.types.is_numeric_dtype(rent['面积']):
    # `.str[:-1]` is the vectorized equivalent of `lambda x: x[:-1]` and
    # passes missing values through as NaN rather than raising.
    rent['面积'] = rent['面积'].str[:-1].astype('float')

In [5]:
# Monthly rent per square metre, computed column-wise instead of row by row.
# `where` blanks out areas that are missing or not strictly positive (NaN > 0
# is False), so those rows get NaN rather than a division by zero.
rent['每平米月租金'] = rent['价格'] / rent['面积'].where(rent['面积'] > 0)

In [6]:
# Mean rent per square metre for every community name found in the listings.
average_rent = (
    rent.groupby('小区名称', as_index=False)['每平米月租金']
    .mean()
    .rename(columns={'每平米月租金': '平均每平米月租金'})
)

In [7]:
# Boolean mask: True where the community name also appears in the rental averages.
is_matching = xiaoqu['名称'].isin(average_rent['小区名称'])

# Row labels of the communities that matched by name
matching_indexes = xiaoqu[is_matching].index

In [8]:
# Boolean mask: True where the community name is absent from the rental averages.
not_matching = ~(xiaoqu['名称'].isin(average_rent['小区名称']))
# Row labels of the communities that did not match by name
unmatching_indexes = xiaoqu[not_matching].index
print(unmatching_indexes)
# Names of the communities that did not match
unmatched_names = xiaoqu['名称'].loc[not_matching]

Int64Index([   0,   15,   18,   23,   39,   45,   83,  109,  141,  164,
            ...
            2838, 2865, 2867, 2869, 2899, 2916, 2932, 2942, 2968, 2969],
           dtype='int64', length=298)


In [9]:
def haversine(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Great-circle distance between two longitude/latitude points, in kilometres.

    All coordinates are given in decimal degrees. Each argument may be a
    scalar or an array-like; the usual numpy broadcasting rules apply, so a
    single point can be compared against whole columns of coordinates.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.
    radius_km : sphere radius in kilometres used to scale the central angle.
        Defaults to 6367, the value this function has always used.

    Returns
    -------
    Distance(s) in kilometres, with the broadcast shape of the inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

In [10]:
# Communities whose name already appears in the rental data match themselves.
xiaoqu.loc[matching_indexes, '匹配租房'] = xiaoqu.loc[matching_indexes, '名称']

# Pull the listing columns out once as plain arrays: they do not change inside
# the loop, and positional array lookup stays correct even if `rent` no longer
# has a default 0..n-1 index (np.argmin returns a position, not a label).
rent_lon = rent['lon'].to_numpy()
rent_lat = rent['lat'].to_numpy()
rent_names = rent['小区名称'].to_numpy()

# For every community without a name match, take the name of the geographically
# nearest rental listing.
# NOTE(review): assumes coord_x is longitude and coord_y is latitude — confirm.
for index, row in xiaoqu.loc[unmatching_indexes].iterrows():
    distances = haversine(row['coord_x'], row['coord_y'], rent_lon, rent_lat)
    nearest_pos = int(np.argmin(distances))
    xiaoqu.loc[index, '匹配租房'] = rent_names[nearest_pos]

In [11]:
# Attach the mean rent of the matched community to every row of `xiaoqu`.
# `validate='many_to_one'` makes pandas raise if the right-hand key is not
# unique, instead of silently duplicating community rows.
xiaoqu = xiaoqu.merge(
    average_rent[['小区名称', '平均每平米月租金']],
    left_on='匹配租房',
    right_on='小区名称',
    how='left',
    validate='many_to_one',
)

In [12]:
# Summary statistics of the merged average rent per square metre.
xiaoqu.loc[:, '平均每平米月租金'].describe()

count    2973.000000
mean       68.406675
std        43.501310
min        11.297822
25%        34.389787
50%        53.482531
75%        96.600556
max       567.715338
Name: 平均每平米月租金, dtype: float64

In [13]:
# Persist the enriched community table without writing the row index.
xiaoqu.to_csv(path_or_buf='/home/mw/project/xiaoqu_rent.csv', index=False)