## 欢迎进入 ModelWhale Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把将项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [37]:
# 查看个人持久化工作区文件
import pandas as pd
import re
import cn2an
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sas
import matplotlib.pyplot as plt
import jieba

In [3]:
pip install cn2an

Collecting cn2an
  Downloading cn2an-0.5.23-py3-none-any.whl (224 kB)
     |████████████████████████████████| 224 kB 583 kB/s            
[?25hCollecting proces>=0.1.7
  Downloading proces-0.1.7-py3-none-any.whl (137 kB)
     |████████████████████████████████| 137 kB 4.5 MB/s            
[?25hInstalling collected packages: proces, cn2an
Successfully installed cn2an-0.5.23 proces-0.1.7
Note: you may need to restart the kernel to use updated packages.


In [36]:
pip install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
     |████████████████████████████████| 19.2 MB 719 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314476 sha256=fef23cc3389a0312db52e6c99058f2c832a8da5c4d3561c88c4efa48f6cca0f0
  Stored in directory: /home/mw/.cache/pip/wheels/24/aa/17/5bc7c72e9a37990a9620cc3aad0acad1564dcff6dbc2359de3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
Note: you may need to restart the kernel to use updated packages.


In [39]:
# 查看当前挂载的数据集目录
data=pd.read_csv("/home/mw/input/quant4533/ruc_Class25Q1_train.csv")
data.isnull().sum()

城市          0
区域          0
板块          0
环线      41407
小区名称        0
价格          0
房屋户型      605
所在楼层        0
建筑面积        0
套内面积    58987
房屋朝向        0
建筑结构      605
装修情况      605
梯户比例     1695
配备电梯     8315
别墅类型    83384
交易时间        0
交易权属        0
上次交易    28953
房屋用途        2
房屋年限    29782
产权所属        0
抵押信息    84133
房屋优势    16064
核心卖点    16366
户型介绍    63671
周边配套    34027
交通出行    32437
lon         0
lat         0
年份          0
dtype: int64

In [3]:
#房屋类型处理
def house_type(x):
    pattern_1=r"(\d+)室(\d+)厅(\d+)厨(\d+)卫"
    pattern_2=r"(\d+)房间(\d+)卫"
    match=re.search(pattern_1,x)
    if match:
        return {"房间个数":match.group(1),"厅个数":match.group(3),"厨房个数":match.group(2),"厕所个数":match.group(4)}
    else:
        match_2=re.search(pattern_2,x)
        if match_2:
            return {"房间个数":match_2.group(1),"厅个数":0,"厨房个数":0,"厕所个数":match_2.group(2)}
        

In [4]:
#楼户比处理
def elevator_ratio(x):
    pattern=r"(\D+)梯(\D+)户"
    match=re.search(pattern,x)
    if match:
        elevators=match.group(1)
        households=match.group(2)
        elevators_num = int(cn2an.cn2an(elevators, "normal"))
        households_num = int(cn2an.cn2an(households, "normal"))
    return elevators_num/households_num

In [5]:
#所在楼层
def floor(x):
    parts = x.replace('(', '').replace(')', '').split()
    floor_type = parts[0]
    return floor_type
def total_floor(x):
    parts = x.replace('(', '').replace(')', '').split()
    total_floor = int(parts[1][1])
    return total_floor

In [6]:
#建筑面积
def area(x):
    pattern=r"(\d+)\.(\d+)㎡"
    match=re.search(pattern,x)
    if match:
        return float(match.group(1))

In [7]:
#时间处理
def safe_date_convert(date_str):
    try:
        return pd.to_datetime(date_str)
    except :
        return pd.Timestamp("2018-04-11")
        

In [8]:
#地点处理
def location(row):
    return str(row["城市"]) + "_" + str(row["区域"]) + "_" + str(row["板块"])


In [9]:
#环线处理
def simplify_ring(x):
    ring={"核心" :["内环内","一至二环","二环内","一环内"],
    "市中心" : [ "二至三环", "三至四环","内环至中环"],
    "近郊": ["内环至外环","四至五环", "五至六环", "中环至外环"],
    "远郊" : ["六环外", "外环外"],
    "其他":["三环外", "四环外", "其他"]}
    for key , item in ring.items():
        if x in ring[key]:
            return key
    

In [10]:
#使用K-mean对经纬度进行聚类
def kmeans(data):
    kmeans = KMeans(n_clusters=10, random_state=42)
    data["location_cluster"] = kmeans.fit_predict(data[["lon", "lat"]])
    data=data.drop(columns=["lon","lat"])
    return data

In [11]:
#模型评估
def evaluate_model(y_true, y_pred, dataset_name="Dataset"):
    print(f"—— {dataset_name} 评估 ——")
    print(f"R² Score: {r2_score(y_true, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print("-" * 30)

In [12]:
#数据预处理
def data_preprocess(data):
    for col in ["房屋优势","核心卖点","户型介绍","周边配套","交通出行"]:
        data[col]=data[col].map(lambda x:x if pd.notna(x) else "无")
    data["地址"]=data.apply(lambda x:location(x),axis=1)
    data=data.fillna(method="bfill")
    if data.isnull().sum().sum() !=0 :
        data=data.fillna(method="ffill")
    return data

In [30]:
def text(row):
    return str(row["核心卖点"])+str(row["户型介绍"])+str(row["周边配套"])+str(row["交通出行"])
important_words={"地铁", "公交", "银行", "商场", "超市", "医院", "学校", "公园", "菜市场"}
def feature(row):
    words=jieba.lcut(row["text"])
    return list(set([word for word in words if word in important_words]))
    


In [32]:
def feature_enigeering(data):
    #合并环线
    data["环线"]=data["环线"].map(lambda x :x if pd.notna(x) else "其他")
    data["环线"]=data["环线"].map(lambda x :simplify_ring(x))
    #文本特征处理
    data["text"]=data.apply(lambda x: text(x),axis=1)
    data["feature"]=data.apply(lambda x :feature(x),axis=1)
    for col in important_words:
        data[col] = data["feature"].apply(lambda x: 1 if col in x else 0)
        data=pd.get_dummies(data, columns=[col], drop_first=True) 
    #将城市 区域 板块合成为一个特征
    data["地址"]=data.apply(lambda x:location(x),axis=1)
    #将房屋朝向变为主朝向
    data["房屋朝向"]=data["房屋朝向"].map(lambda x:x.split()[0])
    #将类别变量转化为独热代码
    for col in ["城市","房屋朝向","建筑结构","装修情况","配备电梯","交易权属","房屋用途","房屋年限","产权所属","环线","location_cluster"]:
        data[col]=data[col].map(lambda x:x if pd.notna(x) else "其他")
        data=pd.get_dummies(data, columns=[col], drop_first=True)
    #合并
    #data["房屋用途_公寓"]=data["房屋用途_公寓"]+data["房屋用途_公寓/住宅"]+data["房屋用途_公寓（住宅）"]+data["房屋用途_公寓/公寓"]
    data["楼层"]=data["所在楼层"].map(lambda x:floor(x) if pd.notna(x) else None)
    data=pd.get_dummies(data, columns=["楼层"], drop_first=True)
    data["楼层总数"]=data["所在楼层"].map(lambda x:total_floor(x) if pd.notna(x) else None)
    data["梯户比例"]=data["梯户比例"].map(lambda x:elevator_ratio(x) if pd.notna(x) else None)
    #将房屋户型分为房间 客厅等特征
    house_info=data["房屋户型"].map(lambda x:house_type(x) if pd.notna(x) else None)
    house_info_df = pd.json_normalize(house_info)
    data = pd.concat([data, house_info_df], axis=1)
    data["建筑面积"]=data["建筑面积"].map(lambda x:area(x) if pd.notna(x) else 0 )
    #时间特征的处理，房价段时间内波动不大，只保留交易的年份
    data["交易时间"] = safe_date_convert(data["交易时间"])
    data["上次交易"]=safe_date_convert(data["上次交易"])
    data["交易时间间隔"] = ( data["交易时间"]- data["上次交易"]).dt.days
    data["交易时间"]=data["交易时间"].dt.year
    data=data.fillna(method='bfill')
    #生成频率特征，衡量市场的供给
    place=pd.DataFrame(data.groupby("地址").size())
    data["频率"]=data["地址"].apply(lambda x:place.loc[x,0])
    #删去无用的特征
    data=data.drop(columns=["地址","小区名称","套内面积","别墅类型","年份","房屋户型","所在楼层","上次交易","抵押信息"])
    return data

    

In [33]:
data=data_preprocess(data)

In [40]:
#数据处理和生产特征将重复无用特征删除
data=kmeans(data)
data=feature_enigeering(data)
#data=data.drop(columns=["房屋用途_公寓/住宅","房屋用途_公寓/公寓","房屋用途_公寓（住宅）"])

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.899 seconds.
Prefix dict has been built successfully.


In [41]:
#分为两个样本包含极端值和不包含极端值
houing_outlier=data.copy()
houing_outlier_price=houing_outlier["价格"]
Q1=houing_outlier_price.quantile(0.25)
Q3=houing_outlier_price.quantile(0.75)
IQR=Q3-Q1
upper_bound=Q3+1.5*IQR
lower_bound=Q1-1.5*IQR
houing_outlier=houing_outlier[(houing_outlier["价格"]>=lower_bound)&(houing_outlier["价格"]<=upper_bound)]

In [42]:
#除去价格
house_price=data["价格"]
data=data.drop(columns=["价格"])
house_price_outlier=houing_outlier["价格"]
houing_outlier=houing_outlier.drop(columns=["价格"])


In [52]:
data_1=data.copy()
data_1=data_1.drop(columns=["区域","板块","房屋优势","核心卖点","户型介绍","周边配套","交通出行","text","feature"])

In [44]:
# data__test处理
data_test=pd.read_csv("/home/mw/input/quant4533/ruc_Class25Q1_test.csv")
ID=data_test["ID"]
data_test=data_test.drop(columns="ID")

In [45]:
#数据清理产生特征
data_test=data_preprocess(data_test)
data_test=kmeans(data_test)
data_test=feature_enigeering(data_test)

In [46]:
data_test.isnull().sum()

区域        0
板块        0
建筑面积      3
梯户比例      0
交易时间      0
         ..
厅个数       0
厨房个数      0
厕所个数      0
交易时间间隔    0
频率        0
Length: 98, dtype: int64

In [47]:
data_test=data_test.fillna(data_test.mean())

  """Entry point for launching an IPython kernel.


In [53]:
X=data_1
y=house_price
X_train, X_test_1, y_train, y_test_1 = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
#匹配data_train和data_test的特征
X_test=data_test
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]


In [56]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. 训练回归模型：Ridge, Lasso, ElasticNet

# Ridge 回归
#ridge = Ridge(alpha=1.0)  # alpha 是正则化的强度
#ridge.fit(X_train_scaled, y_train)
#ridge_pred = ridge.predict(X_test_scaled)

In [30]:
# Lasso 回归
lasso = Lasso(alpha=0.1)  # alpha 是正则化的强度
lasso.fit(X_train_scaled, y_train)
lasso_pred = lasso.predict(X_test_scaled)

In [31]:
print(lasso_pred)


In [57]:
# ElasticNet 回归
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # alpha 和 l1_ratio 控制正则化
elastic_net.fit(X_train_scaled, y_train)
elastic_net_pred = elastic_net.predict(X_test_scaled)

In [59]:
print(elastic_net_pred)
y_test_pred=pd.Series(elastic_net_pred,name="price")
y_test_pred=pd.DataFrame(elastic_net_pred)
y_test_pred.to_csv("elastic_net_pred_1.csv")

[6053135.70956078 4876213.51493572 2223279.99816117 ... 3076339.66534006
 2856269.70177657 2635621.00824414]
