In [29]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, FunctionTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

# Load datasets 
train = pd.read_csv("train.csv")
details = pd.read_csv("details.csv")
rent = pd.read_csv("rent.csv")
test_df = pd.read_csv("test.csv")

# Preprocess 'details' dataset
details = details.drop(["区县", "城市", "板块", "环线位置", "小区地址", "建筑结构", "物业办公电话", "coord_x", "coord_y"], axis=1)
details = details.drop_duplicates(subset="名称", keep="first")

# Merge details with train and test data
train = pd.merge(train, details, left_on="小区名称", right_on="名称", how="left")
test_df = pd.merge(test_df, details, left_on="小区名称", right_on="名称", how="left")

In [30]:
# Clean area-related columns by removing '㎡' and convert to float
area_cols = ["建筑面积", "套内面积"]
for df in [train, test_df]:
    for col in area_cols:
        df[col] = df[col].str.replace('㎡', '', regex=False).astype(float)

# Regex patterns for feature extraction
PATTERNS = {
    "room": r'(\d+)室',
    "living": r'(\d+)厅',
    "kitchen": r'(\d+)厨',
    "bath": r'(\d+)卫'
}

def extract_number(pattern, text):
    """Extract first numeric value from text using regex pattern."""
    if isinstance(text, str) and re.search(pattern, text):
        return int(re.search(pattern, text).group(1))
    return None

def safe_extract_digits(x):
    """Safely extract first numeric sequence from mixed-type data."""
    if pd.isna(x):
        return None
    matches = re.findall(r'\d+', str(x))
    return matches[0] if matches else None

def process_dataframe(df):
    """Process dataframe to create features from raw text columns."""
    # Extract room features
    for feature, pattern in PATTERNS.items():
        df[feature] = df["房屋户型"].apply(lambda x: extract_number(pattern, x))
    
    # Extract directional features
    directions = ["东", "南", "西", "北", "东南", "西北", "东北", "西南"]
    for direction in directions:
        df[direction] = df["房屋朝向"].apply(
            lambda x: direction in x.split() if isinstance(x, str) else False
        )
    
    # Extract floor-related features
    df["high"] = df["所在楼层"].str.contains("高").astype(int)
    df["low"] = df["所在楼层"].str.contains("低").astype(int)
    df["rou"] = df["所在楼层"].apply(safe_extract_digits)
    
    # Count special characters in text columns
    text_columns = {
        "good": ("房屋优势", "、"),
        "point": ("核心卖点", "，"),
        "intro": ("户型介绍", "，"),
        "near": ("周边配套", "，")
    }
    for feat, (col, sep) in text_columns.items():
        df[feat] = df[col].str.count(sep).fillna(0)
    
    # Extract numeric features from various columns
    numeric_features = {
        "year": "建筑年代",
        "house": "房屋总数",
        "building": "楼栋总数",
        "green": "绿 化 率",
        "volume": "容 积 率",
        "manage": "物 业 费",
        "gas": "燃气费",
        "heat": "供热费",
        "park": "停车位",
        "parkmoney": "停车费用"
    }
    for new_col, orig_col in numeric_features.items():
        df[new_col] = df[orig_col].apply(safe_extract_digits)
    df["road"] = df["交通出行"].str.count("线") + df["交通出行"].str.count("路")
    return df

# Process both train and test data 
train = process_dataframe(train)
test_df = process_dataframe(test_df)

# Define column categories 
categorical_cols = ["城市", "区域", "板块", "环线", "小区名称", "建筑结构", "装修情况", "梯户比例", "配备电梯", "别墅类型", "交易权属", 
"房屋用途", "房屋年限", "产权所属", "年份", "high", "low", "东", "西", "南", "北", "东南", "西北", "东北", "西南", "物业类别", "开发商", "物业公司", 
"产权描述", "供水", "供暖", "供电"]
numerical_cols = ["建筑面积", "套内面积", "lon", "lat", "good", "point", "intro", "near", "rou", "room", "living", "kitchen", "bath", 
"year", "house", "building", "green", "volume", "manage", "gas", "heat", "park", "parkmoney"] 

In [31]:
# Preprocessing pipeline components
log_transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('outlier', FunctionTransformer(
        lambda x: np.clip(x, np.quantile(x, 0.05), np.quantile(x, 0.95)),
        validate=True 
    )),
    ('log_transform', log_transformer),
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Model configuration using factory pattern
def create_model(estimator, use_selector=False):
    """Factory function to create pipeline with optional feature selector."""
    steps = [('preprocessor', preprocessor)]
    if use_selector:
        steps.append(('selector', SelectKBest(f_regression, k=50)))
    steps.append(('regressor', estimator))
    return Pipeline(steps)

# Model parameters and configurations
MODEL_CONFIG = {
    'OLS': {
        'estimator': LinearRegression(),
        'use_selector': True,
        'params': {}
    },
    'Lasso': {
        'estimator': Lasso(max_iter=10000),
        'params': {'regressor__alpha': [0.01, 0.1, 1.0, 10.0]}
    },
    'Ridge': {
        'estimator': Ridge(),
        'params': {'regressor__alpha': [0.1, 1.0, 10.0, 100.0]}
    },
    'ElasticNet': {
        'estimator': ElasticNet(),
        'params': {
            'regressor__alpha': [0.01, 0.1, 1.0],
            'regressor__l1_ratio': [0.3, 0.5, 0.7]
        }
    }
}


In [32]:
# Initialize models
models = {}
for name, config in MODEL_CONFIG.items():
    models[name] = create_model(config['estimator'], config.get('use_selector', False))

# Prepare data
X = train.drop(columns=['价格'])
y = np.log1p(train['价格'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

In [13]:
# Hyperparameter tuning
optimized_models = {}
for name in ['Lasso', 'Ridge', 'ElasticNet']:
    grid_search = GridSearchCV(
        estimator=models[name],
        param_grid=MODEL_CONFIG[name]['params'],
        cv=KFold(n_splits=6),
        scoring='r2',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    optimized_models[name] = grid_search.best_estimator_
    print(f"{name} Best params: {grid_search.best_params_}")
    print(f"{name} Best CV R²: {grid_search.best_score_:.4f}\n")

Lasso Best params: {'regressor__alpha': 0.01}
Lasso Best CV R²: 0.8418

Ridge Best params: {'regressor__alpha': 1.0}
Ridge Best CV R²: 0.9764

ElasticNet Best params: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.3}
ElasticNet Best CV R²: 0.8767



In [35]:
# Update models with optimized versions
models.update(optimized_models)

def rmse_expm1(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return np.sqrt(mean_squared_error(y_true, y_pred))

def mae_expm1(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return mean_absolute_error(y_true, y_pred)

def r2_expm1(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return r2_score(y_true, y_pred)

scoring = {
    'r2': make_scorer(r2_expm1),
    'mae': make_scorer(mae_expm1),
    'rmse': make_scorer(rmse_expm1)
}

# Model evaluation
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    cv_results = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=KFold(n_splits=6),
        scoring=scoring,
        n_jobs=-1
    )
    

    metrics = {
        'Model': name,
        'MAE_train': mean_absolute_error(np.expm1(y_train), np.expm1(y_pred_train)),
        'RMSE_train': np.sqrt(mean_squared_error(np.expm1(y_train), np.expm1(y_pred_train))),
        'MAE_test': mean_absolute_error(np.expm1(y_test), np.expm1(y_pred_test)),
        'RMSE_test': np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred_test))),
        'R2_train': r2_expm1(y_train, y_pred_train),
        'R2_test': r2_expm1(y_test, y_pred_test), 
        'CV_R2': cv_results['test_r2'].mean(),
        'CV_MAE': cv_results['test_mae'].mean(),
        'CV_RMSE': cv_results['test_rmse'].mean()
    }
    results.append(metrics)

In [36]:
# Generate submission files
for model_name, model in models.items():
    model.fit(X, y)  # Retrain on full dataset
    test_df['Price'] = np.expm1(model.predict(test_df))
    test_df[['ID', 'Price']].to_csv(f'{model_name}_submission.csv', index=False)

In [37]:
# Convert results to DataFrame and print
results_df = pd.DataFrame(results)
print("\n模型评估结果:")
print(results_df.to_markdown(index=False, floatfmt=".4f"))


模型评估结果:
| Model      |   MAE_train |   RMSE_train |    MAE_test |    RMSE_test |   R2_train |   R2_test |   CV_R2 |      CV_MAE |      CV_RMSE |
|:-----------|------------:|-------------:|------------:|-------------:|-----------:|----------:|--------:|------------:|-------------:|
| OLS        | 562316.7655 | 1361239.8117 | 567489.9240 | 1532380.3646 |     0.7347 |    0.6602 |  0.7317 | 562449.3156 | 1365113.4321 |
| Lasso      | 590177.7473 | 1363015.4616 | 581091.9118 | 1359713.4268 |     0.7340 |    0.7325 |  0.7309 | 591451.4095 | 1366505.6051 |
| Ridge      | 168409.1414 |  464372.8031 | 183631.9274 |  538360.4891 |     0.9691 |    0.9581 |  0.9541 | 184816.4727 |  564927.3239 |
| ElasticNet | 505804.4070 | 1258576.6184 | 498960.2034 | 1229800.0277 |     0.7732 |    0.7812 |  0.7690 | 507701.3396 | 1263137.1575 |
