# 数据清洗部分

In [7]:
import pandas as pd
import numpy as np
import re

# Load the raw training and test sets from hard-coded absolute Windows paths.
data = pd.read_csv("D:/RUCer/大三下/ai_python/2022202777/期中/原始数据/ruc_Class25Q1_train.csv")
data_test = pd.read_csv("D:/RUCer/大三下/ai_python/2022202777/期中/原始数据/ruc_Class25Q1_test.csv")

# 清洗数据
## 户型
def parse_room_layout(layout):
    """Split a room-layout string into room/hall/kitchen/bathroom counts.

    Args:
        layout: Layout text such as ``'3室2厅1厨2卫'``. ``'房间'`` is treated
            as an alias of ``'室'``. Any missing value (NaN, None, pd.NA)
            is accepted.

    Returns:
        pd.Series indexed by ``'室'``, ``'厅'``, ``'厨'``, ``'卫'`` holding the
        integer counts; units not mentioned (or a missing input) yield 0.
    """
    result = {'室': 0, '厅': 0, '厨': 0, '卫': 0}

    # pd.isna covers None / pd.NA as well as float NaN, so every kind of
    # missing value returns the all-zero default instead of raising.
    if pd.isna(layout):
        return pd.Series(result)

    # str() guards against non-string cells before normalising the alias.
    layout = str(layout).replace('房间', '室')

    # Each "<digits><unit>" pair sets that unit's count; a later duplicate
    # of the same unit overrides an earlier one.
    for num, unit in re.findall(r'(\d+)(室|厅|厨|卫)', layout):
        result[unit] = int(num)

    return pd.Series(result)
# Expand the '房屋户型' column into four count columns on both datasets.
data[['室', '厅', '厨', '卫']] = data['房屋户型'].apply(parse_room_layout)
data_test[['室', '厅', '厨', '卫']] = data_test['房屋户型'].apply(parse_room_layout)

## 所在楼层
def parse_floor_layout(floor_str):
    """Derive the relative floor position and a building-height group.

    Args:
        floor_str: Text containing a position word (底/低/中/高/顶, optionally
            followed by 楼) + 层 and a ``(共N层)`` total, or a missing value.

    Returns:
        pd.Series with keys ``'楼层位置'`` (first character of the position
        word, with 顶 mapped to 高) and ``'楼层分组'`` (低层 for at most 6
        storeys, 中层 for 7-15, 高层 above). Both are None when the input
        is missing or does not match the expected pattern.
    """
    if pd.isna(floor_str):
        return pd.Series({'楼层位置': None, '楼层分组': None})

    found = re.search(r'([底低中高顶](?:楼)?层)[^\d]*\(共(\d+)层\)', str(floor_str))
    if found is None:
        return pd.Series({'楼层位置': None, '楼层分组': None})

    label, storeys_text = found.groups()

    # Only the leading character is kept; "top" is folded into "high".
    level = '高' if label[0] == '顶' else label[0]

    try:
        storeys = int(storeys_text)
    except (ValueError, TypeError):
        bucket = None
    else:
        if storeys <= 6:
            bucket = '低层'
        elif storeys <= 15:
            bucket = '中层'
        else:
            bucket = '高层'

    return pd.Series({'楼层位置': level, '楼层分组': bucket})
# Store the parsed floor position and height group as '层高' and '层型'.
# Note the target column names differ from the keys of the returned Series
# ('楼层位置', '楼层分组').
data[['层高','层型']] = data['所在楼层'].apply(parse_floor_layout)
data_test[['层高','层型']] = data_test['所在楼层'].apply(parse_floor_layout)

## 房屋面积
def remove_units(series):
    """Strip everything except digits and dots, then parse as numbers.

    Args:
        series: pandas Series of area-like values (e.g. ``'89.5㎡'``).

    Returns:
        A new numeric Series; entries that are not parseable after stripping
        (including missing values) become NaN. The input is not modified.
    """
    # Work on the string form so that mixed-type cells are handled uniformly.
    digits_only = series.copy().astype(str).str.replace(r'[^\d.]', '', regex=True)
    numeric = pd.to_numeric(digits_only, errors='coerce')
    return numeric
    
# Convert both area columns to numeric values in the training and test sets.
data['建筑面积'] = remove_units(data['建筑面积'])
data['套内面积'] = remove_units(data['套内面积'])
data_test['建筑面积'] = remove_units(data_test['建筑面积'])
data_test['套内面积'] = remove_units(data_test['套内面积'])


## 交通出行
def extract_traffic(series):
    """Classify each transport description as '地铁', '公交' or None.

    Args:
        series: pandas Series of free-text transport descriptions.

    Returns:
        Series of the same length: '地铁' when the text mentions 地铁 or 号线,
        otherwise '公交' when it mentions 公交, otherwise None (also for
        missing values).
    """
    def classify(value):
        if pd.isna(value):
            return None
        description = str(value)
        # Subway keywords take precedence over bus keywords.
        if any(marker in description for marker in ('地铁', '号线')):
            return '地铁'
        return '公交' if '公交' in description else None

    return series.apply(classify)
# Overwrite the free-text '交通出行' column with its transport category.
data['交通出行'] = extract_traffic(data['交通出行'])
data_test['交通出行'] = extract_traffic(data_test['交通出行'])

## 周边配套
def extract_surroundings(series):
    """Flag which facility keywords appear in one surroundings description.

    Args:
        series: A single cell value (despite the name), i.e. the text of one
            row, or a missing value.

    Returns:
        pd.Series indexed by '医院', '学校', '商场', '公园' with 1 where the
        keyword occurs in the text and 0 otherwise (all 0 for missing input).
    """
    facilities = ('医院', '学校', '商场', '公园')

    if pd.isna(series):
        return pd.Series(dict.fromkeys(facilities, 0))

    description = str(series)
    flags = {name: (1 if name in description else 0) for name in facilities}
    return pd.Series(flags)
# Add one 0/1 indicator column per facility keyword found in '周边配套'.
data[['医院','学校','商场','公园']] = data['周边配套'].apply(extract_surroundings)
data_test[['医院','学校','商场','公园']] = data_test['周边配套'].apply(extract_surroundings)

## 梯户比
# Single-character Chinese numerals (plus the "单"/"两" variants) to integers.
_CHINESE_NUM_MAP = {
    '单': 1, '一': 1, '两': 2, '二': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10
}


def _parse_count(token):
    """Convert an Arabic or Chinese numeral token (up to the tens) to an int."""
    if token.isdigit():
        return int(token)
    if '十' in token:
        # "二十五" -> 2*10 + 5, "十二" -> 1*10 + 2, "十" -> 10.
        parts = token.split('十')
        if len(parts) == 2:
            return _CHINESE_NUM_MAP.get(parts[0], 1) * 10 + _CHINESE_NUM_MAP.get(parts[1], 0)
        # More than one "十" is not a well-formed numeral; fall back to 10.
        return 10
    # Unknown tokens fall back to 1.
    return _CHINESE_NUM_MAP.get(token, 1)


def extract_lift_to_house_ratio(text):
    """Parse an "N梯M户" description into the lifts-per-household ratio.

    Both counts may be written with Arabic digits or Chinese numerals. The
    lift count and the household count go through the same parser, so
    compound numerals such as "十二梯" are no longer silently read as 1.

    Args:
        text: Description such as ``'两梯四户'`` or ``'2梯4户'``, or a missing
            value.

    Returns:
        pd.Series with the single key ``'梯户比'`` holding lifts / households,
        or None when the input is missing, does not match the pattern, or the
        household count is 0.
    """
    if pd.isna(text):
        return pd.Series({'梯户比': None})

    pattern = r'([单两一二三四五六七八九十\d]+)梯([单两一二三四五六七八九十\d]+)户'
    match = re.search(pattern, str(text))
    if not match:
        return pd.Series({'梯户比': None})

    lift_str, house_str = match.groups()
    lift = _parse_count(lift_str)
    house = _parse_count(house_str)

    # A household count of 0 would divide by zero; report it as missing.
    ratio = lift / house if house != 0 else None
    return pd.Series({'梯户比': ratio})
# Derive the numeric '梯户比' column from the text column '梯户比例'.
data['梯户比'] = data['梯户比例'].apply(extract_lift_to_house_ratio)
data_test['梯户比'] = data_test['梯户比例'].apply(extract_lift_to_house_ratio)

## 交易时间差
def calculate_transaction_time_diff(df, current_time_col='交易时间', last_time_col='上次交易'):
    """Add the number of days between the current and the previous transaction.

    The frame is modified in place: both date columns are converted to
    datetime (unparseable values become NaT) and a '交易时间差（天）' column is
    added.

    Args:
        df: DataFrame containing both date columns.
        current_time_col: Name of the current transaction date column.
        last_time_col: Name of the previous transaction date column.

    Returns:
        The same DataFrame object. Rows where either date is missing or
        unparseable get -1 as the day difference.
    """
    for date_col in (current_time_col, last_time_col):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    elapsed = df[current_time_col] - df[last_time_col]

    # -1 is the sentinel for "no valid difference" (NaT on either side).
    df['交易时间差（天）'] = elapsed.dt.days.fillna(-1)

    return df

# Add '交易时间差（天）' to both datasets using the default column names.
data = calculate_transaction_time_diff(data)
data_test = calculate_transaction_time_diff(data_test)

# Write the selected raw and derived columns of each dataset to a CSV file
# in the current working directory.
# selected_cols (training set) includes the target '价格';
# selected_cols1 (test set) has 'ID' instead of '价格'.
selected_cols = ['城市','区域','板块','环线','价格','小区名称','建筑面积','套内面积','房屋朝向','建筑结构','装修情况','梯户比例','交易权属','上次交易','房屋用途','房屋年限','产权所属','lon','lat','年份','室','厅','厨','卫','层高','层型','医院','学校','商场','公园','梯户比','交易时间差（天）']
selected_cols1 = ['ID','城市','区域','板块','环线','小区名称','建筑面积','套内面积','房屋朝向','建筑结构','装修情况','梯户比例','交易权属','上次交易','房屋用途','房屋年限','产权所属','lon','lat','年份','室','厅','厨','卫','层高','层型','医院','学校','商场','公园','梯户比','交易时间差（天）']
data[selected_cols].to_csv('train_result.csv', index=False, encoding='utf-8-sig')
data_test[selected_cols1].to_csv('test_result.csv', index=False, encoding='utf-8-sig')

# KNN 部分

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# 计算地理位置评分并保存到训练集
def compute_location_score(file_path, min_k=5, max_k=15, output_file=None):
    """Score every training row by the mean unit price of its nearest neighbours.

    The CSV is loaded, a '单价' (price / floor area) column is added, and for
    each row up to ``max_k`` nearest rows in standardised (lon, lat) space are
    looked up. A neighbour is kept when its distance is at most three times
    the distance of the preceding neighbour; if fewer than ``min_k`` survive,
    the ``min_k`` nearest neighbours are used instead. The mean '单价' of the
    selected rows is stored in '地理位置评分'.

    Args:
        file_path: CSV with at least '价格', '建筑面积', 'lon' and 'lat'.
        min_k: Minimum number of neighbours averaged per row.
        max_k: Maximum number of neighbours queried per row.
        output_file: Where to write the augmented CSV. When omitted the
            input file is overwritten.

    Returns:
        Tuple ``(df, scaler)``: the augmented DataFrame and the StandardScaler
        fitted on the training coordinates.

    Raises:
        ValueError: If '价格' or '建筑面积' is missing from the CSV.
    """
    df = pd.read_csv(file_path, encoding='utf-8')

    if '价格' not in df.columns or '建筑面积' not in df.columns:
        raise ValueError("CSV 文件中未找到 '价格' 或 '建筑面积' 列")

    # Unit price is the quantity averaged over the neighbourhood.
    df["单价"] = df["价格"] / df["建筑面积"]
    unit_prices = df["单价"].to_numpy()

    # Standardise the coordinates so both axes weigh equally in the distance.
    scaler = StandardScaler()
    locations_scaled = scaler.fit_transform(df[['lon', 'lat']].values)

    # Never ask for more neighbours than there are rows.
    n_neighbors = min(max_k, len(df))
    knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree')
    knn.fit(locations_scaled)
    # kneighbors returns each row's neighbours in ascending distance order.
    # NOTE(review): the query points are the fitted points themselves, so a
    # row's own unit price normally takes part in its score — confirm this is
    # intended, since the score is later used as a feature for '价格'.
    distances, indices = knn.kneighbors(locations_scaled)

    location_scores = []
    for row_distances, row_indices in zip(distances, indices):
        # Keep the nearest neighbour unconditionally; keep each further one
        # only if its distance is within 3x the previous neighbour's.
        keep = np.ones(len(row_distances), dtype=bool)
        keep[1:] = row_distances[1:] <= 3 * row_distances[:-1]
        selected = row_indices[keep]

        # Enforce the lower bound: slicing the filtered list could never
        # lengthen it, so fall back to the min_k nearest neighbours.
        if len(selected) < min_k:
            selected = row_indices[:min_k]

        location_scores.append(np.mean(unit_prices[selected]))

    df["地理位置评分"] = location_scores
    print(location_scores[:10])  # Preview of the first ten scores.

    # Persist the augmented training data (overwrites the input by default).
    output_path = output_file if output_file else file_path
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"地理位置评分已保存到 {output_path}")

    return df, scaler



# Compute location scores for the training set. No output_file is passed,
# so the augmented data is written back to the same train_result.csv.
train_df, scaler = compute_location_score("D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/train_result.csv", min_k=5, max_k=15)

[110168.72979172092, 36242.90444135293, 54595.93145850888, 69129.24425341187, 54192.1400007641, 24204.415417595897, 48889.136093334484, 60554.00302261476, 34165.85776974518, 88644.33533334543]
地理位置评分已保存到 D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/train_result.csv


In [9]:
# 为测试集计算地理位置评分
def compute_location_score_for_test(test_file, train_df, scaler, min_k=5, max_k=15, output_file=None):
    """Assign each test row the mean location score of its nearest training rows.

    Args:
        test_file: CSV of the test set; must contain 'lon' and 'lat'.
        train_df: Training DataFrame with 'lon', 'lat' and '地理位置评分'.
        scaler: Scaler already fitted on the training coordinates; it is
            applied to both the training and the test coordinates.
        min_k: Accepted but not used by this function.
        max_k: Number of nearest training rows averaged per test row.
        output_file: Where to write the augmented CSV. When omitted the
            test file is overwritten.

    Returns:
        The test DataFrame with a new '地理位置评分' column.

    Raises:
        ValueError: If 'lon' or 'lat' is missing from the test CSV.
    """
    test_df = pd.read_csv(test_file, encoding='utf-8')

    if not {'lon', 'lat'}.issubset(test_df.columns):
        raise ValueError("CSV 文件中未找到 'lon' 或 'lat' 列")

    # Both coordinate sets go through the training scaler so that distances
    # are measured in the same space.
    scaled_test_xy = scaler.transform(test_df[['lon', 'lat']].values)
    scaled_train_xy = scaler.transform(train_df[['lon', 'lat']].values)
    train_scores = train_df['地理位置评分'].values

    finder = NearestNeighbors(n_neighbors=max_k, algorithm='ball_tree')
    finder.fit(scaled_train_xy)
    distances, indices = finder.kneighbors(scaled_test_xy)

    # One score per test row: the plain mean over its max_k training neighbours.
    location_scores = [np.mean(train_scores[neighbor_ids]) for neighbor_ids in indices]

    test_df["地理位置评分"] = location_scores
    print(location_scores[:10])  # Preview of the first ten scores.

    output_path = output_file if output_file else test_file
    test_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"测试集地理位置评分已保存到 {output_path}")

    return test_df

# Compute location scores for the test set from the scored training data.
# No output_file is passed, so test_result.csv is overwritten in place.
compute_location_score_for_test("D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/test_result.csv", train_df, scaler, min_k=5, max_k=15)

[90660.11918400704, 64102.54467695355, 33483.59907514992, 23764.768851361863, 101977.39560606776, 50085.00168972745, 69774.88232587205, 30642.622168863843, 24951.564037713393, 22791.89553319843]
测试集地理位置评分已保存到 D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/test_result.csv


Unnamed: 0,ID,城市,区域,板块,环线,小区名称,建筑面积,套内面积,房屋朝向,建筑结构,...,卫,层高,层型,医院,学校,商场,公园,梯户比,交易时间差（天）,地理位置评分
0,0,0,45,416,四至五环,泛海容郡,209.20,165.71,南 北,钢混结构,...,3,中,高层,0,0,0,1,1.000000,1889.0,90660.119184
1,1,0,45,414,四至五环,慧谷金色家园,163.69,,南 北,钢混结构,...,2,低,中层,0,0,0,0,0.500000,7708.0,64102.544677
2,2,0,43,289,五至六环,天通苑中苑,102.92,,西,钢混结构,...,1,中,中层,1,0,1,1,0.333333,6554.0,33483.599075
3,3,0,39,374,,富乐小区南里,109.66,,南 北,混合结构,...,2,中,低层,0,0,0,0,0.500000,7344.0,23764.768851
4,4,0,79,724,二至三环,百万庄未区,57.20,,南 北,混合结构,...,1,底,低层,1,0,1,1,0.500000,4100.0,101977.395606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14781,14781,6,71,244,,横巷市委家属院南区,65.78,,南 北,混合结构,...,1,中,中层,1,0,0,0,,1910.0,8264.839963
14782,14782,6,91,723,,润天小区,138.01,,南 北,砖混结构,...,2,高,低层,0,0,0,0,,-1.0,5071.761225
14783,14783,6,83,380,三环外,御湖蓝湾,110.00,,南 北,未知结构,...,2,中,高层,0,0,0,0,,-1.0,11464.411176
14784,14784,6,83,380,三环外,御湖蓝湾,136.00,,南 北,未知结构,...,2,高,中层,0,0,0,0,,-1.0,11464.411176


# 回归部分

In [None]:
# Ridge回归模型
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error

def process_data(file_path, scaler=None):
    """Load a result CSV and build the design matrix for the regression.

    Numeric features are mean-imputed and standardised; object-typed features
    are one-hot encoded.

    Args:
        file_path: CSV produced by the cleaning / KNN steps.
        scaler: Optional StandardScaler previously returned by this function
            for the training file. When given, the data is imputed with the
            training means and transformed with that scaler instead of
            fitting new statistics, and the dummies are generated WITHOUT
            dropping the first level so the caller can align them to the
            training columns with ``reindex``.

    Returns:
        Tuple ``(X_processed, Y, scaler, categorical_cols)``; ``Y`` is None
        when the file has no '价格' column.
    """
    df = pd.read_csv(file_path)
    Y = df['价格'] if '价格' in df.columns else None

    feature_cols = ['地理位置评分', '建筑面积', '层高', '层型', '建筑结构', '装修情况', '房屋用途', '房屋年限', '梯户比']
    X = df.filter(items=feature_cols)

    if scaler is None:
        # Fit mode: learn imputation means and scaling from this file.
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
        numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
        X_numerical = X[numerical_cols].fillna(X[numerical_cols].mean())
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X_numerical)
        X_categorical = pd.get_dummies(X[categorical_cols], drop_first=True)
    else:
        # Transform mode: reuse the statistics learned on the training file.
        numerical_cols = list(scaler.feature_names_in_)
        categorical_cols = [
            col for col in X.select_dtypes(include=['object']).columns
            if col not in numerical_cols
        ]
        # Mean imputation leaves the column mean unchanged, so scaler.mean_
        # equals the training fill value.
        train_means = pd.Series(scaler.mean_, index=numerical_cols)
        X_numerical = (
            X.reindex(columns=numerical_cols)
            .apply(pd.to_numeric, errors='coerce')
            .fillna(train_means)
        )
        scaled = scaler.transform(X_numerical)
        # Keep every level: dropping "the first" here could drop a different
        # baseline than on the training data and mis-encode rows.
        X_categorical = pd.get_dummies(X[categorical_cols])

    # Reuse X's index so the concat below aligns row by row.
    X_numerical_scaled = pd.DataFrame(scaled, columns=numerical_cols, index=X.index)
    X_processed = pd.concat([X_numerical_scaled, X_categorical], axis=1)

    return X_processed, Y, scaler, categorical_cols

def train_and_predict(train_file, test_file, output_file, alpha=1.0):
    """Fit a Ridge model, print MAE diagnostics and write test predictions.

    Args:
        train_file: Training CSV (must contain '价格').
        test_file: Test CSV to predict.
        output_file: Path of the CSV that receives the test rows plus a
            '预测价格' column.
        alpha: Ridge regularisation strength.
    """
    X, Y, scaler, categorical_cols = process_data(train_file)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=111)

    model = Ridge(alpha=alpha)
    model.fit(X_train, Y_train)

    in_sample_mae = mean_absolute_error(Y_train, model.predict(X_train))
    out_sample_mae = mean_absolute_error(Y_test, model.predict(X_test))

    # 5-fold cross-validation on the full training data (scored with MAE).
    cv_scores = cross_val_score(model, X, Y, cv=5, scoring='neg_mean_absolute_error')
    cv_mae = -np.mean(cv_scores)

    print(f"Ridge 回归评估:")
    print(f"  In-sample MAE: {in_sample_mae:.4f}")
    print(f"  Out-of-sample MAE: {out_sample_mae:.4f}")
    print(f"  Cross-validation MAE: {cv_mae:.4f}")

    # Preprocess the test set with the TRAINING scaler, then align its
    # columns to the training design matrix (unseen levels are dropped,
    # missing ones become 0).
    X_test_data, _, _, _ = process_data(test_file, scaler=scaler)
    X_test_data = X_test_data.reindex(columns=X.columns, fill_value=0)
    predictions = model.predict(X_test_data)

    df_test = pd.read_csv(test_file)
    df_test['预测价格'] = predictions
    df_test.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"预测结果已保存: {output_file}")

# Run the Ridge pipeline on one training file and one test file
# (hard-coded absolute paths), with alpha=0.5.
train_file = "D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/train_result.csv"
test_file = "D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/test_result.csv"
output_file = "D:/RUCer/大三下/ai_python/2022202777/期中/预测结果Ridge.csv"

train_and_predict(train_file, test_file, output_file, alpha=0.5)


Ridge 回归评估:
  In-sample MSE: 578034.8138
  Out-of-sample MSE: 582998.0466
  Cross-validation MSE: 716019.2134
预测结果已保存: D:/RUCer/大三下/ai_python/2022202777/期中/预测结果Ridge.csv


In [None]:
# OLS回归模型
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error

def process_data(file_path, scaler=None):
    """Load a result CSV and build the design matrix for the regression.

    Numeric features are mean-imputed and standardised; object-typed features
    are one-hot encoded.

    Args:
        file_path: CSV produced by the cleaning / KNN steps.
        scaler: Optional StandardScaler previously returned by this function
            for the training file. When given, the data is imputed with the
            training means and transformed with that scaler instead of
            fitting new statistics, and the dummies are generated WITHOUT
            dropping the first level so the caller can align them to the
            training columns with ``reindex``.

    Returns:
        Tuple ``(X_processed, Y, scaler, categorical_cols)``; ``Y`` is None
        when the file has no '价格' column.
    """
    df = pd.read_csv(file_path)
    Y = df['价格'] if '价格' in df.columns else None

    feature_cols = ['地理位置评分', '建筑面积', '层高', '层型', '建筑结构', '装修情况', '房屋用途', '房屋年限', '梯户比']
    X = df.filter(items=feature_cols)

    if scaler is None:
        # Fit mode: learn imputation means and scaling from this file.
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
        numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
        X_numerical = X[numerical_cols].fillna(X[numerical_cols].mean())
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X_numerical)
        X_categorical = pd.get_dummies(X[categorical_cols], drop_first=True)
    else:
        # Transform mode: reuse the statistics learned on the training file.
        numerical_cols = list(scaler.feature_names_in_)
        categorical_cols = [
            col for col in X.select_dtypes(include=['object']).columns
            if col not in numerical_cols
        ]
        # Mean imputation leaves the column mean unchanged, so scaler.mean_
        # equals the training fill value.
        train_means = pd.Series(scaler.mean_, index=numerical_cols)
        X_numerical = (
            X.reindex(columns=numerical_cols)
            .apply(pd.to_numeric, errors='coerce')
            .fillna(train_means)
        )
        scaled = scaler.transform(X_numerical)
        # Keep every level: dropping "the first" here could drop a different
        # baseline than on the training data and mis-encode rows.
        X_categorical = pd.get_dummies(X[categorical_cols])

    # Reuse X's index so the concat below aligns row by row.
    X_numerical_scaled = pd.DataFrame(scaled, columns=numerical_cols, index=X.index)
    X_processed = pd.concat([X_numerical_scaled, X_categorical], axis=1)

    return X_processed, Y, scaler, categorical_cols

def train_and_predict(train_file, test_file, output_file):
    """Fit an OLS model, print MAE diagnostics and write test predictions.

    Args:
        train_file: Training CSV (must contain '价格').
        test_file: Test CSV to predict.
        output_file: Path of the CSV that receives the test rows plus a
            '预测价格' column.
    """
    X, Y, scaler, categorical_cols = process_data(train_file)
    # Dummy columns may be boolean; use one float dtype for fit and predict.
    X = X.astype(float)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=111)

    # has_constant='add' always inserts the intercept column. The default
    # ('skip') omits it when the frame already holds a constant non-zero
    # column, which would leave the design matrices with different widths.
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    model = sm.OLS(Y_train, X_train).fit()

    in_sample_mae = mean_absolute_error(Y_train, model.predict(X_train))
    out_sample_mae = mean_absolute_error(Y_test, model.predict(X_test))

    # Cross-validation uses sklearn's equivalent estimator, because
    # cross_val_score needs the sklearn estimator interface.
    lr_model = LinearRegression()
    cv_scores = cross_val_score(lr_model, X, Y, cv=5, scoring='neg_mean_absolute_error')
    cv_mae = -np.mean(cv_scores)

    print(f"OLS 回归评估:")
    print(f"  In-sample MAE: {in_sample_mae:.4f}")
    print(f"  Out-of-sample MAE: {out_sample_mae:.4f}")
    print(f"  Cross-validation MAE: {cv_mae:.4f}")

    # Preprocess the test set with the TRAINING scaler, then align its
    # columns to the training design matrix (unseen levels are dropped,
    # missing ones become 0).
    X_test_data, _, _, _ = process_data(test_file, scaler=scaler)
    X_test_data = X_test_data.reindex(columns=X.columns, fill_value=0).astype(float)
    X_test_data = sm.add_constant(X_test_data, has_constant='add')
    predictions = model.predict(X_test_data)

    df_test = pd.read_csv(test_file)
    df_test['预测价格'] = predictions
    df_test.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"预测结果已保存: {output_file}")

# Run the OLS pipeline on one training file and one test file
# (hard-coded absolute paths).
train_file = "D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/train_result.csv"
test_file = "D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/test_result.csv"
output_file = "D:/RUCer/大三下/ai_python/2022202777/期中/预测结果OLS.csv"

train_and_predict(train_file, test_file, output_file)


OLS 回归评估:
  In-sample MSE: 578031.2948
  Out-of-sample MSE: 582982.7372
  Cross-validation MAE: 74289589913767472.0000
                            OLS Regression Results                            
Dep. Variable:                     价格   R-squared:                       0.676
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                     3797.
Date:                Sat, 05 Apr 2025   Prob (F-statistic):               0.00
Time:                        17:01:22   Log-Likelihood:            -1.0528e+06
No. Observations:               67306   AIC:                         2.106e+06
Df Residuals:                   67268   BIC:                         2.106e+06
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [11]:
# Lasso回归模型
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error

def process_data(file_path, scaler=None):
    """Load a result CSV and build the design matrix for the regression.

    Numeric features are mean-imputed and standardised; object-typed features
    are one-hot encoded.

    Args:
        file_path: CSV produced by the cleaning / KNN steps.
        scaler: Optional StandardScaler previously returned by this function
            for the training file. When given, the data is imputed with the
            training means and transformed with that scaler instead of
            fitting new statistics, and the dummies are generated WITHOUT
            dropping the first level so the caller can align them to the
            training columns with ``reindex``.

    Returns:
        Tuple ``(X_processed, Y, scaler, categorical_cols)``; ``Y`` is None
        when the file has no '价格' column.
    """
    df = pd.read_csv(file_path)
    Y = df['价格'] if '价格' in df.columns else None

    feature_cols = ['地理位置评分', '建筑面积', '层高', '层型', '建筑结构', '装修情况', '房屋用途', '房屋年限', '梯户比']
    X = df.filter(items=feature_cols)

    if scaler is None:
        # Fit mode: learn imputation means and scaling from this file.
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
        numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
        X_numerical = X[numerical_cols].fillna(X[numerical_cols].mean())
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X_numerical)
        X_categorical = pd.get_dummies(X[categorical_cols], drop_first=True)
    else:
        # Transform mode: reuse the statistics learned on the training file.
        numerical_cols = list(scaler.feature_names_in_)
        categorical_cols = [
            col for col in X.select_dtypes(include=['object']).columns
            if col not in numerical_cols
        ]
        # Mean imputation leaves the column mean unchanged, so scaler.mean_
        # equals the training fill value.
        train_means = pd.Series(scaler.mean_, index=numerical_cols)
        X_numerical = (
            X.reindex(columns=numerical_cols)
            .apply(pd.to_numeric, errors='coerce')
            .fillna(train_means)
        )
        scaled = scaler.transform(X_numerical)
        # Keep every level: dropping "the first" here could drop a different
        # baseline than on the training data and mis-encode rows.
        X_categorical = pd.get_dummies(X[categorical_cols])

    # Reuse X's index so the concat below aligns row by row.
    X_numerical_scaled = pd.DataFrame(scaled, columns=numerical_cols, index=X.index)
    X_processed = pd.concat([X_numerical_scaled, X_categorical], axis=1)

    return X_processed, Y, scaler, categorical_cols

def train_and_predict(train_file, test_file, output_file, alpha=1.0):
    """Fit a Lasso model, print MAE diagnostics and write test predictions.

    Args:
        train_file: Training CSV (must contain '价格').
        test_file: Test CSV to predict.
        output_file: Path of the CSV that receives the test rows plus a
            '预测价格' column.
        alpha: Lasso regularisation strength.
    """
    X, Y, scaler, categorical_cols = process_data(train_file)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=111)

    model = Lasso(alpha=alpha)
    model.fit(X_train, Y_train)

    in_sample_mae = mean_absolute_error(Y_train, model.predict(X_train))
    out_sample_mae = mean_absolute_error(Y_test, model.predict(X_test))

    # 5-fold cross-validation on the full training data (scored with MAE).
    cv_scores = cross_val_score(model, X, Y, cv=5, scoring='neg_mean_absolute_error')
    cv_mae = -np.mean(cv_scores)

    print(f"Lasso 回归评估（使用 MAE）:")
    print(f"  In-sample MAE: {in_sample_mae:.4f}")
    print(f"  Out-of-sample MAE: {out_sample_mae:.4f}")
    print(f"  Cross-validation MAE: {cv_mae:.4f}")

    # Preprocess the test set with the TRAINING scaler, then align its
    # columns to the training design matrix (unseen levels are dropped,
    # missing ones become 0).
    X_test_data, _, _, _ = process_data(test_file, scaler=scaler)
    X_test_data = X_test_data.reindex(columns=X.columns, fill_value=0)
    predictions = model.predict(X_test_data)

    df_test = pd.read_csv(test_file)
    df_test['预测价格'] = predictions
    df_test.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"预测结果已保存: {output_file}")

# Path configuration (hard-coded absolute paths) and Lasso run with alpha=0.5.
train_file = "D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/train_result.csv"
test_file = "D:/RUCer/大三下/ai_python/2022202777/期中/期中作业/test_result.csv"
output_file = "D:/RUCer/大三下/ai_python/2022202777/期中/预测结果Lasso_MAE.csv"

train_and_predict(train_file, test_file, output_file, alpha=0.5)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso 回归评估（使用 MAE）:
  In-sample MAE: 578030.7019
  Out-of-sample MAE: 582983.5198
  Cross-validation MAE: 716298.5087
预测结果已保存: D:/RUCer/大三下/ai_python/2022202777/期中/预测结果Lasso_MAE.csv
