In [None]:
import os

# Show the working directory; the relative paths 'train.csv' / 'test.csv' read
# later in this notebook are resolved against it.
current_path = os.getcwd()
print("当前工作路径:", current_path)

import numpy as np
import pandas as pd
import re
import cn2an

# Regexes used by preprocess() to pull per-type counts out of the 'frame'
# (house layout) text. Each key becomes an integer column; each pattern
# captures the digits directly in front of the given Chinese suffix.
frame_patterns = {
    'room': r'(\d+)室',     # digits before "室"
    'hall': r'(\d+)厅',      # digits before "厅"
    'kitchen': r'(\d+)厨',   # digits before "厨"
    'bathroom': r'(\d+)卫',  # digits before "卫"
    'apartment': r'(\d+)房间'  # digits before "房间"
}

def get_lift_ratio(s):
    """Parse an "X梯Y户" description into the ratio X / Y.

    The two captured fragments are converted to numbers with
    ``cn2an.cn2an(..., 'normal')``.

    Args:
        s: Raw lift/household text, or a missing value.

    Returns:
        The lift count divided by the household count, or ``None`` when the
        value is missing, the text does not match the pattern, or the
        conversion/division raises (the error is printed in that case).
    """
    if pd.isna(s):
        return None

    found = re.search(r'([^梯]+)梯([^户]+)户', s)
    if found is None:
        return None

    lifts_text, households_text = found.group(1), found.group(2)
    try:
        lifts = cn2an.cn2an(lifts_text, 'normal')
        households = cn2an.cn2an(households_text, 'normal')
        return lifts / households
    except Exception as e:
        # Best effort: report the offending value and treat it as missing.
        print(f"转换错误：{e}")
        return None
    
def get_relative_height(s):
    """Return the first run of characters that contains no ASCII parenthesis.

    Args:
        s: Raw floor text, or a missing value.

    Returns:
        The stripped first parenthesis-free run, or ``None`` when the value
        is missing or no such run exists.
    """
    if pd.isna(s):
        return None
    found = re.search(r'([^()]+)', s)
    return found.group(1).strip() if found else None

def get_total_floor(s):
    """Extract N from a "(共N层)" fragment of the floor text.

    Args:
        s: Raw floor text, or a missing value.

    Returns:
        N as an ``int``, or ``None`` when the value is missing or the text
        has no "(共N层)" fragment.
    """
    if pd.isna(s):
        return None
    found = re.search(r'\s*\((共(\d+)层)\)', s)
    if found is None:
        return None
    # Group 2 holds the digits; group 1 is the whole "共N层" text.
    return int(found.group(2))
    
# Chinese compass token -> English feature name. The values double as the
# column names of the orientation indicator columns.
direction_mapping = {
    '东': 'east',
    '西': 'west',
    '南': 'south',
    '北': 'north',
    '东南': 'south_east',
    '东北': 'north_east',
    '西南': 'south_west',
    '西北': 'north_west'
}

def process_directions(direction_str):
    """Translate a whitespace-separated orientation string to English names.

    Args:
        direction_str: Text such as ``'南 北'``. Missing values (NaN/None) or
            any other non-string input are treated as "no orientation".

    Returns:
        A list of English direction names in input order. Tokens that are not
        keys of ``direction_mapping`` are skipped. Returns ``[]`` for
        missing or non-string input.
    """
    # A NaN cell is a float and has no .split(); return an empty list instead
    # of raising AttributeError.
    if not isinstance(direction_str, str):
        return []
    return [
        direction_mapping[token]
        for token in direction_str.split()
        if token in direction_mapping
    ]
    

In [None]:
# Load the raw train / test tables from the current working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

def preprocess(df):
    """Clean one raw listing table and derive the model's base features.

    The input frame is NOT modified; all work happens on a renamed copy, so
    the function can be called again on the same raw frame.

    Steps, in order:
      * rename the Chinese column headers to English names;
      * strip '㎡' from the two area columns and cast them to float;
      * normalise location1..location3 to integer-like strings;
      * extract room/hall/kitchen/bathroom/apartment counts from ``frame``
        using ``frame_patterns`` (0 when absent);
      * parse ``lift_ratio`` with ``get_lift_ratio`` and fill gaps with the
        mean of this frame's parsed values;
      * derive ``relative_height`` and ``total_floor`` from ``floor``;
      * add one 0/1 column per value of ``direction_mapping``;
      * fill missing categoricals with explicit "unknown" labels;
      * turn the free-text columns into '有描述' / '无描述' flags;
      * drop ``frame``, ``floor``, ``directions``, '抵押信息', '上次交易'.

    Args:
        df: Raw DataFrame holding the Chinese-named columns listed in the
            rename map, plus '抵押信息' and '上次交易'.

    Returns:
        A new, cleaned DataFrame.
    """
    # rename() without inplace returns a new frame, so every assignment below
    # touches the copy and the caller's raw data stays intact.
    df = df.rename(columns={
        '城市': 'location1',
        '区域': 'location2',
        '板块': 'location3',
        '环线': 'ring',
        '小区名称': 'location4',
        '价格': 'price',
        '建筑面积': 'area_gross',
        '套内面积': 'area_net',
        '房屋朝向': 'directions',
        '建筑结构': 'structure',
        '房屋户型': 'frame',
        '所在楼层': 'floor',
        '梯户比例': 'lift_ratio',
        '装修情况': 'decoration',
        '配备电梯': 'lift_ornot',
        '别墅类型': 'villa',
        '交易权属': 'transaction_ownership',
        '房屋用途': 'purpose',
        '房屋年限': 'age',
        '产权所属': 'property_ownership',
        '房屋优势': 'advantage',
        '核心卖点': 'outstanding',
        '户型介绍': 'frame_describe',
        '周边配套': 'near',
        '交通出行': 'transport',
        '年份': 'year'
    })

    # Areas arrive as text with a trailing unit symbol.
    for area_column in ('area_gross', 'area_net'):
        df[area_column] = df[area_column].str.replace('㎡', '', regex=False).astype(float)

    # int -> str round trip normalises the codes (e.g. 12.0 -> '12').
    for i in range(1, 4):
        location_column = 'location' + str(i)
        df[location_column] = df[location_column].astype(int).astype(str)

    # Layout counts; a component missing from the text counts as 0.
    for key, pattern in frame_patterns.items():
        df[key] = df['frame'].str.extract(pattern, expand=False).fillna(0).astype(int)

    # Lift/household ratio, with gaps filled by the mean of the parsed values.
    lift_ratio = df['lift_ratio'].apply(get_lift_ratio)
    df['lift_ratio'] = lift_ratio.fillna(lift_ratio.dropna().mean())

    df['relative_height'] = df['floor'].apply(get_relative_height)
    df['total_floor'] = df['floor'].apply(get_total_floor)

    # Orientation indicators, built in one shot instead of one pd.Series per
    # row. Missing orientations become '' so they parse to an empty list.
    direction_names = list(direction_mapping.values())
    parsed_directions = df['directions'].fillna('').apply(process_directions)
    dummies = pd.DataFrame(
        [[int(name in parsed) for name in direction_names] for parsed in parsed_directions],
        index=df.index,
        columns=direction_names,
    )
    df = pd.concat([df, dummies], axis=1)

    # Explicit placeholder categories for missing categorical values.
    fill_defaults = {
        'ring': '环线未知',
        'structure': '建筑结构未知',
        'decoration': '装修情况未知',
        'lift_ornot': '配备电梯未知',
        'villa': '非别墅',
        'property_ownership': '产权所属未知',
        'purpose': '房屋用途未知',
        'age': '新房',
    }
    for column, default in fill_defaults.items():
        df[column] = df[column].fillna(default)

    df['year'] = df['year'].astype(int).astype(str)

    # Free-text columns are reduced to "has description" / "no description".
    for text_column in ('advantage', 'outstanding', 'frame_describe', 'near', 'transport'):
        df[text_column] = np.where(df[text_column].isna(), '无描述', '有描述')

    df = df.drop(['frame', 'floor', 'directions', '抵押信息', '上次交易'], axis=1)

    return df

# Clean both splits with the same pipeline.
train_df = preprocess(train_df)
test_df = preprocess(test_df)

# Mean net/gross area ratio over the TRAIN rows whose net area is known.
# Computed straight from .loc selections so that no column is ever assigned
# on a filtered slice (which triggers SettingWithCopyWarning).
has_net_area = train_df['area_net'].notnull()
net_ratio_mean = (
    train_df.loc[has_net_area, 'area_net'] / train_df.loc[has_net_area, 'area_gross']
).mean()

print('net_ratio_mean=', net_ratio_mean)

# Impute missing net area as gross area times the train-set mean ratio, for
# both splits.
for split_df in (train_df, test_df):
    net_area_missing = split_df['area_net'].isnull()
    split_df.loc[net_area_missing, 'area_net'] = (
        net_ratio_mean * split_df.loc[net_area_missing, 'area_gross']
    )

# Keep the notebook namespace clean.
del net_ratio_mean, has_net_area, split_df, net_area_missing

In [None]:
# Report, for each split, only the columns that still contain missing values.
for na_label, split_frame in (('train_df_na is: \n', train_df), ('test_df_na is: \n', test_df)):
    col_na_count = split_frame.isna().sum()
    col_na_count = col_na_count[col_na_count != 0]
    print(na_label, col_na_count)


In [None]:
def get_location_relation_df(location_index,train,test_df):
    """Build the child -> parent lookup table for one location level.

    For every distinct value of ``location<k>`` (k = ``location_index``) seen
    in train or test, the table records the parent ``location<k-1>`` value
    from the first row where the child appears (train rows come first), and
    whether the child occurs in the training data.

    Args:
        location_index: Location level k of the child column, 2..4.
        train: Training DataFrame with columns ``location<k-1>``/``location<k>``.
        test_df: Test DataFrame with the same two columns.

    Returns:
        DataFrame sorted by the child column with columns
        ``location<k-1>``, ``location<k>`` and ``location<k>_isin_train``.

    Raises:
        ValueError: if ``location_index`` is 1 (no parent level) or is
            outside 1..4.
    """
    if location_index==1:raise ValueError("城市数据齐全，不需要进行同级估计。")
    if location_index>4 or location_index<1:raise ValueError("请选择正确的地理区分度！")
    location_upper='location'+str(location_index-1)
    location_lower='location'+str(location_index)

    # Use the `train` argument (not a notebook global) so the function is
    # self-contained.
    locations_df=pd.concat(
        [train[[location_upper,location_lower]],test_df[[location_upper,location_lower]]],
        axis=0,ignore_index=True)

    # First occurrence of each child gives its parent. Keeping the parent as a
    # normal object column avoids the fixed-width '<U1' numpy buffer that
    # would truncate codes such as '123' to '1'.
    location_relation_df=(
        locations_df
        .drop_duplicates(subset=location_lower,keep='first')
        .sort_values(location_lower)
        .reset_index(drop=True)
    )
    location_relation_df[location_lower+'_isin_train']=(
        location_relation_df[location_lower].isin(train[location_lower])
    )
    return(location_relation_df)




In [None]:
# Notebook-level settings. location_index, location_relation_df, neighbor_num
# and IS_UPPER are read as globals by get_neighbor_location_list /
# get_Fmatrix_dummy_part further down, so this cell must run before them.
area_class='area_gross'
# Location level (1..4) whose unseen test categories are redistributed.
location_index=4
# Child -> parent lookup table for that level.
location_relation_df=get_location_relation_df(location_index,train_df,test_df)
# Number of neighbours used by the distance-based strategy (IS_UPPER=False).
neighbor_num=10
# True: neighbours are locations sharing the same parent location;
# False: neighbours are the numerically closest location codes.
IS_UPPER=True

def get_neighbor_location_list(this_location,location_index=None,location_relation_df=None,train_df=None,neighbor_num=None):
    """Find training-set locations that can stand in for ``this_location``.

    The strategy is chosen by the notebook-level flag ``IS_UPPER``:

    * ``IS_UPPER`` true ("parent" method): neighbours are all locations of
      level ``location_index`` that share ``this_location``'s parent and are
      flagged as present in train in ``location_relation_df``.
    * ``IS_UPPER`` false ("distance" method): neighbours are the
      ``neighbor_num`` training location codes numerically closest to
      ``this_location`` (codes are compared as integers).

    Args:
        this_location: Location value to find neighbours for.
        location_index: Location level (2..4 for the parent method).
        location_relation_df: Lookup table with columns ``location<k-1>``,
            ``location<k>`` and ``location<k>_isin_train`` (parent method).
        train_df: Training DataFrame (distance method).
        neighbor_num: Maximum number of neighbours (distance method).

    Returns:
        Parent method: a list of location values (``[]`` if ``this_location``
        is not in the lookup table). Distance method: a numpy array of str
        codes; it holds fewer than ``neighbor_num`` entries when train has
        fewer distinct locations.

    Raises:
        ValueError: on missing or inconsistent arguments for the chosen
            strategy.
    """
    if IS_UPPER:
        # Validate BEFORE using location_index in arithmetic; otherwise a
        # missing index surfaces as a TypeError from `None - 1`.
        if location_index is None:raise ValueError("当采用上级法估计邻居时，必须提供主人的区域级别！")
        if location_index == 1:raise ValueError("当取城市级地理区分度无须估计邻居！")
        if location_index > 4 :raise ValueError("请选择正确的地理区分度！")
        if location_relation_df is None:raise ValueError("当采用上级法估计邻居时，必须提供地理从属关系表！")
        location_upper='location'+str(location_index-1)
        location_lower='location'+str(location_index)
        isin_train_column=location_lower+'_isin_train'
        if isin_train_column not in location_relation_df.columns:
            raise ValueError("地理区分度与地理从属关系不匹配！")

        own_upper=location_relation_df.loc[location_relation_df[location_lower]==this_location,location_upper]
        if own_upper.empty:
            return([])
        this_location_upper=own_upper.iloc[0]
        is_neighbor=(location_relation_df[location_upper]==this_location_upper)&(location_relation_df[isin_train_column])
        return(list(location_relation_df.loc[is_neighbor,location_lower]))

    if neighbor_num is None or train_df is None:raise ValueError("若采用距离邻近法估计邻居，则必须提供邻居数与训练集！")
    if location_index is None:raise ValueError("当采用距离邻近法估计邻居时，必须提供区域级别！")
    train_location_list=np.sort(np.unique(train_df['location'+str(location_index)]).astype(int))
    # Location codes are stored as strings; compare them numerically.
    distance=np.abs(train_location_list-int(this_location))
    # Slicing (not fancy-indexing with range) tolerates neighbor_num larger
    # than the number of available locations.
    nearest_indices=np.argsort(distance)[:neighbor_num]
    return(train_location_list[nearest_indices].astype(str))

In [None]:
# Feature groups for the design matrix.
# Columns copied as-is by get_Fmatrix_linear_part (currently none).
linear_variable_name_list = []
# Numeric columns expanded by get_Fmatrix_nonlinear_part.
nonlinear_variable_name_list = ['area_gross', 'lift_ratio','room', 'hall', 'kitchen', 'bathroom', 'apartment']  
# Columns one-hot encoded by get_Fmatrix_dummy_part.
dummy_variable_name_list = ['location1','location2','location3','location4','ring','structure', 'decoration', 'lift_ornot', 'villa', 'transaction_ownership', 'purpose', 'age', 'property_ownership','year', 'relative_height', 'east', 'west', 'south', 'north', 'south_east', 'north_east', 'south_west', 'north_west']


def get_Fmatrix_linear_part(df,linear_variable_name_list):
    """Select the columns that enter the design matrix unchanged.

    Args:
        df: Source DataFrame.
        linear_variable_name_list: List of column names (strings) in ``df``.

    Returns:
        A new DataFrame holding the requested columns in the given order;
        an empty DataFrame when the list is empty.

    Raises:
        ValueError: if ``df`` is not a DataFrame, the list is not a list of
            strings, or a name is not a column of ``df``.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("输入 df 必须是 pandas DataFrame")
    names_are_strings = isinstance(linear_variable_name_list, list) and all(
        isinstance(name, str) for name in linear_variable_name_list
    )
    if not names_are_strings:
        raise ValueError("linear_variable_name_list 必须是一个字符串列表")
    if any(name not in df.columns for name in linear_variable_name_list):
        raise ValueError("linear_variable_name_list 中的变量名必须存在于 df 的列中")
    # Build a fresh frame so the input is never modified.
    return pd.DataFrame({name: df[name] for name in linear_variable_name_list})


def get_Fmatrix_nonlinear_part(df,nonlinear_variable_name_list):
    """Expand each numeric column into a fixed set of non-linear terms.

    For a column ``x`` the output contains, in this order:
    ``x``, ``(x+1)^-1``, ``x^2``, ``x^3``, ``log(x+1)``, ``log(x+1)^2``,
    ``log(x+1)^3``.

    Args:
        df: Source DataFrame.
        nonlinear_variable_name_list: List of column names (strings) in ``df``.

    Returns:
        A new DataFrame with seven columns per requested variable.

    Raises:
        ValueError: if ``df`` is not a DataFrame, the list is not a list of
            strings, or a name is not a column of ``df``.
    """
    # Input validation.
    if not isinstance(df, pd.DataFrame):
        raise ValueError("输入 df 必须是 pandas DataFrame")
    names_are_strings = isinstance(nonlinear_variable_name_list, list) and all(
        isinstance(name, str) for name in nonlinear_variable_name_list
    )
    if not names_are_strings:
        raise ValueError("nonlinear_variable_name_list 必须是一个字符串列表")
    if any(name not in df.columns for name in nonlinear_variable_name_list):
        raise ValueError("nonlinear_variable_name_list 中的变量名必须存在于 df 的列中")

    terms = {}
    for name in nonlinear_variable_name_list:
        values = df[name]
        shifted = values + 1
        log_shifted = np.log(shifted)
        terms[name] = values                          # identity term
        terms[f"({name}+1)^-1"] = 1 / shifted         # reciprocal term
        terms[f"{name}^2"] = values ** 2              # square
        terms[f"{name}^3"] = values ** 3              # cube
        terms[f"log({name}+1)"] = log_shifted         # log term
        terms[f"log({name}+1)^2"] = log_shifted ** 2  # squared log
        terms[f"log({name}+1)^3"] = log_shifted ** 3  # cubed log

    return pd.DataFrame(terms)

def get_Fmatrix_dummy_part(df,dummy_variable_name_list,is_test=False,train_df=None):
    """One-hot encode the categorical columns; align test columns to train.

    With ``is_test=False`` the function simply returns the indicator columns
    produced by ``pd.get_dummies`` for ``dummy_variable_name_list``.

    With ``is_test=True`` the result is additionally reconciled with the
    training encoding:

    1. indicator columns that exist only in train are added as all-zero;
    2. if test has a ``year_2023`` column that train lacks, it is folded
       into ``year_2022`` and removed;
    3. for every value of ``location<location_index>`` that occurs only in
       test, its indicator is split equally among the neighbouring training
       locations returned by ``get_neighbor_location_list`` and then removed;
    4. any remaining test-only columns are dropped.

    Reads the notebook-level globals ``location_index``, ``IS_UPPER``,
    ``location_relation_df`` and ``neighbor_num``.

    Args:
        df: DataFrame to encode.
        dummy_variable_name_list: Names of the columns to one-hot encode.
        is_test: Whether ``df`` is the test split.
        train_df: Training DataFrame; required when ``is_test`` is true.

    Returns:
        DataFrame of indicator columns (integer; columns that received
        redistributed weight become float).

    Raises:
        ValueError: if ``is_test`` is true without ``train_df``; if the
            distance method is used with ``location_index > 3``; or if a
            training column is still missing after alignment.
    """
    if is_test and train_df is None:
        raise ValueError("为测试集生成虚拟变量时必须输入训练集，以保证测试集的虚拟变量与训练集完全重合！")

    X_temp = pd.get_dummies(df, columns=dummy_variable_name_list, prefix=dummy_variable_name_list, drop_first=False)
    # Keep only columns whose name starts with one of the encoded variables.
    result_df = X_temp.filter(regex='^(' + '|'.join(dummy_variable_name_list) + ')').astype(int)

    if not is_test:
        return(result_df)

    X_train_dummy_part = get_Fmatrix_dummy_part(train_df, dummy_variable_name_list)
    train_columns = X_train_dummy_part.columns
    location_column = 'location' + str(location_index)
    test_exclusive_location_list = np.setdiff1d(df[location_column], train_df[location_column])

    # 1. Add train-only indicators in one concat instead of inserting columns
    #    one at a time (which fragments the frame when there are many).
    missing_in_test = train_columns.difference(result_df.columns, sort=False)
    if len(missing_in_test) != 0:
        zero_block = pd.DataFrame(0, index=result_df.index, columns=missing_in_test)
        result_df = pd.concat([result_df, zero_block], axis=1)

    # 2. Fold the unseen year 2023 into 2022; guarded so data without these
    #    columns no longer raises KeyError.
    if 'year_2023' in result_df.columns and 'year_2023' not in train_columns:
        if 'year_2022' in result_df.columns:
            result_df['year_2022'] = result_df['year_2022'] + result_df['year_2023']
        result_df = result_df.drop(columns='year_2023')

    # 3. Redistribute test-only locations to their training neighbours.
    if (not IS_UPPER) and location_index > 3:
        raise ValueError("标号临近法不适用于小区！")
    redistributed_columns = []
    for test_location in test_exclusive_location_list:
        test_column = f"{location_column}_{test_location}"
        if test_column not in result_df.columns:
            # The location level was not part of the encoded variables.
            continue
        if IS_UPPER:
            neighbor_location_list = get_neighbor_location_list(
                this_location=test_location,
                location_index=location_index,
                location_relation_df=location_relation_df)
        else:
            # `neighbor_num` is read from the notebook globals; the per-location
            # count below uses a different name so it does not shadow it.
            neighbor_location_list = get_neighbor_location_list(
                this_location=test_location,
                location_index=location_index,
                train_df=train_df,
                neighbor_num=neighbor_num)
        neighbor_count = len(neighbor_location_list)
        if neighbor_count != 0:
            share = 1 / neighbor_count * result_df[test_column]
            for neighbor_location in neighbor_location_list:
                neighbor_column = f"{location_column}_{neighbor_location}"
                result_df[neighbor_column] = result_df[neighbor_column] + share
        redistributed_columns.append(test_column)
    result_df = result_df.drop(columns=redistributed_columns)

    # 4. Final consistency check and removal of leftover test-only columns.
    train_exclusive_list = np.setdiff1d(train_columns, result_df.columns)
    if len(train_exclusive_list) != 0:
        raise ValueError("X_train_dummy_part仍然有独有变量！")
    test_exclusive_list = np.setdiff1d(result_df.columns, train_columns)
    if len(test_exclusive_list) != 0:
        result_df = result_df.drop(columns=list(test_exclusive_list))
    return(result_df)


# Build the three design-matrix parts (linear, non-linear, dummy) for each
# split. The test dummy part is aligned to the training columns.
X_train_part_list=[
    get_Fmatrix_linear_part(train_df,linear_variable_name_list),
    get_Fmatrix_nonlinear_part(train_df,nonlinear_variable_name_list),
    get_Fmatrix_dummy_part(train_df,dummy_variable_name_list),
]

X_test_part_list=[
    get_Fmatrix_linear_part(test_df,linear_variable_name_list),
    get_Fmatrix_nonlinear_part(test_df,nonlinear_variable_name_list),
    get_Fmatrix_dummy_part(test_df,dummy_variable_name_list,is_test=True,train_df=train_df),
]


In [None]:
# Join the linear / non-linear / dummy parts side by side for each split.
X_train_without_interaction=pd.concat(X_train_part_list,axis=1)
X_test_without_interaction=pd.concat(X_test_part_list,axis=1)

# Pairs of variables whose columns are multiplied together by
# get_Fmatrix_with_interaction.
interaction_variable_pair_list=[['location1','ring']]

def get_Fmatrix_with_interaction(df,interaction_variable_pair_list):
    """Append pairwise interaction (product) columns to a design matrix.

    For each pair ``[former, later]`` every column belonging to ``former`` is
    multiplied with every column belonging to ``later``. A column belongs to
    a variable when its name equals the variable name or starts with
    ``<variable>_``. Unlike matching on the text before the first
    underscore, this also works for variables whose own name contains an
    underscore (for example ``lift_ornot``).

    Args:
        df: Design matrix without interaction terms.
        interaction_variable_pair_list: List of ``[former, later]`` variable
            name pairs.

    Returns:
        A new DataFrame: ``df`` followed by one ``"<former>*<later>"`` column
        per column combination.
    """
    def _columns_of(variable):
        # Columns generated from `variable` (the variable itself or its dummies).
        prefix=variable+'_'
        return [name for name in df.columns if name==variable or name.startswith(prefix)]

    interaction_columns={}
    for variable_pair in interaction_variable_pair_list:
        variable_former_list=_columns_of(variable_pair[0])
        variable_later_list=_columns_of(variable_pair[1])
        for variable_former in variable_former_list:
            for variable_later in variable_later_list:
                interaction_columns[f"{variable_former}*{variable_later}"]=df[variable_former]*df[variable_later]

    # Build all product columns at once to avoid fragmenting the frame.
    interaction_df=pd.DataFrame(interaction_columns,index=df.index)
    return(pd.concat([df,interaction_df],axis=1))

# Final design matrices: base features plus the configured interaction terms.
X_train=get_Fmatrix_with_interaction(X_train_without_interaction,interaction_variable_pair_list)
X_test=get_Fmatrix_with_interaction(X_test_without_interaction,interaction_variable_pair_list)


In [None]:
# Put both design matrices in the same, sorted column order. Selecting
# X_test by the training columns raises KeyError if a column is missing.
X_train=X_train[sorted(X_train.columns)]
X_test=X_test[X_train.columns]


print('X_train.shape=',X_train.shape)
print('X_test.shape=',X_test.shape)

# Per-column counts of NaN and +/-inf; only columns with a non-zero count are
# printed, so empty output means the matrices are clean.
for check_label,check_counts in (
    ('train_x_na:',X_train.isna().sum()),
    ('test_x_na:',X_test.isna().sum()),
    ('train_x_inf:',np.isinf(X_train).sum()),
    ('test_x_inf:',np.isinf(X_test).sum()),
):
    col_na_count=check_counts[check_counts!=0]
    print(check_label,col_na_count)

# Columns present in only one of the two matrices (both should be empty).
train_exclusive_list=np.setdiff1d(X_train.columns.tolist(),X_test.columns.tolist())
test_exclusive_list=np.setdiff1d(X_test.columns.tolist(),X_train.columns.tolist())

print('train_exclusive_list=',train_exclusive_list)
print('test_exclusive_list=',test_exclusive_list)


# Number of positions at which the two column orders differ (should be 0).
print('train_test_col_is_not_incident=',np.sum(X_train.columns!=X_test.columns))

# Copies with positional string column names ('0', '1', ...) for model fitting.
use_colname_list=np.array(range(len(X_train.columns))).astype(str)
X_train_use=X_train.copy()
X_train_use.columns=use_colname_list
X_test_use=X_test.copy()
X_test_use.columns=use_colname_list




In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on log(price); predictions are mapped back to
# the price scale with exp() before export.
y=np.log(train_df['price'])
OLS_md = LinearRegression()
OLS_md.fit(X_train_use,y)

predicted_price=np.exp(OLS_md.predict(X_test_use))
output_df=pd.DataFrame({'ID':range(len(test_df)),'Price':predicted_price})

output_df.to_csv('submission_2025_4_5_y=log(p).csv',index=False)

