In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, clear_output
warnings.filterwarnings('ignore')
pd.set_option('display.width', 1000)
%matplotlib inline

In [2]:
train = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')
print(train.info())#
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# 通过观察发现，很多特征之间是有一定的逻辑关联的，而这种逻辑关联往往可以通过特征名来识别，所以我们可以通过一个文本相似度算法来对特征进行分组，将关联性高的特征分到一组，分别使用不同的模型进行训练评估，最后再利用adaboost集成.
# 文本相似度分析
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster
feature_names = train.columns[1:-1].tolist()
# 计算特征名称之间的余弦相似度
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(feature_names)
cosine_sim = cosine_similarity(tfidf_matrix)

# 使用层次聚类对特征进行分组
linkage_matrix = linkage(cosine_sim, method='complete')
plt.figure(figsize=(10, 5))
dendrogram = plt.gca()
dendrogram.set_xticklabels(feature_names)
cluster_labels = fcluster(linkage_matrix, 1.0, criterion='distance')

# 输出分组结果
grouped_features = pd.DataFrame({'Feature': feature_names, 'Group': cluster_labels})
print(grouped_features)
# 创建特征分组字典
feature_groups = {}
for group in grouped_features['Group'].unique():
    feature_groups[f'Group{group}'] = grouped_features[grouped_features['Group'] == group]['Feature'].tolist()

# 输出每个特征组的前五行
for group_name, features in feature_groups.items():    print(f"{group_name}:\n{train[features].head()}\n")

以上代码是使用文本相似度对特征进行分组，但是效果并不好。我们可以直接把特征值传给GPT4，让它来根据逻辑和常理进行分类，如此能省下很多时间。

In [3]:
# 特征分组，以下是通过GPT4分类，并经过了一定的人为修改
group_1 = ['MSSubClass', 'MSZoning', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd']
group_2 = ['LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2']
group_3 = ['RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond']
group_4 = ['Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
group_5 = ['Heating', 'HeatingQC', 'CentralAir', 'Fireplaces', 'FireplaceQu', 'Electrical']
group_6 = ['1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional']
group_7 = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive']
group_8 = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal']
group_9 = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition']

# 提取特征组
train_group_1 = train[group_1]
train_group_2 = train[group_2]
train_group_3 = train[group_3]
train_group_4 = train[group_4]
train_group_5 = train[group_5]
train_group_6 = train[group_6]
train_group_7 = train[group_7]
train_group_8 = train[group_8]
train_group_9 = train[group_9]
y = train['SalePrice']

In [4]:
# 对group1进行特征工程
print(train_group_1)
print(train_group_1.info())
print(train_group_1.describe())

      MSSubClass MSZoning BldgType HouseStyle  OverallQual  OverallCond  YearBuilt  YearRemodAdd
0             60       RL     1Fam     2Story            7            5       2003          2003
1             20       RL     1Fam     1Story            6            8       1976          1976
2             60       RL     1Fam     2Story            7            5       2001          2002
3             70       RL     1Fam     2Story            7            5       1915          1970
4             60       RL     1Fam     2Story            8            5       2000          2000
...          ...      ...      ...        ...          ...          ...        ...           ...
1455          60       RL     1Fam     2Story            6            5       1999          2000
1456          20       RL     1Fam     1Story            6            6       1978          1988
1457          70       RL     1Fam     2Story            7            9       1941          2006
1458          20       RL     

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [6]:
# 对object类型数据进行标签编码
def encode_categorical_features(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
    return df

In [7]:
# 对第一个特征组进行处理,并进行归一化处理,注意要同时对训练集和测试集进行工作
train_group_1_encoded = encode_categorical_features(train_group_1)
test[group_1] = encode_categorical_features(test[group_1])
# 归一化处理
scaler = MinMaxScaler()
normalized_data1 = scaler.fit_transform(train_group_1_encoded)
test[group_1] = scaler.fit_transform(test[group_1])
print(normalized_data1)

[[0.23529412 0.75       0.         ... 0.5        0.94927536 0.88333333]
 [0.         0.75       0.         ... 0.875      0.75362319 0.43333333]
 [0.23529412 0.75       0.         ... 0.5        0.93478261 0.86666667]
 ...
 [0.29411765 0.75       0.         ... 1.         0.5        0.93333333]
 [0.         0.75       0.         ... 0.625      0.56521739 0.76666667]
 [0.         0.75       0.         ... 0.625      0.67391304 0.25      ]]


In [8]:
# 划分训练集和测试集
X_train1, X_test1, y_train1, y_test1 = train_test_split(normalized_data1, y, test_size=0.2, random_state=42)
# 创建线性回归模型实例
ridge1 = LinearRegression()
# 训练模型
ridge1.fit(X_train1, y_train1)
# 预测
y_pred1 = ridge1.predict(X_test1)
# 计算预测误差
rmse1 = np.sqrt(mean_squared_error(y_test1, y_pred1))
print(f"RMSE: {rmse1}")

RMSE: 50374.46035512127


In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
# 对group2进行特征工程
print(train_group_2)
print(train_group_2.info())
print(train_group_2.describe())

      LotFrontage  LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
0            65.0     8450   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl      CollgCr       Norm       Norm
1            80.0     9600   Pave   NaN      Reg         Lvl    AllPub       FR2       Gtl      Veenker      Feedr       Norm
2            68.0    11250   Pave   NaN      IR1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm       Norm
3            60.0     9550   Pave   NaN      IR1         Lvl    AllPub    Corner       Gtl      Crawfor       Norm       Norm
4            84.0    14260   Pave   NaN      IR1         Lvl    AllPub       FR2       Gtl      NoRidge       Norm       Norm
...           ...      ...    ...   ...      ...         ...       ...       ...       ...          ...        ...        ...
1455         62.0     7917   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl      Gilbert       Norm    

In [11]:
# 经过分析，可以将Alley的缺失值填充为0
train_group_2['Alley'] = train_group_2['Alley'].fillna(0)
test['Alley'] = test['Alley'].fillna(0)

In [13]:
train_group_2_encoded = encode_categorical_features(train_group_2)
# 首先，我们将数据集分为有LotFrontage值的部分和缺少LotFrontage值的部分
data_with_lotfrontage = train_group_2_encoded[train_group_2_encoded['LotFrontage'].notnull()]
data_missing_lotfrontage = train_group_2_encoded[train_group_2['LotFrontage'].isnull()]

# 选择用于预测LotFrontage的其他特征（这里我们选择与LotFrontage相关的特征）
features = ['LotArea', 'Neighborhood','LotConfig','LotShape']
# 获取训练和测试数据
X_train = data_with_lotfrontage[features]
y_train = data_with_lotfrontage['LotFrontage']
X_test = train_group_2_encoded[train_group_2['LotFrontage'].isnull()][features]
t_test = test[test['LotFrontage'].isnull()][features]
# 创建随机森林回归模型实例
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# 训练模型
rf.fit(X_train, y_train)

# 使用模型预测缺失的LotFrontage值
predicted_train = rf.predict(X_test)
predicted_test = rf.predict(t_test)
# 将预测的LotFrontage值填充回原始数据集
train.loc[train['LotFrontage'].isnull(), 'LotFrontage'] = predicted_train
test.loc[test['LotFrontage'].isnull(), 'LotFrontage'] = predicted_test


ValueError: could not convert string to float: 'Gilbert'

In [None]:
# 对第一个特征组进行处理,并进行归一化处理

# # 归一化处理
# normalized_data2 = scaler.fit_transform(train_group_2_encoded)
# normalized_data2 = pd.DataFrame(normalized_data2)
# print(normalized_data2)
# print(train_group_2_encoded)