In [1]:
import torch
import numpy as np
import pandas as pd

### 查看数据集

In [4]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

print(f'训练集的形状：{train_data.shape}')
print(f'测试集的形状：{test_data.shape}')
print("查看前4个和后4个特征:", train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

训练集的形状：(47439, 41)
测试集的形状：(31626, 40)
查看前4个和后4个特征:    Id            Address  Sold Price  \
0   0        540 Pine Ln   3825000.0   
1   1     1727 W 67th St    505000.0   
2   2     28093 Pine Ave    140000.0   
3   3  10750 Braddock Dr   1775000.0   

                                             Summary         City    Zip State  
0  540 Pine Ln, Los Altos, CA 94022 is a single f...    Los Altos  94022    CA  
1  HURRY, HURRY.......Great house 3 bed and 2 bat...  Los Angeles  90047    CA  
2  'THE PERFECT CABIN TO FLIP!  Strawberry deligh...   Strawberry  95375    CA  
3  Rare 2-story Gated 5 bedroom Modern Mediterran...  Culver City  90230    CA  


In [7]:
train_data.describe()

Unnamed: 0,Id,Sold Price,Year built,Lot,Bathrooms,Full bathrooms,Total interior livable area,Total spaces,Garage spaces,Elementary School Score,Elementary School Distance,Middle School Score,Middle School Distance,High School Score,High School Distance,Tax assessed value,Annual tax amount,Listed Price,Last Sold Price,Zip
count,47439.0,47439.0,46394.0,33258.0,43974.0,39574.0,44913.0,46523.0,46522.0,42543.0,42697.0,30734.0,30735.0,42220.0,42438.0,43787.0,43129.0,47439.0,29673.0,47439.0
mean,23719.0,1296050.0,1956.634888,235338.3,2.355642,2.094961,5774.587,1.567117,1.491746,5.720824,1.152411,5.317206,1.691593,6.134344,2.410366,786311.8,9956.843817,1315890.0,807853.7,93279.178587
std,13694.604047,1694452.0,145.802456,11925070.0,1.188805,0.96332,832436.3,9.011608,8.964319,2.10335,2.332367,2.002768,2.462879,1.984711,3.59612,1157796.0,13884.254976,2628695.0,1177903.0,2263.459104
min,0.0,100500.0,0.0,0.0,0.0,1.0,1.0,-15.0,-15.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,85611.0
25%,11859.5,565000.0,1946.0,4991.0,2.0,2.0,1187.0,0.0,0.0,4.0,0.3,4.0,0.6,5.0,0.8,254961.5,3467.0,574500.0,335000.0,90220.0
50%,23719.0,960000.0,1967.0,6502.0,2.0,2.0,1566.0,1.0,1.0,6.0,0.5,5.0,1.0,6.0,1.3,547524.0,7129.0,949000.0,598000.0,94114.0
75%,35578.5,1525000.0,1989.0,10454.0,3.0,2.0,2142.0,2.0,2.0,7.0,1.0,7.0,1.8,8.0,2.4,937162.5,12010.0,1498844.0,950000.0,95073.0
max,47438.0,90000000.0,9999.0,1897474000.0,24.0,17.0,176416400.0,1000.0,1000.0,10.0,57.2,9.0,57.2,10.0,73.9,45900000.0,552485.0,402532000.0,90000000.0,96155.0


In [9]:
train_data.columns

Index(['Id', 'Address', 'Sold Price', 'Summary', 'Type', 'Year built',
       'Heating', 'Cooling', 'Parking', 'Lot', 'Bedrooms', 'Bathrooms',
       'Full bathrooms', 'Total interior livable area', 'Total spaces',
       'Garage spaces', 'Region', 'Elementary School',
       'Elementary School Score', 'Elementary School Distance',
       'Middle School', 'Middle School Score', 'Middle School Distance',
       'High School', 'High School Score', 'High School Distance', 'Flooring',
       'Heating features', 'Cooling features', 'Appliances included',
       'Laundry features', 'Parking features', 'Tax assessed value',
       'Annual tax amount', 'Listed On', 'Listed Price', 'Last Sold On',
       'Last Sold Price', 'City', 'Zip', 'State'],
      dtype='object')

## 数据预处理

### 数值类型特征处理

In [24]:
"""将训练集和测试集组合在一起"""
all_features = pd.concat((train_data.iloc[:, 3:-1], test_data.iloc[:, 1:-1]))

numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
	lambda x : (x - x.mean()) / (x.std())
)

In [25]:
# 在标准化数据之后 所有均值消失 可以将缺失值设置为0
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [26]:
print(f'处理之后数据集的形状：{all_features.shape}')

处理之后数据集的形状：(79065, 38)


### 保存数据

In [30]:
len_train = train_data.shape[0]
train_features = all_features[numeric_features][:len_train]
train_features['label'] = train_data.iloc[:, 2]
test_features = all_features[numeric_features][len_train:]

train_features.to_csv('data/processed_train.csv', index=False)
test_features.to_csv('data/processed_test.csv', index=False)

print(train_features.shape)
print(test_features.shape)

(47439, 19)
(31626, 18)


In [31]:
train_features['label']

0        3825000.0
1         505000.0
2         140000.0
3        1775000.0
4        1175000.0
           ...    
47434     159000.0
47435     255000.0
47436    2300000.0
47437     500000.0
47438     760000.0
Name: label, Length: 47439, dtype: float64