**数据预处理**

创建一个人工数据集，并将其存储在 CSV（逗号分隔值）文件中

In [1]:
import os

# Make sure ../data exists. exist_ok=True: the directory is only created when it is
# missing, and no exception is raised if it is already there.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')

# Tiny hand-made dataset: one header row followed by four samples.
csv_lines = [
    'NumRooms,Alley,Price\n',
    'NA,Pave,127500\n',
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.writelines(csv_lines)

读取csv文件

In [2]:
import pandas as pd

# Load the CSV at data_file into a DataFrame; the bare name on the last line
# makes the notebook render the table.
data = pd.read_csv(filepath_or_buffer=data_file)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


缺失值处理（常用方法包括插值和删除等）；这里用数值列的均值填充缺失值

In [3]:
# Split the table by position: the first two columns (NumRooms, Alley) are the
# features, the third column (Price) is the label.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]
# Impute missing numeric values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns: the string column
# Alley has no mean, and pandas >= 2.0 raises TypeError instead of skipping it
# (older versions only emitted a FutureWarning).
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

  inputs = inputs.fillna(inputs.mean())


Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


对于 inputs 中的类别值或离散值，将 NaN 视为一个单独的类别

In [4]:
# One-hot encode the categorical column; dummy_na=True gives NaN its own
# indicator column (Alley_nan).
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 defaults to bool,
# and a frame mixing float and bool columns becomes an object array under
# .values, which torch.tensor cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


转换成张量

In [5]:
import torch

# Convert the encoded features and the labels from pandas to torch tensors
# by way of their underlying NumPy arrays.
X, y = (
    torch.tensor(inputs.to_numpy()),
    torch.tensor(outputs.to_numpy()),
)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

练习:删除缺失值最多的列

In [6]:
# Count the missing entries of every column once and reuse the result below.
na_counts = data.isna().sum()

# Approach 1: boolean column mask - keep the columns whose NaN count is
# strictly below the maximum count.
keep_mask = na_counts.values < na_counts.max()
data2 = data.iloc[:, keep_mask]
print('The first way:\n', data2)

# Approach 2: look up the label of the column with the most NaNs and drop it.
data2 = data.drop(columns=na_counts.idxmax())
print('\nThe Second way:\n', data2)

The first way:
    NumRooms   Price
0       NaN  127500
1       2.0  106000
2       4.0  178100
3       NaN  140000

The Second way:
    NumRooms   Price
0       NaN  127500
1       2.0  106000
2       4.0  178100
3       NaN  140000


分析：逐步查看上述两种方法的中间结果，并用 id() 对比操作前后的对象地址，确认 iloc 和 drop 返回的都是新对象，而不是原来的 data

In [7]:
# Identity of the original frame, for comparison with the results below.
print('data Addr:',id(data))

# --- Variant 1: select columns with a boolean mask through iloc ---
print('\ndelete column by iloc:')
data2 = data  # plain assignment: data2 is a second name for the same object

# Element-wise missing-value mask (a DataFrame of booleans).
is_missing = data2.isna()
print('\ndata2.isna():',type(is_missing))
print(is_missing)

# Summing the mask column-wise gives the NaN count per column (a Series).
missing_per_col = is_missing.sum()
print('\ndata2.isna().sum():', type(missing_per_col))
print(missing_per_col)
most_missing = missing_per_col.max()
print('\ndata2.isna().sum().max():',most_missing)

# Compare the object id before and after the iloc selection.
print('old_data2 Addr:',id(data2))
data2 = data2.iloc[:,missing_per_col.values < most_missing]
print('new_data2 Addr:',id(data2))

# --- Variant 2: drop the column by its label ---
print('\n another way by drop:')
data3 = data

# idxmax returns the label of the column with the highest NaN count.
worst_column = data3.isna().sum().idxmax()
print('data3.isna().sum().idxmax():',worst_column)

# Compare the object id before and after drop.
print('old_data3 Addr:',id(data3))
data3 = data3.drop(worst_column,axis=1)
print('new_data3 Addr',id(data3))

data Addr: 2485605317024

delete column by iloc:

data2.isna(): <class 'pandas.core.frame.DataFrame'>
   NumRooms  Alley  Price
0      True  False  False
1     False   True  False
2     False   True  False
3      True   True  False

data2.isna().sum(): <class 'pandas.core.series.Series'>
NumRooms    2
Alley       3
Price       0
dtype: int64

data2.isna().sum().max(): 3
old_data2 Addr: 2485605317024
new_data2 Addr: 2485566932832

 another way by drop:
data3.isna().sum().idxmax(): Alley
old_data3 Addr: 2485605317024
new_data3 Addr 2485605534448
