In [33]:
import os

# Create a tiny housing dataset on disk as a CSV file.
# Missing entries are written as the literal text "NA".
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')
csv_rows = (
    'NumRooms,Alley,Price\n',  # header row: column names
    'NA,Pave,127500\n',  # every following row is one data sample
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
)
with open(data_file, 'w') as f:
    f.writelines(csv_rows)

In [34]:
import pandas as pd

# Load the CSV created in the previous cell into a DataFrame and show it.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


In [35]:
# Split the table by position: the first two columns are the features,
# the third column is the target.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]
# dummy_na=True treats a missing value (NaN/None) as a category of its own
# and creates a separate binary indicator column for it.
inputs = pd.get_dummies(inputs, dummy_na=True)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       NaN        True      False
1       2.0       False       True
2       4.0       False       True
3       NaN       False       True


In [36]:
# Impute the remaining NaN entries with the mean of their column.
column_means = inputs.mean()
inputs = inputs.fillna(column_means)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0        True      False
1       2.0       False       True
2       4.0       False       True
3       3.0       False       True


接下来，将 `inputs` 和 `outputs` 转换为张量格式，以便使用张量函数进行操作。

In [37]:
import torch

# Convert the preprocessed features and targets to float arrays first,
# then wrap each array in a tensor.
features_np = inputs.to_numpy(dtype=float)
targets_np = outputs.to_numpy(dtype=float)
X, y = torch.tensor(features_np), torch.tensor(targets_np)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))

练习

In [38]:
# Exercise dataset: a small salary table written as CSV.
# Missing entries are written as the literal text "NA".
salary_dir = os.path.join('..', 'data')
# Create the target directory here as well, so this cell does not fail with
# FileNotFoundError when the directory has not been created beforehand.
os.makedirs(salary_dir, exist_ok=True)
file = os.path.join(salary_dir, 'salary.csv')
salary_rows = (
    'Name,Age,Salary\n',  # header row: column names
    'Peter,27,1000\n',
    'Tom,NA,2000\n',
    'Jack,20,NA\n',
    'Jane,NA,NA\n',
    'NA,30,3000\n',
    'Fin,22,NA\n',
)
with open(file, 'w') as f:
    f.writelines(salary_rows)

In [39]:
# Load the exercise CSV into a DataFrame and display it.
data_1 = pd.read_csv(filepath_or_buffer=file)
data_1

Unnamed: 0,Name,Age,Salary
0,Peter,27.0,1000.0
1,Tom,,2000.0
2,Jack,20.0,
3,Jane,,
4,,30.0,3000.0
5,Fin,22.0,


In [40]:
# Split the table by position: the first two columns (Name, Age) are the
# features, the third column (Salary) is the target.
inputs = data_1.iloc[:, :2]
outputs = data_1.iloc[:, 2]
print(inputs, outputs, sep='\n')

    Name   Age
0  Peter  27.0
1    Tom   NaN
2   Jack  20.0
3   Jane   NaN
4    NaN  30.0
5    Fin  22.0
0    1000.0
1    2000.0
2       NaN
3       NaN
4    3000.0
5       NaN
Name: Salary, dtype: float64


In [41]:
# Number of non-missing entries in each column.
data_1.count(axis='index')

Name      5
Age       4
Salary    3
dtype: int64

In [42]:
# Label of the column with the smallest non-missing count,
# i.e. the column with the most missing values.
non_null_counts = data_1.count()
non_null_counts.idxmin()

'Salary'

In [43]:
# Remove the column that has the fewest non-missing values.
# NOTE(review): this rebinds data_1 to its own reduced copy, so the cell is not
# idempotent; each re-run removes one more column.
# NOTE(review): `inputs` and `outputs` were sliced from data_1 before this
# drop, so they are unaffected by it.
sparsest_column = data_1.count().idxmin()
data_1 = data_1.drop(columns=sparsest_column)  # column(s) to delete
data_1

Unnamed: 0,Name,Age
0,Peter,27.0
1,Tom,
2,Jack,20.0
3,Jane,
4,,30.0
5,Fin,22.0


In [48]:
# Fill missing numeric entries with their column mean. numeric_only=True
# leaves the text column out of the mean, so its missing entry is not filled.
numeric_means = inputs.mean(numeric_only=True)
inputs = inputs.fillna(numeric_means)
inputs

Unnamed: 0,Name,Age
0,Peter,27.0
1,Tom,24.75
2,Jack,20.0
3,Jane,24.75
4,,30.0
5,Fin,22.0


In [52]:
# One-hot encode the categorical column; dummy_na=True adds a separate
# indicator column for missing values.
inputs = pd.get_dummies(data=inputs, dummy_na=True)
inputs

Unnamed: 0,Age,Name_Fin,Name_Jack,Name_Jane,Name_Peter,Name_Tom,Name_nan
0,27.0,False,False,False,True,False,False
1,24.75,False,False,False,False,True,False
2,20.0,False,True,False,False,False,False
3,24.75,False,False,True,False,False,False
4,30.0,False,False,False,False,False,True
5,22.0,True,False,False,False,False,False


In [49]:
# Replace missing target values with the mean of the observed ones.
mean_target = outputs.mean()
outputs = outputs.fillna(mean_target)
outputs

0    1000.0
1    2000.0
2    2000.0
3    2000.0
4    3000.0
5    2000.0
Name: Salary, dtype: float64

In [53]:
# Convert the encoded features and the imputed targets to float arrays first,
# then wrap each array in a tensor.
features_np = inputs.to_numpy(dtype=float)
targets_np = outputs.to_numpy(dtype=float)
X, Y = torch.tensor(features_np), torch.tensor(targets_np)
X, Y

(tensor([[27.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000],
         [24.7500,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000],
         [20.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [24.7500,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000],
         [30.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000],
         [22.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
        dtype=torch.float64),
 tensor([1000., 2000., 2000., 2000., 3000., 2000.], dtype=torch.float64))