## 二、数据预处理

1. 读取数据集

In [1]:
import os

import torch

# Show the entries of the current working directory (listdir defaults to ".").
os.listdir()

['1.Data Manipulation.ipynb',
 '2.Data Preprocessing.ipynb',
 'data.csv',
 'readme.md']

In [2]:
# Overview of commonly used os helpers: https://zhuanlan.zhihu.com/p/150835193
data_file = os.path.join("..", "data", "house_csv")
# Create the parent directory of the data file; exist_ok avoids an error on re-runs.
os.makedirs(os.path.dirname(data_file), exist_ok=True)

# The with-statement closes the file on exit, so no explicit
# try...finally block or close() call is needed.
with open(data_file, 'w') as fh:
    fh.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')

In [3]:
import pandas as pd

# Load the small CSV into a DataFrame; the literal "NA" fields are parsed as NaN.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000


2. 缺失值处理
   要么插入一个替代值，要么直接删除该项

In [9]:
# Positional split: the first two columns are the features, the third is the target.
inputs = data.iloc[:, :2]
targets = data.iloc[:, 2]
# One-hot encode the non-numeric column; dummy_na=True adds an indicator
# column for missing values instead of dropping that information.
inputs = pd.get_dummies(inputs, dummy_na=True)
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0       NaN           False          True
1       2.0           False          True
2       4.0            True         False
3       NaN           False          True


In [12]:
# Replace each missing value with its column mean, then cast to int so the
# boolean indicator columns become 1 / 0.
inputs = inputs.fillna(inputs.mean()).astype(int)
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0         3               0             1
1         2               0             1
2         4               1             0
3         3               0             1


3. 转换为`Tensor`格式

In [13]:
# Turn the preprocessed features and the targets into tensors.
X = torch.tensor(inputs.to_numpy())
y = torch.tensor(targets.to_numpy())
X, y

(tensor([[3, 0, 1],
         [2, 0, 1],
         [4, 1, 0],
         [3, 0, 1]], dtype=torch.int32),
 tensor([127500, 106000, 178100, 140000]))

---


## 练习
创建包含更多行和列的原始数据集。

1. 删除缺失值最多的列。
2. 将预处理后的数据集转换为张量格式。

In [22]:
# Exercise 1: drop the column with the most missing values.
#   (1) load the file into a DataFrame
#   (2) walk over the columns and remember the name (or position) of the
#       column that currently has the most NA values
#   (3) drop that column by name (or position)
data_with_na = os.path.join("..", "data", "2.2.5 Exercises.csv")
data = pd.read_csv(data_with_na, sep="\t")  # fields are split on tabs
data

Unnamed: 0,1,date,open,high,low,close,volume,Name
0,2,,15.07,15.12,14.63,14.75,,AAL
1,3,2013-02-11,14.89,15.01,14.26,14.46,8882000.0,AAL
2,4,2013-02-12,14.45,14.51,14.1,14.27,8126000.0,AAL
3,5,2013-02-13,14.3,14.94,14.25,14.66,10259500.0,AAL
4,6,2013-02-14,14.94,14.96,13.16,13.99,,AAL
5,7,2013-02-15,13.93,14.61,13.93,14.5,15628000.0,AAL
6,8,2013-02-19,14.33,14.56,14.08,14.26,,AAL
7,9,2013-02-20,14.17,14.26,13.15,13.33,14725200.0,AAL
8,10,2013-02-21,13.62,13.95,12.9,13.37,,AAL
9,11,2013-02-22,13.57,13.6,13.21,13.57,6071400.0,AAL


In [30]:
# A cell only echoes its last expression, so a bare `type(data)` on an earlier
# line is evaluated and thrown away. Print it explicitly to actually see it.
print(type(data))
data.shape

(39, 8)

In [None]:
# Find the column with the fewest non-null entries, i.e. the most missing values.
count = data.shape[0]  # a column holds at most shape[0] values, so start the minimum search there
cur_loc_name = ""  # stays empty when no column has a missing value
for col in data.columns:
    non_null = data[col].count()  # count() ignores NaN entries
    if non_null < count:
        # Strict comparison keeps the first column on ties.
        count = non_null
        cur_loc_name = col
#%% md
## 二、数据预处理

1. 读取数据集

In [None]:
import os

import torch

# Show the entries of the current working directory (listdir defaults to ".").
os.listdir()

In [None]:
# Overview of commonly used os helpers: https://zhuanlan.zhihu.com/p/150835193
data_file = os.path.join("..", "data", "house_csv")
# Create the parent directory of the data file; exist_ok avoids an error on re-runs.
os.makedirs(os.path.dirname(data_file), exist_ok=True)

# The with-statement closes the file on exit, so no explicit
# try...finally block or close() call is needed.
with open(data_file, 'w') as fh:
    fh.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')

In [None]:
import pandas as pd

# Load the small CSV into a DataFrame; the literal "NA" fields are parsed as NaN.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

2. 缺失值处理
   要么插入一个替代值，要么直接删除该项

In [None]:
# Positional split: the first two columns are the features, the third is the target.
inputs = data.iloc[:, :2]
targets = data.iloc[:, 2]
# One-hot encode the non-numeric column; dummy_na=True adds an indicator
# column for missing values instead of dropping that information.
inputs = pd.get_dummies(inputs, dummy_na=True)
print(inputs)

In [None]:
# Replace each missing value with its column mean, then cast to int so the
# boolean indicator columns become 1 / 0.
inputs = inputs.fillna(inputs.mean()).astype(int)
print(inputs)

3. 转换为`Tensor`格式

In [None]:
# Turn the preprocessed features and the targets into tensors.
X = torch.tensor(inputs.to_numpy())
y = torch.tensor(targets.to_numpy())
X, y

---


## 练习
创建包含更多行和列的原始数据集。

1. 删除缺失值最多的列。
2. 将预处理后的数据集转换为张量格式。

In [35]:
# Exercise 1: drop the column with the most missing values.
#   (1) load the file into a DataFrame
#   (2) walk over the columns and remember the name (or position) of the
#       column that currently has the most NA values
#   (3) drop that column by name (or position)
data_with_na = os.path.join("..", "data", "2.2.5 Exercises.csv")
data = pd.read_csv(data_with_na, sep="\t")  # fields are split on tabs
data

Unnamed: 0,1,date,open,high,low,close,volume,Name
0,2,,15.07,15.12,14.63,14.75,,AAL
1,3,2013-02-11,14.89,15.01,14.26,14.46,8882000.0,AAL
2,4,2013-02-12,14.45,14.51,14.1,14.27,8126000.0,AAL
3,5,2013-02-13,14.3,14.94,14.25,14.66,10259500.0,AAL
4,6,2013-02-14,14.94,14.96,13.16,13.99,,AAL
5,7,2013-02-15,13.93,14.61,13.93,14.5,15628000.0,AAL
6,8,2013-02-19,14.33,14.56,14.08,14.26,,AAL
7,9,2013-02-20,14.17,14.26,13.15,13.33,14725200.0,AAL
8,10,2013-02-21,13.62,13.95,12.9,13.37,,AAL
9,11,2013-02-22,13.57,13.6,13.21,13.57,6071400.0,AAL


In [None]:
# A cell only echoes its last expression, so a bare `type(data)` on an earlier
# line is evaluated and thrown away. Print it explicitly to actually see it.
print(type(data))
data.shape

In [67]:
# Exercise 1: drop the column with the most missing values.
# Count the missing entries of every column in one vectorized pass.
na_counts = data.isna().sum()

if not na_counts.empty and na_counts.max() > 0:
    # idxmax() returns the first column that reaches the maximum, the same
    # column a left-to-right scan for the smallest non-null count would pick.
    cur_loc_name = na_counts.idxmax()
    count = data[cur_loc_name].count()  # non-null entries in that column
    res = data.drop(columns=[cur_loc_name])  # drop() returns a modified copy; `data` is untouched
else:
    # No column has a missing value, so there is nothing to drop. Keep all
    # columns instead of failing with a KeyError on a placeholder name.
    cur_loc_name = ""
    count = data.shape[0]
    res = data.copy()

# Keep the columns at positions 2-5 of the reduced frame (open/high/low/close
# in the recorded run).
# NOTE(review): this slice is positional, so the selection shifts if the
# dropped column sits before position 6 - confirm this is intended.
res = res.iloc[:, 2:6]
res

Unnamed: 0,open,high,low,close
0,15.07,15.12,14.63,14.75
1,14.89,15.01,14.26,14.46
2,14.45,14.51,14.1,14.27
3,14.3,14.94,14.25,14.66
4,14.94,14.96,13.16,13.99
5,13.93,14.61,13.93,14.5
6,14.33,14.56,14.08,14.26
7,14.17,14.26,13.15,13.33
8,13.62,13.95,12.9,13.37
9,13.57,13.6,13.21,13.57


In [69]:
# Exercise 2: convert the preprocessed dataset to a tensor.

# The remaining columns share one format, so from_numpy() can be used.
# NOTE: missing values are not imputed here; they appear as nan in the tensor.
torch.from_numpy(res.to_numpy())

tensor([[15.0700, 15.1200, 14.6300, 14.7500],
        [14.8900, 15.0100, 14.2600, 14.4600],
        [14.4500, 14.5100, 14.1000, 14.2700],
        [14.3000, 14.9400, 14.2500, 14.6600],
        [14.9400, 14.9600, 13.1600, 13.9900],
        [13.9300, 14.6100, 13.9300, 14.5000],
        [14.3300, 14.5600, 14.0800, 14.2600],
        [14.1700, 14.2600, 13.1500, 13.3300],
        [13.6200, 13.9500, 12.9000, 13.3700],
        [13.5700, 13.6000, 13.2100, 13.5700],
        [13.6000, 13.7600, 13.0000, 13.0200],
        [13.1400, 13.4200, 12.7000, 13.2600],
        [13.2800, 13.6200, 13.1800, 13.4100],
        [13.4900, 13.6300, 13.3900,     nan],
        [13.3700, 13.9500, 13.3200, 13.6100],
        [13.5000, 14.0700,     nan, 13.9000],
        [14.0100, 14.0500,     nan, 14.0500],
        [14.5200, 14.6800, 14.2500, 14.5700],
        [14.7000, 14.9300, 14.5000, 14.8200],
        [14.9900, 15.2000, 14.8400, 14.9200],
        [14.8500,     nan, 14.7100, 15.1300],
        [15.1400, 15.6000, 14.9500