# 数据操作

In [1]:
import torch

## 1.张量信息

In [2]:
# 1-D tensor holding the integers 0 through 11
x = torch.arange(12)
x

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [3]:
x.shape  # size along each dimension

torch.Size([12])

In [4]:
x.numel() # total number of elements

12

## 2.合并张量

In [5]:
# X: float32 values 0..11 arranged as 3 rows x 4 columns
X = torch.arange(12, dtype=torch.float32).reshape(3, 4)
# Y: integers 12..23 in the same 3x4 layout, built from a nested list
Y = torch.tensor([
    [12, 13, 14, 15],
    [16, 17, 18, 19],
    [20, 21, 22, 23],
])
X, Y

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]),
 tensor([[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]))

In [6]:
# dim counts axes from the outermost one: dim=0 stacks the rows (6x4), dim=1 joins the columns (3x8)
torch.cat((X, Y), dim=0), torch.cat((X, Y), dim=1)

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.],
         [20., 21., 22., 23.]]),
 tensor([[ 0.,  1.,  2.,  3., 12., 13., 14., 15.],
         [ 4.,  5.,  6.,  7., 16., 17., 18., 19.],
         [ 8.,  9., 10., 11., 20., 21., 22., 23.]]))

## 3.运算

In [7]:
X == Y  # element-wise comparison; the result is a boolean tensor

tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

In [8]:
X.sum()  # sum over all elements, returned as a single-element tensor

tensor(66.)

## 4.广播机制

In [9]:
# a is a (3, 1) column and b is a (1, 2) row, used below to demonstrate broadcasting
a = torch.arange(3).unsqueeze(1)
b = torch.arange(2).unsqueeze(0)

In [10]:
a, b  # a is a (3, 1) column, b is a (1, 2) row

(tensor([[0],
         [1],
         [2]]),
 tensor([[0, 1]]))

In [11]:
a + b  # both operands are broadcast to a common (3, 2) shape before the element-wise add

tensor([[0, 1],
        [1, 2],
        [2, 3]])

In [12]:
# c has shape (2, 3, 1); d has shape (3, 2, 3)
c = torch.arange(6).reshape(2, 3, 1)
d = torch.arange(18).reshape(3, 2, 3)
c, d

(tensor([[[0],
          [1],
          [2]],
 
         [[3],
          [4],
          [5]]]),
 tensor([[[ 0,  1,  2],
          [ 3,  4,  5]],
 
         [[ 6,  7,  8],
          [ 9, 10, 11]],
 
         [[12, 13, 14],
          [15, 16, 17]]]))

In [13]:
# c + d  # raises RuntimeError: shapes (2, 3, 1) and (3, 2, 3) cannot be broadcast together.
# Broadcasting compares dimensions starting from the trailing one; each pair must be
# equal or contain a 1. Here the last pair (1 vs 3) is fine, but dim 1 (3 vs 2) and
# dim 0 (2 vs 3) both mismatch. The leading dims do not have to be equal in general:
# a leading size of 1 would broadcast.

## 5.Python内存机制

In [14]:
# Y + X produces a new tensor and rebinds the name Y to it, so the object id changes
before = id(Y)
Y = Y + X
id(Y) == before

False

In [15]:
# Two ways to avoid allocating a new tensor for the result.
# 1. Slice assignment: Z[:] = ... writes the result into the tensor Z already refers to.
#    A plain `Z = Y + X` would only rebind the name Z to a freshly created tensor,
#    and the id check below would then be False.
Z = torch.zeros(Y.shape)
before = id(Z)
Z[:] = Y + X
id(Z) == before

True

In [16]:
before = id(X)
X += Y  # 2. augmented assignment updates X in place, so the id is unchanged
id(X) == before

True

## 6.Numpy转换

In [17]:
# Round-trip between torch and NumPy
A = X.numpy()  # tensor -> numpy.ndarray
B = torch.tensor(A)  # numpy.ndarray -> tensor
type(A), type(B)

(numpy.ndarray, torch.Tensor)

In [18]:
a = torch.tensor([1.5])
a.item()  # extract the single element as a Python scalar

1.5

# 数据预处理

In [19]:
ls

04数据操作.ipynb                     README.md
05线性代数.ipynb                     data/
06矩阵计算&07自动求导.ipynb          figure/


## 1.创建和读取csv文件

In [20]:
import os

In [21]:
# Make sure the data directory exists, then build the CSV path inside it
os.makedirs('data', exist_ok=True)
data_file = os.path.join('data', 'house_tiny.csv')

In [22]:
# Write a tiny housing dataset; each entry is one complete CSV line
rows = (
    'NumRooms,Alley,Price\n',  # header row
    'NA,Pave,127500\n',
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
)
with open(data_file, 'w') as f:
    f.writelines(rows)

In [23]:
import pandas as pd

In [24]:
# Load the CSV into a DataFrame and check the resulting type.
# Only the last expression of a cell is displayed, so the frame itself is not echoed here.
data = pd.read_csv(data_file)
type(data)

pandas.core.frame.DataFrame

## 2.处理缺失数据

In [25]:
# Imputation: split the features (first two columns) from the target (third column),
# then fill missing numeric values with the column mean.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# numeric_only=True restricts the mean to numeric columns. Without it the string
# column 'Alley' triggers a FutureWarning on pandas 1.x and a TypeError on pandas >= 2.0.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

  inputs = inputs.fillna(inputs.mean())


Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [26]:
# One-hot encode the categorical column; dummy_na=True adds an indicator column for NaN.
# dtype=int keeps the indicators numeric: pandas >= 2.0 defaults to bool, which would turn
# inputs.values into an object array that torch.tensor() cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


In [27]:
type(inputs)  # still a DataFrame after one-hot encoding

pandas.core.frame.DataFrame

In [28]:
inputs.values  # the frame's contents as a NumPy array

array([[3., 1., 0.],
       [2., 0., 1.],
       [4., 0., 1.],
       [3., 0., 1.]])

In [29]:
# Convert features and target to tensors.
# Note: this rebinds X, which earlier in the notebook referred to the 3x4 demo tensor.
X, y = torch.tensor(inputs.values), torch.tensor(outputs.values)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

In [30]:
# Deletion approach: drop the column with the most missing values.
# First write a slightly larger sample CSV to work with.
os.makedirs('data', exist_ok=True)
data_file = os.path.join('data', 'house_tiny_more.csv')
csv_rows = [
    'NumRooms,Alley,Owner,Price\n',  # header row
    'NA,Pave,Perry,127500\n',
    '2,NA,NA,106000\n',
    '4,NA,NA,178100\n',
    'NA,NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.write(''.join(csv_rows))

In [31]:
# Load the larger sample into a DataFrame and display it
data = pd.read_csv(data_file)
data

Unnamed: 0,NumRooms,Alley,Owner,Price
0,,Pave,Perry,127500
1,2.0,,,106000
2,4.0,,,178100
3,,,,140000


In [32]:
# Count missing values per column (axis=0 sums down the rows of the boolean mask)
nan_num = data.isnull().sum(axis=0)
nan_num

NumRooms    2
Alley       3
Owner       3
Price       0
dtype: int64

In [33]:
nan_num.idxmax()  # label of the first maximum ('Alley' and 'Owner' tie at 3; the earlier one is returned)

'Alley'

In [34]:
# Drop the column with the most missing values; the result is displayed, not assigned
data.drop(columns=nan_num.idxmax())

Unnamed: 0,NumRooms,Owner,Price
0,,Perry,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000
