## 程序功能：基于数据均值和方差对数据标准化

In [1]:
import numpy as np
import csv
from time import *
from sklearn.preprocessing import StandardScaler

### 读取数据集文件

In [2]:
begin_time = time()                     # start time of the file read
# The context manager closes the file even if reading raises.
with open("kddcup.data.numerization_corrected.txt") as data_numerization:
    lines = data_numerization.readlines()
line_nums = len(lines)
x_data = np.zeros((line_nums, 42))      # matrix with line_nums rows and 42 columns
for i, raw_line in enumerate(lines):
    line = raw_line.strip().split(',')
    x_data[i, :] = line[0:42]           # keep the first 42 comma-separated fields of the row
print('数据集大小：',x_data.shape)

# Timing
end_time = time()                      # end time of the file read
total_time = end_time-begin_time       # elapsed seconds for loading
print('读取文件耗时：',total_time,'s')

数据集大小： (4898430, 42)
读取文件耗时： 174.93708682060242 s


### 指定GPU

In [3]:
import torch
from torch import nn

# Report the CUDA status. The per-device queries raise when no CUDA device
# is present, so they are only issued when CUDA is available.
cuda_available = torch.cuda.is_available()
print(cuda_available)                       # whether a GPU can be used
print(torch.cuda.device_count())            # number of GPUs
if cuda_available:
    print(torch.cuda.current_device())      # index of the current GPU
    print(torch.cuda.get_device_name(0))    # name of GPU 0

# Use GPU 0 when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if cuda_available else 'cpu')

True
1
0
GeForce RTX 2060


### 函数功能：数据标准化

下面四种方案定义的都是同名函数 `Zscore_Normalization`，后执行的单元会覆盖之前的定义；对比耗时时，请只运行要测试的那一种方案，再运行后面的标准化单元。

#### 方案一

In [4]:
# Scheme 1: on the CPU, element-by-element Python loop.
def Zscore_Normalization(x, n):
    """Z-score normalize one feature column and store it in the global ``x_data``.

    Parameters
    ----------
    x : 1-D numpy array
        Values of the feature column. It may be a view of ``x_data[:, n]``.
    n : int
        Index of the ``x_data`` column that receives the result.

    Side effects
    ------------
    Overwrites ``x_data[:, n]`` in place (with zeros when the standard
    deviation of ``x`` is 0) and prints a progress line.
    """
    # Compute the statistics once, before any element is overwritten. When
    # `x` is a view of x_data[:, n], recomputing them inside the loop would
    # mix already-normalized values into the mean/std and would also cost a
    # full pass over the column for every element.
    mean = np.mean(x)
    std = np.std(x)
    if std == 0:
        x_data[:, n] = 0
    else:
        for i in range(len(x)):
            x_data[i][n] = (x[i] - mean) / std
    print("The ", n , "feature  is normalizing.")

#### 方案二

In [5]:
# Scheme 2: torch tensors on the selected device, element-by-element loop.
def Zscore_Normalization(x, n):
    """Z-score normalize one feature column using torch scalars in a Python loop.

    Parameters
    ----------
    x : 1-D numpy array
        Values of the feature column.
    n : int
        Index of the ``x_data`` column that receives the result.

    Side effects
    ------------
    Overwrites ``x_data[:, n]`` in place (with zeros when the standard
    deviation of ``x`` is 0) and prints a progress line.

    Uses the module-level ``device`` set in the device-selection cell instead
    of a hard-coded ``'cuda:0'``, so the choice made there is honoured.
    """
    std_value = np.std(x)               # computed once, reused for the zero check
    if std_value == 0:
        x_data[:, n] = 0
    else:
        mean = torch.tensor(np.mean(x), device=device)
        std = torch.tensor(std_value, device=device)
        # torch.tensor copies the data, so writes to x_data below do not
        # change the values being read.
        x = torch.tensor(x, device=device)
        for i in range(len(x)):
            x_data[i][n] = ((x[i] - mean) / std).cpu().numpy()
    print("The ", n , "feature  is normalizing.")

#### 方案三

In [6]:
# Scheme 3: torch tensors on the selected device, whole column at once via broadcasting.
def Zscore_Normalization(x, n):
    """Z-score normalize one feature column with a single broadcast tensor expression.

    Parameters
    ----------
    x : 1-D numpy array
        Values of the feature column.
    n : int
        Index of the ``x_data`` column that receives the result.

    Side effects
    ------------
    Overwrites ``x_data[:, n]`` in place (with zeros when the standard
    deviation of ``x`` is 0) and prints a progress line.

    Uses the module-level ``device`` set in the device-selection cell instead
    of a hard-coded ``'cuda:0'``, so the choice made there is honoured.
    """
    std_value = np.std(x)               # computed once, reused for the zero check
    if std_value == 0:
        x_data[:, n] = 0
    else:
        mean = torch.tensor(np.mean(x), device=device)
        std = torch.tensor(std_value, device=device)
        column = torch.tensor(x, device=device)
        # The 0-d mean/std tensors broadcast over the 1-D column, so no
        # reshape or transpose is needed before copying the result back.
        x_data[:, n] = ((column - mean) / std).cpu().numpy()
    print("The ", n , "feature  is normalizing.")

#### 方案四

In [7]:
# Scheme 4: on the CPU, whole column at once via NumPy broadcasting.
def Zscore_Normalization(x, n):
    """Z-score normalize one feature column with a vectorized NumPy expression.

    Parameters
    ----------
    x : 1-D numpy array
        Values of the feature column.
    n : int
        Index of the ``x_data`` column that receives the result.

    Side effects
    ------------
    Overwrites ``x_data[:, n]`` in place (with zeros when the standard
    deviation of ``x`` is 0) and prints a progress line.
    """
    sigma = np.std(x)
    # A constant column cannot be scaled; it is written as all zeros.
    normalized = 0 if sigma == 0 else (x - np.mean(x)) / sigma
    x_data[:, n] = normalized
    print("The ", n , "feature  is normalizing.")

#### 获取某列特征，并依次标准化

In [8]:
begin_time = time()                     # start time of the normalization
# Walk every column that x_data actually has, rather than repeating the
# literal 42 used when the array was allocated.
for i in range(x_data.shape[1]):
    Zscore_Normalization(x_data[:, i], i)

# Timing
end_time = time()                      # end time of the normalization
total_time = end_time-begin_time       # elapsed seconds for normalization
print('标准化耗时：',total_time,'s')

The  0 feature  is normalizing.
The  1 feature  is normalizing.
The  2 feature  is normalizing.
The  3 feature  is normalizing.
The  4 feature  is normalizing.
The  5 feature  is normalizing.
The  6 feature  is normalizing.
The  7 feature  is normalizing.
The  8 feature  is normalizing.
The  9 feature  is normalizing.
The  10 feature  is normalizing.
The  11 feature  is normalizing.
The  12 feature  is normalizing.
The  13 feature  is normalizing.
The  14 feature  is normalizing.
The  15 feature  is normalizing.
The  16 feature  is normalizing.
The  17 feature  is normalizing.
The  18 feature  is normalizing.
The  19 feature  is normalizing.
The  20 feature  is normalizing.
The  21 feature  is normalizing.
The  22 feature  is normalizing.
The  23 feature  is normalizing.
The  24 feature  is normalizing.
The  25 feature  is normalizing.
The  26 feature  is normalizing.
The  27 feature  is normalizing.
The  28 feature  is normalizing.
The  29 feature  is normalizing.
The  30 feature  is 

### 方案五

In [9]:
# Scheme 5: normalize with scikit-learn's StandardScaler.
begin_time = time()                               # start of the timed section
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)             # the returned array replaces x_data

# Timing
end_time = time()                                 # end of the timed section
total_time = end_time-begin_time                  # elapsed seconds for normalization
print('标准化耗时：',total_time,'s')

### 将标准化后的数据集写入文件

In [10]:
# Write the normalized data set as CSV, one array row per line.
# The context manager closes the file even if a write fails.
with open("kddcup.data.numerization_corrected_normalizing_StandardScaler.txt",'w', newline='') as data_normalizing:
    csv_writer = csv.writer(data_normalizing)
    # writerows iterates over the rows of x_data, replacing the manual index loop.
    csv_writer.writerows(x_data)
print('数据标准化done！')

数据标准化done！
