# 结合数据范围生成伪数据

1. 确定数据字段名，包括特征名称以及标签名称，便于后续代码使用

2. 根据不同特征数据范围，值类型，生成用于测试模型训练过程的伪数据

In [1]:
import pandas as pd
import random
import pickle
import time
import numpy as np

# 0. 字段名指定

## 特征字段名

key为字段名，对应的值为二元tuple，表示数据范围的下界和上界

In [2]:
# Feature name -> (lower bound, upper bound) of the random values generated
# for that feature. Keys are kept in this order when the columns are built.
feature_range_dict = {
    'q': (0, 2),
    'powerloss': (0, 10_000),
    'f': (130, 146),
    'vout': (15, 19),
    'iout': (0, 2500),
    'pa': (0, 65),
    'prx': (0, 65),
    'bty_temp': (0, 50),
    'chnl': (0, 8),
    # NOTE(review): if ce_pkg is meant to be a signed 8-bit value, the usual
    # range would be (-128, 127) -- confirm these bounds are intended.
    'ce_pkg': (-127, 128),
    'rpp_pkg': (0, 65),
    'ss_pkg': (0, 255),
}

## 标签字段名

标签值从key对应的tuple中随机选择；tuple列出该标签的全部可选取值，可以不止2个。

In [3]:
# Label name -> tuple of the values that label may take; one of them is
# picked at random for every generated row.
label_enum_dict = {'location': (0, 1)}

# 1. 数据生成

根据字段对应的数值范围随机生成`volume`组（行）数据，并且对应标签，最终合并为`pandas.DataFrame`并输出为`csv`格式。

In [4]:
# Total number of rows (samples) to generate.
volume = 2_000

In [5]:
# Build one column per feature: `volume` values drawn uniformly between the
# feature's lower and upper bound, each rounded to two decimals.
data_dict = {}

for name, (low, high) in feature_range_dict.items():
    # random.uniform(low, high) evaluates low + (high - low) * random(),
    # i.e. the same scaling of a single random() draw per value.
    data_dict[name] = [np.round(random.uniform(low, high), 2) for _ in range(volume)]

In [6]:
# Candidate values for the 'location' label and how many of them there are.
location_values = label_enum_dict['location']
len_label = len(location_values)

# One label per row, chosen by a uniformly random position in the tuple.
# randrange(len_label) is the same draw as randint(0, len_label - 1).
data_dict['location'] = [location_values[random.randrange(len_label)] for _ in range(volume)]

In [7]:
# 合并为一个dataframe

In [8]:
# Assemble the generated columns into one DataFrame (one column per dict key).
all_df = pd.DataFrame(data_dict)

In [9]:
# Bare expression: shows the complete generated table as the cell output.
all_df

Unnamed: 0,q,powerloss,f,vout,iout,pa,prx,bty_temp,chnl,ce_pkg,rpp_pkg,ss_pkg,location
0,0.69,7695.29,132.96,17.25,1429.84,43.87,28.87,21.68,5.31,3.51,14.84,50.94,0
1,0.97,9664.46,143.65,17.28,1560.53,2.10,43.45,16.62,7.76,-26.50,7.28,171.56,0
2,1.05,5507.92,141.29,16.98,1242.61,59.19,42.80,10.70,4.01,-27.59,20.97,246.86,0
3,1.59,475.65,137.89,16.38,1053.25,11.03,28.87,31.86,1.19,-63.91,29.55,99.43,1
4,0.88,9344.26,142.11,16.19,869.88,16.16,38.69,45.30,3.98,-96.23,42.80,178.67,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.06,9886.33,145.86,17.77,2466.15,2.43,42.79,17.77,0.66,-64.51,9.95,63.76,1
1996,0.70,1884.72,131.70,18.71,2098.92,24.01,32.94,43.13,5.85,43.80,58.26,56.95,1
1997,0.99,4620.61,138.95,15.97,101.14,37.60,2.60,22.76,4.65,31.74,50.19,240.43,0
1998,1.94,2193.35,145.14,18.23,1419.46,45.97,52.62,46.60,7.57,-6.55,55.83,238.40,1


# 2. 训练/测试数据划分

划分**测试集**比例为`ratio`，其余为训练集，划分之后的数据保存在`../data/`文件夹下，分别为`train.csv`，`test.csv`。

In [10]:
# Fraction of all rows that is sampled into the test set; the remaining rows
# form the training set.
ratio = 0.2

In [11]:
# Randomly pick the test rows: the row count times `ratio`, truncated to an int.
# NOTE(review): no random_state is passed, so the split differs between runs.
test_df = all_df.sample(n=int(all_df.shape[0] * ratio))

In [12]:
# Training set = every row whose index label was not sampled into the test set.
train_df = all_df.drop(index=test_df.index)

In [13]:
# Bare expression: shows the training split as the cell output.
train_df

Unnamed: 0,q,powerloss,f,vout,iout,pa,prx,bty_temp,chnl,ce_pkg,rpp_pkg,ss_pkg,location
0,0.69,7695.29,132.96,17.25,1429.84,43.87,28.87,21.68,5.31,3.51,14.84,50.94,0
2,1.05,5507.92,141.29,16.98,1242.61,59.19,42.80,10.70,4.01,-27.59,20.97,246.86,0
3,1.59,475.65,137.89,16.38,1053.25,11.03,28.87,31.86,1.19,-63.91,29.55,99.43,1
4,0.88,9344.26,142.11,16.19,869.88,16.16,38.69,45.30,3.98,-96.23,42.80,178.67,1
5,0.76,8723.38,135.10,16.28,32.41,57.28,3.30,20.26,6.39,-56.76,24.67,217.64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,1.19,219.94,133.68,18.96,1888.73,19.25,62.39,38.99,3.13,123.83,45.27,196.81,0
1995,0.06,9886.33,145.86,17.77,2466.15,2.43,42.79,17.77,0.66,-64.51,9.95,63.76,1
1996,0.70,1884.72,131.70,18.71,2098.92,24.01,32.94,43.13,5.85,43.80,58.26,56.95,1
1998,1.94,2193.35,145.14,18.23,1419.46,45.97,52.62,46.60,7.57,-6.55,55.83,238.40,1


In [14]:
# Bare expression: shows the test split as the cell output.
test_df

Unnamed: 0,q,powerloss,f,vout,iout,pa,prx,bty_temp,chnl,ce_pkg,rpp_pkg,ss_pkg,location
937,1.39,7446.25,141.15,16.46,2217.89,25.57,10.39,19.73,4.97,91.85,59.74,27.38,0
137,1.78,8150.96,138.82,17.73,127.12,20.14,23.83,38.82,6.05,8.32,37.52,250.57,0
484,1.28,7208.11,135.86,17.22,1363.76,19.97,2.25,9.92,7.05,-99.74,25.95,180.16,0
378,1.21,7410.40,130.41,17.16,1253.10,27.39,27.57,45.61,1.19,78.93,21.73,20.43,1
450,0.92,6185.60,142.12,15.84,1721.00,38.37,21.93,34.30,1.99,-124.73,9.24,23.56,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352,1.67,9251.21,134.42,16.34,1087.26,40.10,7.62,46.19,0.01,36.60,30.72,177.24,0
8,1.64,7125.13,139.30,15.57,23.10,27.26,40.02,39.53,7.48,36.43,33.30,217.21,0
570,0.75,9407.88,135.00,16.94,441.83,36.68,40.97,45.80,0.72,100.10,60.04,232.68,1
795,0.72,4015.89,144.27,17.78,190.46,7.48,52.03,0.47,5.64,54.96,59.74,90.15,1


### 无重叠部分

In [15]:
# Sanity check: index labels present in both splits (an empty set means no overlap).
set(train_df.index).intersection(test_df.index)

set()

## 文件保存

In [17]:
# Write the training split to CSV; the index is written as a column named 'index'.
train_df.to_csv(path_or_buf='../data/train.csv', index_label='index')

In [18]:
# Write the test split to CSV; the index is written as a column named 'index'.
test_df.to_csv(path_or_buf='../data/test.csv', index_label='index')