# 结合数据范围生成伪数据

1. 确定数据字段名，包括特征名称以及标签名称，便于后续代码使用

2. 根据不同特征数据范围，值类型，生成用于测试模型训练过程的伪数据

In [2]:
import pandas as pd
import random
import pickle
import time
import numpy as np

In [28]:
import json

# 0. 字段名指定

## 特征字段名

key为字段名，对应的值为二元tuple，表示数据范围的下界和上界

In [3]:
# feature_range_dict = {
#     'q':(0,2),
#     'powerloss':(0,10_000),
#     'f':(130,146),
#     'vout':(15,19),
#     'iout':(0,2500),
#     'pa':(0,65),
#     'prx':(0,65),
#     'bty_temp':(0,50),
#     'chnl':(0,8),
#     'ce_pkg':(-127,128),
#     'rpp_pkg':(0,65),
#     'ss_pkg':(0,255)
# }

## 240403更新字段名以及范围

In [4]:
# Feature name -> (lower bound, upper bound) of the value range used when
# generating synthetic data for that feature (field names/ranges as of 240403).
feature_range_dict = {
    "q/qm": (0, 200),
    "ploss": (-10_000, 10_000),
    "fre": (130, 146),
    "vpa": (0, 400_000),
    "papower": (0, 65_000),
    "ch": (0, 8),
    "ce": (-127, 128),
    "rppower": (0, 65_000),
    "ss": (0, 255),
}

## 保存字段名以及范围

In [29]:
# Persist the feature names and their ranges as JSON for use by later code.
# ensure_ascii=False keeps any non-ASCII text readable in the output file.
with open('../data/settings/feature.json', mode='w', encoding='utf-8') as settings_file:
    json.dump(feature_range_dict, settings_file, ensure_ascii=False)

## 标签字段名

key为标签字段名，对应的值为tuple；生成数据时从该tuple中随机选择一个值作为标签，tuple可以包含不止2个取值。

In [5]:
# label_enum_dict = {
#     'location':(0,1)
# }

## 240403更新字段名以及范围

In [31]:
# Label name -> tuple of the values the label may take; one value is picked
# at random for every generated row.
label_enum_dict = {"index": (0, 1)}

In [32]:
# The first key of label_enum_dict is used as the name of the label column.
label_name = list(label_enum_dict)[0]

In [33]:
# Persist the label name and its allowed values as JSON for use by later code.
with open('../data/settings/label.json', mode='w', encoding='utf-8') as settings_file:
    json.dump(label_enum_dict, settings_file, ensure_ascii=False)

# 1. 数据生成

根据字段对应的数值范围随机生成`volume`组（行）数据，并且对应标签，最终合并为`pandas.DataFrame`并输出为`csv`格式。

In [7]:
# Total number of rows (samples) to generate
volume = 2000

In [8]:
# Build one column per feature: `volume` random values spread across the
# feature's (lower, upper) range, each rounded to two decimals.
data_dict = {}

for feature_name, bounds in feature_range_dict.items():
    low = bounds[0]
    span = bounds[1] - low
    data_dict[feature_name] = [
        np.round(random.random() * span + low, 2) for _ in range(volume)
    ]

In [16]:
# Number of distinct values the label may take
len_label = len(label_enum_dict[label_name])

# Draw one label per row by picking a random position in the allowed values.
label_values = label_enum_dict[label_name]
data_dict[label_name] = [
    label_values[random.randrange(len_label)] for _ in range(volume)
]

In [17]:
# 合并为一个dataframe

In [18]:
# Combine the feature columns and the label column into a single DataFrame.
all_df = pd.DataFrame(data_dict)

In [19]:
all_df

Unnamed: 0,q/qm,ploss,fre,vpa,papower,ch,ce,rppower,ss,label
0,198.77,-7927.89,136.95,144057.92,52106.39,2.14,-76.36,30783.94,40.74,0
1,96.76,-4195.86,138.96,385606.71,44998.05,3.88,-17.85,22583.66,62.06,0
2,149.07,7182.13,130.27,166750.47,52636.74,7.18,-100.37,20416.82,142.55,1
3,142.85,-7676.84,133.78,358046.00,56869.08,1.79,-118.42,8575.75,162.81,1
4,127.28,1737.13,130.07,352906.42,15344.73,1.87,-61.86,63146.95,246.69,0
...,...,...,...,...,...,...,...,...,...,...
1995,77.66,4481.46,144.74,233672.72,1254.42,2.54,-16.97,3209.58,133.14,1
1996,108.30,-6507.48,140.75,272898.79,3591.44,3.73,-84.97,8075.41,54.10,1
1997,34.29,2112.78,139.07,106782.66,1469.88,3.88,14.91,14936.08,187.90,1
1998,35.28,7787.81,141.65,260014.61,11648.36,2.29,32.69,21777.53,36.87,0


# 2. 训练/测试数据划分

划分**测试集**比例为`ratio`，其余为训练集，划分之后的数据保存在`../data/`文件夹下，分别为`train240403.csv`，`test240403.csv`。

In [20]:
# Fraction of all rows sampled into the test set; the remaining rows form the training set
ratio = 0.2

In [21]:
# Randomly pick the test rows; the row count is len(all_df) * ratio truncated to an int.
n_test_rows = int(len(all_df) * ratio)
test_df = all_df.sample(n=n_test_rows)

In [22]:
# The training set is every row of all_df whose index label was not sampled into test_df.
train_df = all_df.drop(index=test_df.index)

In [23]:
train_df

Unnamed: 0,q/qm,ploss,fre,vpa,papower,ch,ce,rppower,ss,label
2,149.07,7182.13,130.27,166750.47,52636.74,7.18,-100.37,20416.82,142.55,1
3,142.85,-7676.84,133.78,358046.00,56869.08,1.79,-118.42,8575.75,162.81,1
5,32.41,7406.75,139.70,167346.88,3628.09,7.15,-108.40,52907.06,131.24,1
7,120.80,-8267.27,139.37,343808.91,4818.37,7.79,17.19,45174.25,66.85,0
8,77.19,-3954.32,135.37,20229.85,34621.88,5.26,-67.05,5402.63,4.85,1
...,...,...,...,...,...,...,...,...,...,...
1995,77.66,4481.46,144.74,233672.72,1254.42,2.54,-16.97,3209.58,133.14,1
1996,108.30,-6507.48,140.75,272898.79,3591.44,3.73,-84.97,8075.41,54.10,1
1997,34.29,2112.78,139.07,106782.66,1469.88,3.88,14.91,14936.08,187.90,1
1998,35.28,7787.81,141.65,260014.61,11648.36,2.29,32.69,21777.53,36.87,0


In [24]:
test_df

Unnamed: 0,q/qm,ploss,fre,vpa,papower,ch,ce,rppower,ss,label
1592,123.87,-4254.67,141.31,313603.31,54133.45,7.16,89.72,47018.35,89.84,0
1216,156.69,6794.92,142.29,73244.08,9096.87,3.54,-125.11,19089.67,37.63,0
322,115.38,6381.90,140.56,34030.19,52450.63,2.91,-11.55,21042.39,208.55,0
1568,139.13,1685.76,140.32,268036.84,24740.19,0.84,105.76,18372.77,31.39,1
1129,71.27,-3718.19,143.46,283132.98,3122.53,6.28,-119.73,63915.03,106.23,0
...,...,...,...,...,...,...,...,...,...,...
1117,185.97,-4841.51,132.52,398671.91,3608.81,7.11,-78.20,56714.00,233.16,1
56,143.11,8353.45,145.64,169688.65,6280.25,0.88,-36.27,55246.25,72.03,0
290,96.11,-9426.75,144.18,395041.16,57105.27,6.54,10.98,3851.12,172.91,0
802,87.83,7283.53,133.56,68501.92,50577.08,4.78,-118.11,27634.08,197.92,0


### 无重叠部分

In [25]:
# Sanity check: index labels shared by the train and test splits (expected: empty set).
set(train_df.index).intersection(test_df.index)

set()

## 文件保存

In [26]:
# Write the training split to CSV; the DataFrame's row index is saved as a column with header 'index'.
# NOTE(review): label_enum_dict currently names the label column 'index' too, so the CSV
# header would contain 'index' twice — confirm how downstream readers handle this.
train_df.to_csv('../data/train240403.csv',index_label='index')

In [27]:
# Write the test split to CSV; the DataFrame's row index is saved as a column with header 'index'.
# NOTE(review): label_enum_dict currently names the label column 'index' too, so the CSV
# header would contain 'index' twice — confirm how downstream readers handle this.
test_df.to_csv('../data/test240403.csv',index_label='index')