In [1]:
# 1.加载原始数据
from scipy.io import arff
import pandas as pd

# download site :https://moa.cms.waikato.ac.nz/datasets/
resource_path = 'E:/FedStream/real_data_set/elecNormNew_arff/elecNormNew.arff'
# 读取 arff 文件
data, meta = arff.loadarff(resource_path)

# 将数据转换为 DataFrame
df = pd.DataFrame(data)

# 输出前6行数据
print(df.head(6))


   date   day    period  nswprice  nswdemand  vicprice  vicdemand  transfer  \
0   0.0  b'2'  0.000000  0.056443   0.439155  0.003467   0.422915  0.414912   
1   0.0  b'2'  0.021277  0.051699   0.415055  0.003467   0.422915  0.414912   
2   0.0  b'2'  0.042553  0.051489   0.385004  0.003467   0.422915  0.414912   
3   0.0  b'2'  0.063830  0.045485   0.314639  0.003467   0.422915  0.414912   
4   0.0  b'2'  0.085106  0.042482   0.251116  0.003467   0.422915  0.414912   
5   0.0  b'2'  0.106383  0.041161   0.207528  0.003467   0.422915  0.414912   

     class  
0    b'UP'  
1    b'UP'  
2    b'UP'  
3    b'UP'  
4  b'DOWN'  
5  b'DOWN'  


In [4]:
# 查看DataFrame的基本信息，如行数、列数、数据类型等
print(df.info())

# 查看DataFrame的描述性统计信息（仅对数值列有效）
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45312 entries, 0 to 45311
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       45312 non-null  float64
 1   day        45312 non-null  object 
 2   period     45312 non-null  float64
 3   nswprice   45312 non-null  float64
 4   nswdemand  45312 non-null  float64
 5   vicprice   45312 non-null  float64
 6   vicdemand  45312 non-null  float64
 7   transfer   45312 non-null  float64
 8   class      45312 non-null  object 
dtypes: float64(7), object(2)
memory usage: 3.1+ MB
None
               date        period      nswprice     nswdemand      vicprice  \
count  45312.000000  45312.000000  45312.000000  45312.000000  45312.000000   
mean       0.499080      0.500000      0.057868      0.425418      0.003467   
std        0.340308      0.294756      0.039991      0.163323      0.010213   
min        0.000000      0.000000      0.000000      0.000000      0.000000   


In [5]:
# 2.数据格式转换
import numpy as np

# 假设 df 是您的 DataFrame，且它包含 'day' 和 'class' 列

# day 列转换
# 将字节字符串转换为整数
df['day'] = df['day'].apply(lambda x: int(x.decode('utf-8')))

# 如果您想要一个列表而不是 DataFrame 列，可以这样做
day_list = df['day'].tolist()

# 确保 day_list 是从 1 到 7 的整数列表
day_list = [i if i in range(1, 8) else None for i in day_list]  # 如果存在非 1-7 的值，将其替换为 None

# class 列转换
# 创建映射字典
class_mapping = {b'UP': 0, b'DOWN': 1}

# 应用映射到 class 列
df['class'] = df['class'].map(lambda x: class_mapping[x])

# 如果您想要一个列表而不是 DataFrame 列，可以这样做
class_list = df['class'].tolist()

# 输出转换后的 day 和 class 信息
# print("day:", day_list)
# print("class:", class_list)

In [6]:
# 查看DataFrame的基本信息，如行数、列数、数据类型等
print(df.info())

# 查看DataFrame的描述性统计信息（仅对数值列有效）
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45312 entries, 0 to 45311
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       45312 non-null  float64
 1   day        45312 non-null  int64  
 2   period     45312 non-null  float64
 3   nswprice   45312 non-null  float64
 4   nswdemand  45312 non-null  float64
 5   vicprice   45312 non-null  float64
 6   vicdemand  45312 non-null  float64
 7   transfer   45312 non-null  float64
 8   class      45312 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 3.1 MB
None
               date           day        period      nswprice     nswdemand  \
count  45312.000000  45312.000000  45312.000000  45312.000000  45312.000000   
mean       0.499080      4.003178      0.500000      0.057868      0.425418   
std        0.340308      1.998695      0.294756      0.039991      0.163323   
min        0.000000      1.000000      0.000000      0.000000      0.000000   
25

In [8]:
# 3.2/8分测试集和训练集
import pandas as pd
from sklearn.model_selection import train_test_split

# 假设 df 是你的 DataFrame

# 确保所有的列都是数值型的，否则你需要处理非数值型数据
# 例如，你可以使用 LabelEncoder 或 OneHotEncoder 来编码分类变量

# 分割 DataFrame 到特征 X 和目标 y
# 假设最后一列是目标变量，其他列是特征
X = df.iloc[:, :-1]  # 特征
y = df.iloc[:, -1]   # 目标

# 使用 train_test_split 分割数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 如果你想要保持 DataFrame 的格式，你可以将 numpy 数组转换回 DataFrame
train_df = pd.DataFrame(X_train, columns=X.columns)
train_df['target'] = y_train

test_df = pd.DataFrame(X_test, columns=X.columns)
test_df['target'] = y_test

# 现在 train_df 包含训练集的特征和目标，test_df 包含测试集的特征和目标
# 将训练集保存为 CSV 文件
train_df.to_csv('E:/FedStream/real_data_set/realdataset0427/elecNorm/train_dataset.csv', index=False)

# 将测试集保存为 CSV 文件
test_df.to_csv('E:/FedStream/real_data_set/realdataset0427/elecNorm/test_dataset.csv', index=False)

In [10]:
# 4.联邦学习数据集
import os
client_nums = 10
per_round_data_nums = 100
base_path = 'E:/FedStream/real_data_set/realdataset0427/elecNorm/Electricity_client_random/'
for r in range(100):    # 0,1,2...36
    for c in range(client_nums):     # 0,1,2...9
        start_index = (r*10 + c)*per_round_data_nums
        # end_index = start_index+per_round_data_nums
        path = os.path.join(base_path,f'client_{c}')
        if not os.path.exists(path):
            os.makedirs(path)
        selected_data = train_df.iloc[start_index:start_index + per_round_data_nums]
        file_path = os.path.join(path,f'round_{r}.csv')
        selected_data.to_csv(file_path, index=False)
        print(f"client: {c} round:{r}")

client: 0 round:0
client: 1 round:0
client: 2 round:0
client: 3 round:0
client: 4 round:0
client: 5 round:0
client: 6 round:0
client: 7 round:0
client: 8 round:0
client: 9 round:0
client: 0 round:1
client: 1 round:1
client: 2 round:1
client: 3 round:1
client: 4 round:1
client: 5 round:1
client: 6 round:1
client: 7 round:1
client: 8 round:1
client: 9 round:1
client: 0 round:2
client: 1 round:2
client: 2 round:2
client: 3 round:2
client: 4 round:2
client: 5 round:2
client: 6 round:2
client: 7 round:2
client: 8 round:2
client: 9 round:2
client: 0 round:3
client: 1 round:3
client: 2 round:3
client: 3 round:3
client: 4 round:3
client: 5 round:3
client: 6 round:3
client: 7 round:3
client: 8 round:3
client: 9 round:3
client: 0 round:4
client: 1 round:4
client: 2 round:4
client: 3 round:4
client: 4 round:4
client: 5 round:4
client: 6 round:4
client: 7 round:4
client: 8 round:4
client: 9 round:4
client: 0 round:5
client: 1 round:5
client: 2 round:5
client: 3 round:5
client: 4 round:5
client: 5 

In [5]:
# 查看day列有几种不同的值
unique_days = df['day'].unique()

# 输出不同的值
print(unique_days)

# 输出不同值的数量
print(len(unique_days))


[b'2' b'3' b'4' b'5' b'6' b'7' b'1']
7


In [6]:
# 查看day列有几种不同的值
unique_days = df['class'].unique()

# 输出不同的值
print(unique_days)

# 输出不同值的数量
print(len(unique_days))


[b'UP' b'DOWN']
2


In [9]:
import numpy as np

# 假设 df 是您的 DataFrame，且它包含 'day' 和 'class' 列

# day 列转换
# 将字节字符串转换为整数
df['day'] = df['day'].apply(lambda x: int(x.decode('utf-8')))

# 如果您想要一个列表而不是 DataFrame 列，可以这样做
day_list = df['day'].tolist()

# 确保 day_list 是从 1 到 7 的整数列表
day_list = [i if i in range(1, 8) else None for i in day_list]  # 如果存在非 1-7 的值，将其替换为 None

# class 列转换
# 创建映射字典
class_mapping = {b'UP': 0, b'DOWN': 1}

# 应用映射到 class 列
df['class'] = df['class'].map(lambda x: class_mapping[x])

# 如果您想要一个列表而不是 DataFrame 列，可以这样做
class_list = df['class'].tolist()

# 输出转换后的 day 和 class 信息
# print("day:", day_list)
# print("class:", class_list)

AttributeError: 'int' object has no attribute 'decode'

In [13]:
print(df.head(5))

print(df.info())

   date  day    period  nswprice  nswdemand  vicprice  vicdemand  transfer  \
0   0.0    2  0.000000  0.056443   0.439155  0.003467   0.422915  0.414912   
1   0.0    2  0.021277  0.051699   0.415055  0.003467   0.422915  0.414912   
2   0.0    2  0.042553  0.051489   0.385004  0.003467   0.422915  0.414912   
3   0.0    2  0.063830  0.045485   0.314639  0.003467   0.422915  0.414912   
4   0.0    2  0.085106  0.042482   0.251116  0.003467   0.422915  0.414912   

   class  
0      0  
1      0  
2      0  
3      0  
4      1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45312 entries, 0 to 45311
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       45312 non-null  float64
 1   day        45312 non-null  int64  
 2   period     45312 non-null  float64
 3   nswprice   45312 non-null  float64
 4   nswdemand  45312 non-null  float64
 5   vicprice   45312 non-null  float64
 6   vicdemand  45312 non-null  float

In [14]:
print(len(df))

45312


In [15]:
print(45312*0.2)

9062.4


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 假设 df 是你的 DataFrame

# 确保所有的列都是数值型的，否则你需要处理非数值型数据
# 例如，你可以使用 LabelEncoder 或 OneHotEncoder 来编码分类变量

# 分割 DataFrame 到特征 X 和目标 y
# 假设最后一列是目标变量，其他列是特征
X = df.iloc[:, :-1]  # 特征
y = df.iloc[:, -1]   # 目标

# 使用 train_test_split 分割数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 如果你想要保持 DataFrame 的格式，你可以将 numpy 数组转换回 DataFrame
train_df = pd.DataFrame(X_train, columns=X.columns)
train_df['target'] = y_train

test_df = pd.DataFrame(X_test, columns=X.columns)
test_df['target'] = y_test

# 现在 train_df 包含训练集的特征和目标，test_df 包含测试集的特征和目标
# 将训练集保存为 CSV 文件
train_df.to_csv('E:/Real DataSet/elecNorm/train_dataset.csv', index=False)

# 将测试集保存为 CSV 文件
test_df.to_csv('E:/Real DataSet/elecNorm/test_dataset.csv', index=False)

In [18]:
print(len(train_df))
print(train_df.head(5))
print(train_df.info)

36249
           date  day    period  nswprice  nswdemand  vicprice  vicdemand  \
9440   0.027078    2  0.680851  0.077939   0.446742  0.003467   0.422915   
21026  0.451794    6  0.042553  0.071935   0.566498  0.004861   0.521233   
39510  0.898190    6  0.127660  0.044734   0.315085  0.002896   0.404454   
41039  0.902615    2  1.000000  0.091750   0.494496  0.006076   0.495339   
20825  0.451573    1  0.872340  0.044164   0.689081  0.002422   0.429052   

       transfer  target  
9440   0.414912       1  
21026  0.496053       0  
39510  0.707895       1  
41039  0.498246       0  
20825  0.726316       0  
<bound method DataFrame.info of            date  day    period  nswprice  nswdemand  vicprice  vicdemand  \
9440   0.027078    2  0.680851  0.077939   0.446742  0.003467   0.422915   
21026  0.451794    6  0.042553  0.071935   0.566498  0.004861   0.521233   
39510  0.898190    6  0.127660  0.044734   0.315085  0.002896   0.404454   
41039  0.902615    2  1.000000  0.091750   0.

In [19]:
print(36249/10/100)
# 10 个客户端 每个客户端每轮100个数据，36轮

36.249


In [20]:
import os
client_nums = 10
per_round_data_nums = 100
base_path = 'E:/Real DataSet/elecNorm/Electricity_client/'
for r in range(100):    # 0,1,2...36
    for c in range(client_nums):     # 0,1,2...9
        start_index = (r*10 + c)*per_round_data_nums
        # end_index = start_index+per_round_data_nums
        path = os.path.join(base_path,f'client_{c}')
        if not os.path.exists(path):
            os.makedirs(path)
        selected_data = train_df.iloc[start_index:start_index + per_round_data_nums]
        file_path = os.path.join(path,f'round_{r}.csv')
        selected_data.to_csv(file_path, index=False)
        print(f"client: {c} round:{r}")

client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c} round:{r}
client: {c