In [None]:
import wandb
# Log in to Weights & Biases before any run or sweep is created in later cells.
wandb.login()

In [None]:
import sys
sys.path.append("..")
import os
import time
import launchpad as lp
import datetime
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau
import numpy as np
import h5py as h5
import matplotlib.pyplot as plt
from utils.PDE_Net import DeepONet_NS, FNO2d, weight_init
from utils.DataGenerate_FNO import Dataset_FNO, Normalize, InNormalize
from torch.utils.tensorboard import SummaryWriter
from argparse import Namespace
from utils.utilities3 import *

# Make float32 the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.float32)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Sampling parameters. With dx = 0.5 these evaluate to
# n_x = 16, n_y = 6, p_x = 51 and p_y = 51.
# NOTE(review): presumably 8 x 3 is the physical domain size and 800 x 300 the
# grid resolution, which would make (p_x, p_y) the points per patch — confirm.
dx = 0.5
n_x = int(8/dx)
n_y = int(3/dx)
p_x = int(800/n_x+1)
p_y = int(300/n_y+1)
self_split = 2

# Hyperparameters for a single (non-sweep) training run.
config = Namespace(
	project_name = 'FNO',
	epochs = 100,
	
	# Geometry values computed above; p_x and p_y are the two spatial sizes
	# of the tensors built in create_dataloader().
	dx = dx,
	n_x = n_x,
	n_y = n_y,
	p_x = p_x,
	p_y = p_y,
	self_split = self_split,

	# Checkpoint folder and the folders holding the .npy inputs ("distance")
	# and targets ("label") for the training and test splits.
	path_trained_model = r'trained_model',
	path_distance = r'../train_data/distance',
	path_label = r'../train_data/label',
	path_distance_test = r'../test_data/distance',
	path_label_test = r'../test_data/label',

	batch_size = 15,
	modes = 12,
	width = 40,

	activation = 'Sigmoid',
	# The activation name is resolved in train() with torch.nn.__dict__[name]().
	# The optimizer name is resolved in train() with torch.optim.__dict__[name](params=..., lr=...).
	optim_type = 'SGD',
	learning_rate = 0.0026392500276311696,
	dropout = 0,
	# Weight of the explicit L2-norm penalty added to the loss in train_epoch().
	weight_decay = 0.00035181732255007376,
	# Gradient-norm clipping threshold used in train_epoch().
	max_norm = 3.637374450623043,
)

In [None]:
# Objective reported to the sweep: minimize the logged training loss.
metric = {'name': 'loss_train', 'goal': 'minimize'}

# Complete sweep definition written as one literal: random search, the metric
# above, fixed and searched parameters, and hyperband early termination.
sweep_config = {
	'method': 'random',
	'metric': metric,
	'parameters': {
		# --- values that are identical for every run of the sweep ---
		'project_name': {'value': 'FNO'},
		'epochs': {'value': 100},
		'dx': {'value': 0.5},
		'n_x': {'value': 16},
		'n_y': {'value': 6},
		'p_x': {'value': 51},
		'p_y': {'value': 51},
		'self_split': {'value': 2},

		'path_trained_model': {'value': r'trained_model'},
		'path_distance': {'value': r'../train_data/distance'},
		'path_label': {'value': r'../train_data/label'},
		'path_distance_test': {'value': r'../test_data/distance'},
		'path_label_test': {'value': r'../test_data/label'},

		# --- values explored by the sweep ---
		'batch_size': {'values': [100,150]},
		'modes': {'distribution': 'q_uniform', 'q': 4, 'min': 8, 'max': 20},
		'width': {'distribution': 'q_uniform', 'q': 20, 'min': 20, 'max': 120},

		# Activation and optimizer names are looked up in torch.nn / torch.optim
		# by name inside train().
		'activation': {'values': ['Tanh', 'ReLU', 'Sigmoid', 'LeakyReLU', 'GELU']},
		'optim_type': {'values': ['Adam','SGD','AdamW']},
		'learning_rate': {'distribution': 'log_uniform_values', 'min': 1e-5, 'max': 1e-1},
		'dropout': {'distribution': 'q_uniform', 'q': 0.2, 'min': 0, 'max': 0.6},
		'weight_decay': {'distribution': 'log_uniform_values', 'min': 1e-5, 'max': 1e-1},
		'max_norm': {'distribution': 'uniform', 'min': 1, 'max': 10},
	},
	'early_terminate': {
		'type':'hyperband',
		'min_iter':3,
		'eta':2,
		's':3
	},
}

# Initialize the sweep controller; the returned id is used by the agent cells.
sweep_id = wandb.sweep(sweep_config, project='FNO')

In [None]:
def create_dataloader(config):
	"""Load the .npy samples, normalize them and wrap them in DataLoaders.

	Every ``*.npy`` file in the four configured folders is loaded in sorted
	file-name order and concatenated along dimension 0.

	Args:
		config: object exposing ``p_x``, ``p_y``, ``batch_size`` and the folder
			paths ``path_distance``, ``path_label``, ``path_distance_test`` and
			``path_label_test`` as attributes.

	Returns:
		``(train_loader, test_loader, x_normalizer, y_normalizer,
		x_normalizer_test, y_normalizer_test)``. The two ``*_test`` entries are
		kept for interface compatibility and are the same objects as the
		training normalizers: the test split is encoded with the statistics of
		the training split, so the network sees test inputs on the scale it was
		trained on and no test-set statistics enter the evaluation.
	"""
	def _load_stack(directory, channels):
		# The leading empty (0, p_x, p_y, channels) tensor keeps the result
		# shape for an empty folder and makes torch.cat reject files whose
		# trailing dimensions disagree with the configuration. Concatenating
		# once avoids re-copying the growing tensor for every file.
		pieces = [torch.empty(0, config.p_x, config.p_y, channels)]
		for file_name in sorted(os.listdir(directory)):
			if file_name.endswith('.npy'):
				pieces.append(torch.Tensor(np.load(os.path.join(directory, file_name))))
		return torch.cat(pieces, 0)

	train_distance = _load_stack(config.path_distance, 3)
	train_label = _load_stack(config.path_label, 1)
	test_distance = _load_stack(config.path_distance_test, 3)
	test_label = _load_stack(config.path_label_test, 1)

	x_normalizer = GaussianNormalizer(train_distance)
	x_data = x_normalizer.encode(train_distance)
	y_normalizer = GaussianNormalizer(train_label)
	y_data = y_normalizer.encode(train_label)

	# Reuse the training statistics for the test split (see docstring).
	x_normalizer_test = x_normalizer
	y_normalizer_test = y_normalizer
	x_test = x_normalizer_test.encode(test_distance)
	y_test = y_normalizer_test.encode(test_label)

	train_loader = DataLoader(torch.utils.data.TensorDataset(x_data, y_data), batch_size=config.batch_size, shuffle=True)
	test_loader = DataLoader(torch.utils.data.TensorDataset(x_test, y_test), batch_size=config.batch_size, shuffle=False)

	return train_loader, test_loader, x_normalizer, y_normalizer, x_normalizer_test, y_normalizer_test

In [None]:
def train_epoch(config,model,optimizer,myloss,scheduler,train_loader,test_loader,x_normalizer,y_normalizer,device):
	"""Train ``model`` for one pass over ``train_loader``.

	For every batch the prediction and the target are decoded with
	``y_normalizer`` before the loss is evaluated. The optimized loss is
	``myloss(pred, y)`` plus ``config.weight_decay`` times the sum of the
	L2 norms of all model parameters. Gradients are clipped to
	``config.max_norm`` before the optimizer step.

	Args:
		config: object exposing ``weight_decay`` and ``max_norm``.
		model, optimizer, myloss: network, its optimizer and the loss callable.
		scheduler, test_loader, x_normalizer: accepted for signature
			compatibility; not used in this function.
		train_loader: iterable of ``(input, target)`` batches supporting ``len``.
		y_normalizer: object whose ``decode`` is applied to predictions and targets.
		device: device the batches are moved to.

	Returns:
		``(model, optimizer, train_loss_epoch)`` where ``train_loss_epoch`` is
		the mean over batches of the regularized loss.
	"""
	model.train()
	train_loss_epoch = 0
	for data_x, data_y in train_loader:
		x = data_x.float().to(device)
		y = data_y.float().to(device)

		# Compare in decoded (physical) units rather than normalized ones.
		pred = y_normalizer.decode(model(x))
		y = y_normalizer.decode(y)

		regularization_loss = sum(torch.norm(param, p=2) for param in model.parameters())
		loss_train = myloss(pred, y) + config.weight_decay * regularization_loss
		train_loss_epoch = train_loss_epoch + loss_train.item()

		optimizer.zero_grad()
		# A fresh graph is built for every batch, so it does not have to be
		# retained after the backward pass; retaining it only costs memory.
		loss_train.backward()
		torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
		optimizer.step()
	train_loss_epoch = train_loss_epoch / len(train_loader)
	return model, optimizer, train_loss_epoch

def eval_epoch(config,model,optimizer,myloss,scheduler,train_loader,test_loader,x_normalizer_test,y_normalizer_test,device):
	"""Return the mean per-batch loss of ``model`` on ``test_loader``.

	The model is switched to eval mode and no gradients are recorded. Both the
	prediction and the target are decoded with ``y_normalizer_test`` before
	``myloss`` is applied. ``config``, ``optimizer``, ``scheduler``,
	``train_loader`` and ``x_normalizer_test`` are accepted but not used here.
	"""
	model.eval()
	batch_losses = []
	with torch.no_grad():
		for inputs, targets in test_loader:
			inputs = inputs.float().to(device)
			targets = targets.float().to(device)

			prediction = y_normalizer_test.decode(model(inputs))
			reference = y_normalizer_test.decode(targets)

			batch_losses.append(myloss(prediction.clone(), reference.clone()).item())
	# Average over the number of batches, not over the number of samples.
	return sum(batch_losses) / len(test_loader)

In [None]:
def train(is_model_saved=False):
	"""Run one W&B sweep trial and return the trained model and its optimizer.

	``wandb.init()`` is called without arguments and every hyperparameter is
	read from ``wandb.config``, so this function is meant to be started by a
	sweep agent.

	Args:
		is_model_saved: when True, resume from the last file (by sorted file
			name) in ``wandb.config.path_trained_model``; if the folder holds
			no file, training starts from scratch.

	Returns:
		``(model, optimizer)`` after the last epoch.

	NOTE(review): a later cell defines another function that is also named
	``train``; when the notebook is executed top to bottom that definition
	replaces this one.
	"""
	# On the server the search is started with a bare wandb.init() (no arguments).
	with wandb.init():
		run_config = wandb.config
		train_loader, test_loader, x_normalizer, y_normalizer, x_normalizer_test, y_normalizer_test = create_dataloader(run_config)
		device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
		activation = torch.nn.__dict__[run_config.activation]()
		# 'modes' and 'width' are sampled from q_uniform distributions, which
		# may hand back floats such as 12.0; the casts are no-ops for ints.
		modes = int(run_config.modes)
		width = int(run_config.width)
		model = FNO2d(modes, modes, width, activation, run_config.dropout)
		model.apply(weight_init)
		model.to(device)

		# The optimizer has to exist before a checkpoint is restored because
		# load_model() receives it as an argument.
		optimizer = torch.optim.__dict__[run_config.optim_type](params=model.parameters(), lr=run_config.learning_rate)

		begin_epoch = 0
		if is_model_saved:
			file_list = sorted(os.listdir(run_config.path_trained_model))
			if file_list:
				# presumably load_model restores both objects and returns the
				# stored epoch — confirm against its definition
				load_path = os.path.join(run_config.path_trained_model, file_list[-1])
				begin_epoch = load_model(load_path, optimizer, model)
			else:
				print('no checkpoint found in', run_config.path_trained_model, '- training from scratch')

		myloss = nn.MSELoss()
		# The scheduler is handed to train_epoch/eval_epoch, which do not step it.
		scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=100, verbose=True, min_lr=1e-6)
		model.run_id = wandb.run.id

		# Make sure the checkpoint folder exists before the first save.
		os.makedirs(run_config.path_trained_model, exist_ok=True)
		for epoch in range(begin_epoch, run_config.epochs+1):
			model, optimizer, train_loss_epoch = train_epoch(run_config,model,optimizer,myloss,scheduler,train_loader,test_loader,x_normalizer,y_normalizer,device)
			test_loss_epoch = eval_epoch(run_config,model,optimizer,myloss,scheduler,train_loader,test_loader,x_normalizer_test, y_normalizer_test,device)
			print('epoch:', epoch, 'loss_train:', train_loss_epoch, 'loss_test:', test_loss_epoch)
			wandb.log({'epoch': epoch, 'loss_train': train_loss_epoch, 'loss_test': test_loss_epoch})
			if epoch % 50 == 0:
				# NOTE(review): the file name contains the sweep id and the
				# epoch only, so runs of the same sweep write to the same path.
				save_path = os.path.join(run_config.path_trained_model, 'FNO_{}_{}.pth'.format(sweep_id,epoch))
				save_model(save_path, epoch, optimizer, model)
		# Leaving the ``with`` block finishes the run; no explicit finish needed.
	return model, optimizer

In [None]:
def train(is_model_saved=False):
	"""Train one model and log it to W&B; usable stand-alone or under a sweep agent.

	The run is initialised with the notebook-level ``config`` as defaults and
	all hyperparameters are then read back from ``wandb.config``. In a
	stand-alone call these are the values of ``config``. When the function is
	started by ``wandb.agent`` the sweep-controlled keys take precedence, so
	the sweep's hyperparameters are actually used instead of being ignored.

	Args:
		is_model_saved: when True, resume from the last file (by sorted file
			name) in ``path_trained_model``; if the folder holds no file,
			training starts from scratch.

	Returns:
		``(model, optimizer)`` after the last epoch.
	"""
	nowtime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
	wandb.init(project=config.project_name, config=config.__dict__, name=nowtime, save_code=True)
	try:
		run_config = wandb.config
		train_loader, test_loader, x_normalizer, y_normalizer, x_normalizer_test, y_normalizer_test = create_dataloader(run_config)
		device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
		activation = torch.nn.__dict__[run_config.activation]()
		# Sweep values sampled from q_uniform may arrive as floats such as
		# 12.0; the casts are no-ops for the integer values in ``config``.
		modes = int(run_config.modes)
		width = int(run_config.width)
		model = FNO2d(modes, modes, width, activation, run_config.dropout)
		model.apply(weight_init)
		model.to(device)

		# The optimizer has to exist before a checkpoint is restored because
		# load_model() receives it as an argument.
		optimizer = torch.optim.__dict__[run_config.optim_type](params=model.parameters(), lr=run_config.learning_rate)

		begin_epoch = 0
		if is_model_saved:
			file_list = sorted(os.listdir(run_config.path_trained_model))
			if file_list:
				# presumably load_model restores both objects and returns the
				# stored epoch — confirm against its definition
				load_path = os.path.join(run_config.path_trained_model, file_list[-1])
				begin_epoch = load_model(load_path, optimizer, model)
			else:
				print('no checkpoint found in', run_config.path_trained_model, '- training from scratch')

		myloss = nn.MSELoss()
		# The scheduler is handed to train_epoch/eval_epoch, which do not step it.
		scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=100, verbose=True, min_lr=1e-6)
		model.run_id = wandb.run.id

		# Make sure the checkpoint folder exists before the first save.
		os.makedirs(run_config.path_trained_model, exist_ok=True)
		for epoch in range(begin_epoch, run_config.epochs+1):
			model, optimizer, train_loss_epoch = train_epoch(run_config,model,optimizer,myloss,scheduler,train_loader,test_loader,x_normalizer,y_normalizer,device)
			test_loss_epoch = eval_epoch(run_config,model,optimizer,myloss,scheduler,train_loader,test_loader,x_normalizer_test, y_normalizer_test,device)
			print('epoch:', epoch, 'loss_train:', train_loss_epoch, 'loss_test:', test_loss_epoch)
			wandb.log({'epoch': epoch, 'loss_train': train_loss_epoch, 'loss_test': test_loss_epoch})
			if epoch % 50 == 0:
				save_path = os.path.join(run_config.path_trained_model, 'FNO_{}_{}.pth'.format(sweep_id,epoch))
				save_model(save_path, epoch, optimizer, model)
	finally:
		# Close the run even when data loading or training raises.
		wandb.finish()
	return model, optimizer

In [None]:
# Train once from scratch (no checkpoint resume) with the currently defined train().
model, optimizer = train(is_model_saved=False)

In [None]:
# Start the sweep agent: run up to 50 trials of train() for the sweep `sweep_id`.
wandb.agent(sweep_id, train, count=50)

### Parallel execution (并行)

In [None]:
# Settings for the parallel sweep launcher; read by set_wandb() and test_sweep().
args = Namespace(
	team_name = '423team',
	project_name = 'FNO',
	# The attribute name is misspelled ("expertiment"), but set_wandb() reads it
	# under exactly this name.
	expertiment_name = 'FNO',
	# Root folder; set_wandb() appends project_name/expertiment_name to it.
	wandb_log_path = r"../../wandb_results/",
	# Number of SweepWorker nodes that make_program() adds.
	worker_num = 2,
)

In [None]:
class SweepWorker():
	"""Worker node that runs a W&B agent for one sweep and then stops Launchpad."""

	def __init__(self,args,sweep_id):
		"""Keep the launcher settings and the id of the sweep to serve."""
		self.sweep_id = sweep_id
		self.args = args

	def run(self):
		"""Run the agent with ``train`` as trial function, wait 5 s, stop the program."""
		target_sweep = self.sweep_id
		wandb.agent(target_sweep, function=train)
		# Short pause after the agent returns, before the program is stopped.
		time.sleep(5)
		lp.stop()

In [None]:
def make_program(worker_num,args,sweep_id):
	"""Build a Launchpad program named 'wandb_sweep' with ``worker_num`` sweep workers.

	Every worker is a ``CourierNode`` wrapping ``SweepWorker(args, sweep_id)``
	and is added inside the 'sweep_worker' group. The program is returned
	without being launched.
	"""
	program = lp.Program('wandb_sweep')
	with program.group('sweep_worker'):
		for _ in range(worker_num):
			worker_node = lp.CourierNode(SweepWorker,args,sweep_id)
			program.add_node(worker_node)
	return program

In [None]:
def set_wandb(args):
	"""Create the W&B log folder and export the W&B environment variables.

	The folder is ``<wandb_log_path>/<project_name>/<expertiment_name>``.
	Missing parent folders are created and an existing folder is accepted.

	Args:
		args: object exposing ``wandb_log_path``, ``project_name``,
			``expertiment_name`` and ``team_name``.

	Side effects:
		Sets ``WANDB_ENTITY``, ``WANDB_PROJECT`` and ``WANDB_DIR`` in
		``os.environ``.
	"""
	run_dir = Path(args.wandb_log_path)/args.project_name/args.expertiment_name
	# Path.mkdir takes (mode, parents, exist_ok); passing the path string as
	# the first argument would be interpreted as the mode and fail.
	run_dir.mkdir(parents=True, exist_ok=True)

	os.environ["WANDB_ENTITY"] = args.team_name
	os.environ["WANDB_PROJECT"] = args.project_name
	os.environ["WANDB_DIR"] = str(run_dir)

In [None]:
def test_sweep(args):
	"""Register a new sweep and serve it with ``args.worker_num`` local worker processes.

	``set_wandb(args)`` is called first, then a sweep is created from the
	notebook-level ``sweep_config`` and a Launchpad program built by
	``make_program`` is launched with launch type 'local_mp'.
	"""
	set_wandb(args)

	new_sweep_id = wandb.sweep(sweep_config)
	sweep_program = make_program(args.worker_num,args,new_sweep_id)
	lp.launch(sweep_program, launch_type='local_mp')

In [None]:
# Launch the parallel sweep with the settings stored in `args`.
test_sweep(args)