diff --git a/Project.toml b/Project.toml index f939bba1..20f449aa 100644 --- a/Project.toml +++ b/Project.toml @@ -6,13 +6,10 @@ version = "0.10.5" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CellularAutomata = "878138dc-5b27-11ea-1a71-cb95d38d6b29" -Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" WeightInitializers = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d" @@ -29,19 +26,17 @@ Adapt = "4.1.1" Aqua = "0.8" CellularAutomata = "0.0.2" DifferentialEquations = "7.15.0" -Distances = "0.10" LIBSVM = "0.8" LinearAlgebra = "1.10" MLJLinearModels = "0.9.2, 0.10" NNlib = "0.9.26" -PartialFunctions = "1.2" Random = "1.10" Reexport = "1.2.2" SafeTestsets = "0.1" Statistics = "1.10" StatsBase = "0.34.4" Test = "1" -WeightInitializers = "1.0.4" +WeightInitializers = "1.0.5" julia = "1.10" [extras] @@ -51,7 +46,9 @@ LIBSVM = "b1bec4e5-fd48-53fe-b0cb-9723c09d164b" MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Aqua", "Test", "SafeTestsets", "Random", "DifferentialEquations", "MLJLinearModels", "LIBSVM"] +test = ["Aqua", "Test", "SafeTestsets", "Random", "DifferentialEquations", + "MLJLinearModels", "LIBSVM", "Statistics"] diff --git a/src/ReservoirComputing.jl b/src/ReservoirComputing.jl index e5cc8b68..8cf16ea7 100644 --- a/src/ReservoirComputing.jl +++ b/src/ReservoirComputing.jl @@ -1,126 +1,16 @@ module ReservoirComputing -using Adapt -using CellularAutomata -using Distances -using LinearAlgebra -using NNlib -using PartialFunctions -using Random +using Adapt: adapt +using CellularAutomata: CellularAutomaton +using LinearAlgebra: eigvals, mul!, I +using NNlib: fast_act, sigmoid +using Random: Random, AbstractRNG using Reexport: Reexport, @reexport -using Statistics using StatsBase: sample using WeightInitializers: DeviceAgnostic, PartialFunction, Utils @reexport using WeightInitializers -#define global types abstract type AbstractReservoirComputer end -abstract type AbstractOutputLayer end -abstract type AbstractPrediction end -#should probably move some of these -abstract type AbstractGRUVariant end - -#general output layer struct -struct OutputLayer{T, I, S, L} <: AbstractOutputLayer - training_method::T - output_matrix::I - out_size::S - last_value::L -end - -#prediction types -""" - Generative(prediction_len) - -A prediction strategy that enables models to generate autonomous multi-step -forecasts by recursively feeding their own outputs back as inputs for -subsequent prediction steps. - -# Parameters - - - `prediction_len::Int`: The number of future steps to predict. - -# Description - -The `Generative` prediction method allows a model to perform multi-step -forecasting by using its own previous predictions as inputs for future predictions. -This approach is especially useful in time series analysis, where each prediction -depends on the preceding data points. 
- -At each step, the model takes the current input, generates a prediction, -and then incorporates that prediction into the input for the next step. -This recursive process continues until the specified -number of prediction steps (`prediction_len`) is reached. -""" -struct Generative{T} <: AbstractPrediction - prediction_len::T -end - -struct Predictive{I, T} <: AbstractPrediction - prediction_data::I - prediction_len::T -end - -""" - Predictive(prediction_data) - -A prediction strategy for supervised learning tasks, -where a model predicts labels based on a provided set -of input features (`prediction_data`). - -# Parameters - - - `prediction_data`: The input data used for prediction, typically structured as a matrix - where each column represents a sample, and each row represents a feature. - -# Description - -The `Predictive` prediction method is a standard approach -in supervised machine learning tasks. It uses the provided input data -(`prediction_data`) to produce corresponding labels or outputs based -on the learned relationships in the model. Unlike generative prediction, -this method does not recursively feed predictions into the model; -instead, it operates on fixed input data to produce a single batch of predictions. - -This method is suitable for tasks like classification, -regression, or other use cases where the input features -and the number of steps are predefined. -""" -function Predictive(prediction_data) - prediction_len = size(prediction_data, 2) - Predictive(prediction_data, prediction_len) -end - -#fallbacks for initializers #eventually to remove once migrated to WeightInitializers.jl -for initializer in (:rand_sparse, :delay_line, :delay_line_backward, :cycle_jumps, - :simple_cycle, :pseudo_svd, - :scaled_rand, :weighted_init, :informed_init, :minimal_init) - @eval begin - function ($initializer)(dims::Integer...; kwargs...) - return $initializer(Utils.default_rng(), Float32, dims...; kwargs...) - end - function ($initializer)(rng::AbstractRNG, dims::Integer...; kwargs...) - return $initializer(rng, Float32, dims...; kwargs...) - end - function ($initializer)(::Type{T}, dims::Integer...; kwargs...) where {T <: Number} - return $initializer(Utils.default_rng(), T, dims...; kwargs...) - end - - # Partial application - function ($initializer)(rng::AbstractRNG; kwargs...) - return PartialFunction.Partial{Nothing}($initializer, rng, kwargs) - end - function ($initializer)(::Type{T}; kwargs...) where {T <: Number} - return PartialFunction.Partial{T}($initializer, nothing, kwargs) - end - function ($initializer)(rng::AbstractRNG, ::Type{T}; kwargs...) where {T <: Number} - return PartialFunction.Partial{T}($initializer, rng, kwargs) - end - function ($initializer)(; kwargs...) 
- return PartialFunction.Partial{Nothing}($initializer, nothing, kwargs) - end - end -end #general include("states.jl") @@ -130,8 +20,7 @@ include("predict.jl") include("train/linear_regression.jl") #esn -include("esn/esn_input_layers.jl") -include("esn/esn_reservoirs.jl") +include("esn/esn_inits.jl") include("esn/esn_reservoir_drivers.jl") include("esn/esn.jl") include("esn/deepesn.jl") @@ -155,9 +44,7 @@ export scaled_rand, weighted_init, informed_init, minimal_init export rand_sparse, delay_line, delay_line_backward, cycle_jumps, simple_cycle, pseudo_svd export RNN, MRNN, GRU, GRUParams, FullyGated, Minimal export train -export ESN -export HybridESN, KnowledgeModel -export DeepESN +export ESN, HybridESN, KnowledgeModel, DeepESN export RECA, sample export RandomMapping, RandomMaps export Generative, Predictive, OutputLayer diff --git a/src/esn/deepesn.jl b/src/esn/deepesn.jl index 7ce458fd..d9018128 100644 --- a/src/esn/deepesn.jl +++ b/src/esn/deepesn.jl @@ -44,27 +44,22 @@ temporal features. Default is an RNN model. - `nla_type`: The type of non-linear activation used in the reservoir. Default is `NLADefault()`. - - `states_type`: Defines the type of states used in the ESN (e.g., standard states). - Default is `StandardStates()`. - - `washout`: The number of initial timesteps to be discarded in the ESN's training phase. - Default is 0. - - `rng`: Random number generator used for initializing weights. Default is the package's - default random number generator. + - `states_type`: Defines the type of states used in the ESN + (e.g., standard states). Default is `StandardStates()`. + - `washout`: The number of initial timesteps to be discarded + in the ESN's training phase. Default is 0. + - `rng`: Random number generator used for initializing weights. + Default is `Utils.default_rng()`. - `matrix_type`: The type of matrix used for storing the training data. Default is inferred from `train_data`. # Example ```julia -# Prepare your training data -train_data = [your_training_data_here] +train_data = rand(Float32, 3, 100) # Create a DeepESN with specific parameters -deepESN = DeepESN(train_data, 10, 100; depth=3, washout=100) - -# Proceed with training and prediction (pseudocode) -train(deepESN, target_data) -prediction = predict(deepESN, new_data) +deepESN = DeepESN(train_data, 3, 100; depth=3, washout=100) ``` """ function DeepESN(train_data, @@ -82,7 +77,7 @@ function DeepESN(train_data, matrix_type=typeof(train_data)) if states_type isa AbstractPaddedStates in_size = size(train_data, 1) + 1 - train_data = vcat(Adapt.adapt(matrix_type, ones(1, size(train_data, 2))), + train_data = vcat(adapt(matrix_type, ones(1, size(train_data, 2))), train_data) end diff --git a/src/esn/esn.jl b/src/esn/esn.jl index b585db2a..07824cc4 100644 --- a/src/esn/esn.jl +++ b/src/esn/esn.jl @@ -15,33 +15,41 @@ end """ ESN(train_data; kwargs...) -> ESN -Creates an Echo State Network (ESN) using specified parameters and training data, suitable for various machine learning tasks. +Creates an Echo State Network (ESN). -# Parameters +# Arguments - - `train_data`: Matrix of training data (columns as time steps, rows as features). + - `train_data`: Matrix of training data `num_features x time_steps`. - `variation`: Variation of ESN (default: `Default()`). - `input_layer`: Input layer of ESN. - `reservoir`: Reservoir of the ESN. - `bias`: Bias vector for each time step. + - `rng`: Random number generator used for initializing weights. + Default is `Utils.default_rng()`. 
- `reservoir_driver`: Mechanism for evolving reservoir states (default: `RNN()`). - `nla_type`: Non-linear activation type (default: `NLADefault()`). - `states_type`: Format for storing states (default: `StandardStates()`). - `washout`: Initial time steps to discard (default: `0`). - `matrix_type`: Type of matrices used internally (default: type of `train_data`). -# Returns - - - An initialized ESN instance with specified parameters. - # Examples -```julia -using ReservoirComputing - -train_data = rand(10, 100) # 10 features, 100 time steps - -esn = ESN(train_data; reservoir=RandSparseReservoir(200), washout=10) +```jldoctest +julia> train_data = rand(Float32, 10, 100) # 10 features, 100 time steps +10×100 Matrix{Float32}: + 0.567676 0.154756 0.584611 0.294015 … 0.573946 0.894333 0.429133 + 0.327073 0.729521 0.804667 0.263944 0.559342 0.020167 0.897862 + 0.453606 0.800058 0.568311 0.749441 0.0713146 0.464795 0.532854 + 0.0173253 0.536959 0.722116 0.910328 0.00224048 0.00202501 0.631075 + 0.366744 0.119761 0.100593 0.125122 0.700562 0.675474 0.102947 + 0.539737 0.768351 0.54681 0.648672 … 0.256738 0.223784 0.94327 + 0.558099 0.42676 0.1948 0.735625 0.0989234 0.119342 0.624182 + 0.0603135 0.929999 0.263439 0.0372732 0.066125 0.332769 0.25562 + 0.4463 0.334423 0.444679 0.311695 0.0494497 0.27171 0.214925 + 0.987182 0.898593 0.295241 0.233098 0.789699 0.453692 0.759205 + +julia> esn = ESN(train_data, 10, 300; washout=10) +ESN(10 => 300) ``` """ function ESN(train_data, @@ -58,7 +66,7 @@ function ESN(train_data, matrix_type=typeof(train_data)) if states_type isa AbstractPaddedStates in_size = size(train_data, 1) + 1 - train_data = vcat(Adapt.adapt(matrix_type, ones(1, size(train_data, 2))), + train_data = vcat(adapt(matrix_type, ones(1, size(train_data, 2))), train_data) end @@ -86,6 +94,10 @@ function (esn::AbstractEchoStateNetwork)(prediction::AbstractPrediction, kwargs...) end +function Base.show(io::IO, esn::ESN) + print(io, "ESN(", size(esn.train_data, 1), " => ", size(esn.reservoir_matrix, 1), ")") +end + #training dispatch on esn """ train(esn::AbstractEchoStateNetwork, target_data, training_method = StandardRidge(0.0)) @@ -98,27 +110,27 @@ Trains an Echo State Network (ESN) using the provided target data and a specifie - `target_data`: Supervised training data for the ESN. - `training_method`: The method for training the ESN (default: `StandardRidge(0.0)`). -# Returns - - - The trained ESN model. Its type and structure depend on `training_method` and the ESN's implementation. - -# Returns - -The trained ESN model. The exact type and structure of the return value depends on the -`training_method` and the specific ESN implementation. 
- -```julia -using ReservoirComputing - -# Initialize an ESN instance and target data -esn = ESN(train_data; reservoir=RandSparseReservoir(200), washout=10) -target_data = rand(size(train_data, 2)) - -# Train the ESN using the default training method -trained_esn = train(esn, target_data) - -# Train the ESN using a custom training method -trained_esn = train(esn, target_data; training_method=StandardRidge(1.0)) +# Example + +```jldoctest +julia> train_data = rand(Float32, 10, 100) # 10 features, 100 time steps +10×100 Matrix{Float32}: + 0.11437 0.425367 0.585867 0.34078 … 0.0531493 0.761425 0.883164 + 0.301373 0.497806 0.279603 0.802417 0.49873 0.270156 0.333333 + 0.135224 0.660179 0.394233 0.512753 0.901221 0.784377 0.687691 + 0.510203 0.877234 0.614245 0.978405 0.332775 0.768826 0.527077 + 0.955027 0.398322 0.312156 0.981938 0.473357 0.156704 0.476101 + 0.353024 0.997632 0.164328 0.470783 … 0.745613 0.85797 0.465201 + 0.966044 0.194299 0.599167 0.040475 0.0996013 0.325959 0.770103 + 0.292068 0.495138 0.481299 0.214566 0.819573 0.155951 0.227168 + 0.133498 0.451058 0.0761995 0.90421 0.994212 0.332164 0.545112 + 0.214467 0.791524 0.124105 0.951805 0.947166 0.954244 0.889733 + +julia> esn = ESN(train_data, 10, 300; washout=10) +ESN(10 => 300) + +julia> output_layer = train(esn, rand(Float32, 3, 90)) +OutputLayer successfully trained with output size: 3 ``` """ function train(esn::AbstractEchoStateNetwork, diff --git a/src/esn/esn_inits.jl b/src/esn/esn_inits.jl new file mode 100644 index 00000000..929d23f3 --- /dev/null +++ b/src/esn/esn_inits.jl @@ -0,0 +1,688 @@ +### input layers +""" + scaled_rand([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + scaling=0.1) + +Create and return a matrix with random values, uniformly distributed within +a range defined by `scaling`. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the matrix. Should follow `res_size x in_size`. + - `scaling`: A scaling factor to define the range of the uniform distribution. + The matrix elements will be randomly chosen from the + range `[-scaling, scaling]`. Defaults to `0.1`. + +# Examples + +```jldoctest +julia> res_input = scaled_rand(8, 3) +8×3 Matrix{Float32}: + -0.0669356 -0.0292692 -0.0188943 + 0.0159724 0.004071 -0.0737949 + 0.026355 -0.0191563 0.0714962 + -0.0177412 0.0279123 0.0892906 + -0.0184405 0.0567368 0.0190222 + 0.0944272 0.0679244 0.0148647 + -0.0799005 -0.0891089 -0.0444782 + -0.0970182 0.0934286 0.03553 +``` +""" +function scaled_rand(rng::AbstractRNG, ::Type{T}, dims::Integer...; + scaling=T(0.1)) where {T <: Number} + res_size, in_size = dims + layer_matrix = (DeviceAgnostic.rand(rng, T, res_size, in_size) .- T(0.5)) .* + (T(2) * scaling) + return layer_matrix +end + +""" + weighted_init([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + scaling=0.1) + +Create and return a matrix representing a weighted input layer. +This initializer generates a weighted input matrix with random non-zero +elements distributed uniformly within the range [-`scaling`, `scaling`] [^Lu2017]. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the matrix. Should follow `res_size x in_size`. + - `scaling`: The scaling factor for the weight distribution. 
+ Defaults to `0.1`. + +# Examples + +```jldoctest +julia> res_input = weighted_init(8, 3) +6×3 Matrix{Float32}: + 0.0452399 0.0 0.0 + -0.0348047 0.0 0.0 + 0.0 -0.0386004 0.0 + 0.0 0.00981022 0.0 + 0.0 0.0 0.0577838 + 0.0 0.0 -0.0562827 +``` + +[^Lu2017]: Lu, Zhixin, et al. + "Reservoir observers: Model-free inference of unmeasured variables in + chaotic systems." + Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102. +""" +function weighted_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + scaling=T(0.1)) where {T <: Number} + approx_res_size, in_size = dims + res_size = Int(floor(approx_res_size / in_size) * in_size) + layer_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) + q = floor(Int, res_size / in_size) + + for i in 1:in_size + layer_matrix[((i - 1) * q + 1):((i) * q), i] = (DeviceAgnostic.rand(rng, T, q) .- + T(0.5)) .* (T(2) * scaling) + end + + return layer_matrix +end + +""" + informed_init([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + scaling=0.1, model_in_size, gamma=0.5) + +Create an input layer for informed echo state networks [^Pathak2018]. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the matrix. Should follow `res_size x in_size`. + - `scaling`: The scaling factor for the input matrix. + Default is 0.1. + - `model_in_size`: The size of the input model. + - `gamma`: The gamma value. Default is 0.5. + +# Examples + +[^Pathak2018]: Pathak, Jaideep, et al. "Hybrid forecasting of chaotic processes: + Using machine learning in conjunction with a knowledge-based model." + Chaos: An Interdisciplinary Journal of Nonlinear Science 28.4 (2018). +""" +function informed_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + scaling=T(0.1), model_in_size, gamma=T(0.5)) where {T <: Number} + res_size, in_size = dims + state_size = in_size - model_in_size + + if state_size <= 0 + throw(DimensionMismatch("in_size must be greater than model_in_size")) + end + + input_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) + zero_connections = DeviceAgnostic.zeros(rng, T, in_size) + num_for_state = floor(Int, res_size * gamma) + num_for_model = floor(Int, res_size * (1 - gamma)) + + for i in 1:num_for_state + idxs = findall(Bool[zero_connections .== input_matrix[i, :] + for i in 1:size(input_matrix, 1)]) + random_row_idx = idxs[DeviceAgnostic.rand(rng, T, 1:end)] + random_clm_idx = range(1, state_size; step=1)[DeviceAgnostic.rand(rng, T, 1:end)] + input_matrix[random_row_idx, random_clm_idx] = (DeviceAgnostic.rand(rng, T) - + T(0.5)) .* (T(2) * scaling) + end + + for i in 1:num_for_model + idxs = findall(Bool[zero_connections .== input_matrix[i, :] + for i in 1:size(input_matrix, 1)]) + random_row_idx = idxs[DeviceAgnostic.rand(rng, T, 1:end)] + random_clm_idx = range(state_size + 1, in_size; step=1)[DeviceAgnostic.rand( + rng, T, 1:end)] + input_matrix[random_row_idx, random_clm_idx] = (DeviceAgnostic.rand(rng, T) - + T(0.5)) .* (T(2) * scaling) + end + + return input_matrix +end + +""" + minimal_init([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + sampling_type=:bernoulli, weight=0.1, irrational=pi, start=1, p=0.5) + +Create a layer matrix with uniform weights determined by `weight`. The sign difference +is randomly determined by the `sampling` chosen. + +# Arguments + + - `rng`: Random number generator. 
Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the matrix. Should follow `res_size x in_size`. + - `weight`: The weight used to fill the layer matrix. Default is 0.1. + - `sampling_type`: The sampling parameters used to generate the input matrix. + Default is `:bernoulli`. + - `irrational`: Irrational number chosen for sampling if `sampling_type=:irrational`. + Default is `pi`. + - `start`: Starting value for the irrational sample. Default is 1 + - `p`: Probability for the Bernoulli sampling. Lower probability increases negative + value. Higher probability increases positive values. Default is 0.5 + +# Examples + +```jldoctest +julia> res_input = minimal_init(8, 3) +8×3 Matrix{Float32}: + 0.1 -0.1 0.1 + -0.1 0.1 0.1 + -0.1 -0.1 0.1 + -0.1 -0.1 -0.1 + 0.1 0.1 0.1 + -0.1 -0.1 -0.1 + -0.1 -0.1 0.1 + 0.1 -0.1 0.1 + +julia> res_input = minimal_init(8, 3; sampling_type=:irrational) +8×3 Matrix{Float32}: + -0.1 0.1 -0.1 + 0.1 -0.1 -0.1 + 0.1 0.1 -0.1 + 0.1 0.1 0.1 + -0.1 -0.1 -0.1 + 0.1 0.1 0.1 + 0.1 0.1 -0.1 + -0.1 0.1 -0.1 + +julia> res_input = minimal_init(8, 3; p=0.1) # lower p -> more negative signs +8×3 Matrix{Float32}: + -0.1 -0.1 -0.1 + -0.1 -0.1 -0.1 + -0.1 -0.1 -0.1 + -0.1 -0.1 -0.1 + 0.1 -0.1 -0.1 + -0.1 -0.1 -0.1 + -0.1 -0.1 -0.1 + -0.1 -0.1 -0.1 + +julia> res_input = minimal_init(8, 3; p=0.8)# higher p -> more positive signs +8×3 Matrix{Float32}: + 0.1 0.1 0.1 + -0.1 0.1 0.1 + -0.1 0.1 0.1 + 0.1 0.1 0.1 + 0.1 0.1 0.1 + 0.1 -0.1 0.1 + -0.1 0.1 0.1 + 0.1 0.1 0.1 +``` +""" +function minimal_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + sampling_type::Symbol=:bernoulli, weight::Number=T(0.1), irrational::Real=pi, + start::Int=1, p::Number=T(0.5)) where {T <: Number} + res_size, in_size = dims + if sampling_type == :bernoulli + layer_matrix = _create_bernoulli(p, res_size, in_size, weight, rng, T) + elseif sampling_type == :irrational + layer_matrix = _create_irrational(irrational, + start, + res_size, + in_size, + weight, + rng, + T) + else + error("Sampling type not allowed. Please use one of :bernoulli or :irrational") + end + return layer_matrix +end + +function _create_bernoulli(p::Number, res_size::Int, in_size::Int, weight::Number, + rng::AbstractRNG, ::Type{T}) where {T <: Number} + input_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) + for i in 1:res_size + for j in 1:in_size + if DeviceAgnostic.rand(rng, T) < p + input_matrix[i, j] = weight + else + input_matrix[i, j] = -weight + end + end + end + return input_matrix +end + +function _create_irrational(irrational::Irrational, start::Int, res_size::Int, + in_size::Int, weight::Number, rng::AbstractRNG, + ::Type{T}) where {T <: Number} + setprecision(BigFloat, Int(ceil(log2(10) * (res_size * in_size + start + 1)))) + ir_string = string(BigFloat(irrational)) |> collect + deleteat!(ir_string, findall(x -> x == '.', ir_string)) + ir_array = DeviceAgnostic.zeros(rng, T, length(ir_string)) + input_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) + + for i in 1:length(ir_string) + ir_array[i] = parse(Int, ir_string[i]) + end + + for i in 1:res_size + for j in 1:in_size + random_number = DeviceAgnostic.rand(rng, T) + input_matrix[i, j] = random_number < 0.5 ? 
-weight : weight + end + end + + return T.(input_matrix) +end + +### reservoirs + +""" + rand_sparse([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + radius=1.0, sparsity=0.1, std=1.0) + +Create and return a random sparse reservoir matrix. +The matrix will be of size specified by `dims`, with specified `sparsity` +and scaled spectral radius according to `radius`. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the reservoir matrix. + - `radius`: The desired spectral radius of the reservoir. + Defaults to 1.0. + - `sparsity`: The sparsity level of the reservoir matrix, + controlling the fraction of zero elements. Defaults to 0.1. + +# Examples + +```jldoctest +julia> res_matrix = rand_sparse(5, 5; sparsity=0.5) +5×5 Matrix{Float32}: + 0.0 0.0 0.0 0.0 0.0 + 0.0 0.794565 0.0 0.26164 0.0 + 0.0 0.0 -0.931294 0.0 0.553706 + 0.723235 -0.524727 0.0 0.0 0.0 + 1.23723 0.0 0.181824 -1.5478 0.465328 +``` +""" +function rand_sparse(rng::AbstractRNG, ::Type{T}, dims::Integer...; + radius=T(1.0), sparsity=T(0.1), std=T(1.0)) where {T <: Number} + lcl_sparsity = T(1) - sparsity #consistency with current implementations + reservoir_matrix = sparse_init(rng, T, dims...; sparsity=lcl_sparsity, std=std) + rho_w = maximum(abs.(eigvals(reservoir_matrix))) + reservoir_matrix .*= radius / rho_w + if Inf in unique(reservoir_matrix) || -Inf in unique(reservoir_matrix) + error("Sparsity too low for size of the matrix. Increase res_size or increase sparsity") + end + return reservoir_matrix +end + +""" + delay_line([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + weight=0.1) + +Create and return a delay line reservoir matrix [^Rodan2010]. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the reservoir matrix. + - `weight`: Determines the value of all connections in the reservoir. + Default is 0.1. + +# Examples + +```jldoctest +julia> res_matrix = delay_line(5, 5) +5×5 Matrix{Float32}: + 0.0 0.0 0.0 0.0 0.0 + 0.1 0.0 0.0 0.0 0.0 + 0.0 0.1 0.0 0.0 0.0 + 0.0 0.0 0.1 0.0 0.0 + 0.0 0.0 0.0 0.1 0.0 + +julia> res_matrix = delay_line(5, 5; weight=1) +5×5 Matrix{Float32}: + 0.0 0.0 0.0 0.0 0.0 + 1.0 0.0 0.0 0.0 0.0 + 0.0 1.0 0.0 0.0 0.0 + 0.0 0.0 1.0 0.0 0.0 + 0.0 0.0 0.0 1.0 0.0 +``` + +[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network." + IEEE transactions on neural networks 22.1 (2010): 131-144. +""" +function delay_line(rng::AbstractRNG, ::Type{T}, dims::Integer...; + weight=T(0.1)) where {T <: Number} + reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) + @assert length(dims) == 2&&dims[1] == dims[2] "The dimensions + must define a square matrix (e.g., (100, 100))" + + for i in 1:(dims[1] - 1) + reservoir_matrix[i + 1, i] = weight + end + + return reservoir_matrix +end + +""" + delay_line_backward([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + weight = 0.1, fb_weight = 0.2) + +Create a delay line backward reservoir with the specified by `dims` and weights. +Creates a matrix with backward connections as described in [^Rodan2010]. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. 
+ Default is `Float32`. + - `dims`: Dimensions of the reservoir matrix. + - `weight`: The weight determines the absolute value of + forward connections in the reservoir. Default is 0.1 + - `fb_weight`: Determines the absolute value of backward connections + in the reservoir. Default is 0.2 + +# Examples + +```jldoctest +julia> res_matrix = delay_line_backward(5, 5) +5×5 Matrix{Float32}: + 0.0 0.2 0.0 0.0 0.0 + 0.1 0.0 0.2 0.0 0.0 + 0.0 0.1 0.0 0.2 0.0 + 0.0 0.0 0.1 0.0 0.2 + 0.0 0.0 0.0 0.1 0.0 + +julia> res_matrix = delay_line_backward(Float16, 5, 5) +5×5 Matrix{Float16}: + 0.0 0.2 0.0 0.0 0.0 + 0.1 0.0 0.2 0.0 0.0 + 0.0 0.1 0.0 0.2 0.0 + 0.0 0.0 0.1 0.0 0.2 + 0.0 0.0 0.0 0.1 0.0 +``` + +[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network." + IEEE transactions on neural networks 22.1 (2010): 131-144. +""" +function delay_line_backward(rng::AbstractRNG, ::Type{T}, dims::Integer...; + weight=T(0.1), fb_weight=T(0.2)) where {T <: Number} + res_size = first(dims) + reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) + + for i in 1:(res_size - 1) + reservoir_matrix[i + 1, i] = weight + reservoir_matrix[i, i + 1] = fb_weight + end + + return reservoir_matrix +end + +""" + cycle_jumps([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + cycle_weight = 0.1, jump_weight = 0.1, jump_size = 3) + +Create a cycle jumps reservoir with the specified dimensions, +cycle weight, jump weight, and jump size. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the reservoir matrix. + - `cycle_weight`: The weight of cycle connections. + Default is 0.1. + - `jump_weight`: The weight of jump connections. + Default is 0.1. + - `jump_size`: The number of steps between jump connections. + Default is 3. + +# Examples + +```jldoctest +julia> res_matrix = cycle_jumps(5, 5) +5×5 Matrix{Float32}: + 0.0 0.0 0.0 0.1 0.1 + 0.1 0.0 0.0 0.0 0.0 + 0.0 0.1 0.0 0.0 0.0 + 0.1 0.0 0.1 0.0 0.0 + 0.0 0.0 0.0 0.1 0.0 + +julia> res_matrix = cycle_jumps(5, 5; jump_size=2) +5×5 Matrix{Float32}: + 0.0 0.0 0.1 0.0 0.1 + 0.1 0.0 0.0 0.0 0.0 + 0.1 0.1 0.0 0.0 0.1 + 0.0 0.0 0.1 0.0 0.0 + 0.0 0.0 0.1 0.1 0.0 +``` + +[^Rodan2012]: Rodan, Ali, and Peter Tiňo. "Simple deterministically constructed cycle reservoirs + with regular jumps." Neural computation 24.7 (2012): 1822-1852. +""" +function cycle_jumps(rng::AbstractRNG, ::Type{T}, dims::Integer...; + cycle_weight::Number=T(0.1), jump_weight::Number=T(0.1), + jump_size::Int=3) where {T <: Number} + res_size = first(dims) + reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) + + for i in 1:(res_size - 1) + reservoir_matrix[i + 1, i] = cycle_weight + end + + reservoir_matrix[1, res_size] = cycle_weight + + for i in 1:jump_size:(res_size - jump_size) + tmp = (i + jump_size) % res_size + if tmp == 0 + tmp = res_size + end + reservoir_matrix[i, tmp] = jump_weight + reservoir_matrix[tmp, i] = jump_weight + end + + return reservoir_matrix +end + +""" + simple_cycle([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + weight = 0.1) + +Create a simple cycle reservoir with the specified dimensions and weight. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. Default is `Float32`. + - `dims`: Dimensions of the reservoir matrix. 
+ - `weight`: Weight of the connections in the reservoir matrix. + Default is 0.1. + +# Examples + +```jldoctest +julia> res_matrix = simple_cycle(5, 5) +5×5 Matrix{Float32}: + 0.0 0.0 0.0 0.0 0.1 + 0.1 0.0 0.0 0.0 0.0 + 0.0 0.1 0.0 0.0 0.0 + 0.0 0.0 0.1 0.0 0.0 + 0.0 0.0 0.0 0.1 0.0 + +julia> res_matrix = simple_cycle(5, 5; weight=11) +5×5 Matrix{Float32}: + 0.0 0.0 0.0 0.0 11.0 + 11.0 0.0 0.0 0.0 0.0 + 0.0 11.0 0.0 0.0 0.0 + 0.0 0.0 11.0 0.0 0.0 + 0.0 0.0 0.0 11.0 0.0 +``` + +[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network." + IEEE transactions on neural networks 22.1 (2010): 131-144. +""" +function simple_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; + weight=T(0.1)) where {T <: Number} + reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) + + for i in 1:(dims[1] - 1) + reservoir_matrix[i + 1, i] = weight + end + + reservoir_matrix[1, dims[1]] = weight + return reservoir_matrix +end + +""" + pseudo_svd([rng::AbstractRNG=Utils.default_rng()], [T=Float32], dims...; + max_value=1.0, sparsity=0.1, sorted = true, reverse_sort = false) + +Returns an initializer to build a sparse reservoir matrix with the given +`sparsity` by using a pseudo-SVD approach as described in [^yang]. + +# Arguments + + - `rng`: Random number generator. Default is `Utils.default_rng()` + from WeightInitializers. + - `T`: Type of the elements in the reservoir matrix. + Default is `Float32`. + - `dims`: Dimensions of the reservoir matrix. + - `max_value`: The maximum absolute value of elements in the matrix. + Default is 1.0 + - `sparsity`: The desired sparsity level of the reservoir matrix. + Default is 0.1 + - `sorted`: A boolean indicating whether to sort the singular values before + creating the diagonal matrix. Default is `true`. + - `reverse_sort`: A boolean indicating whether to reverse the sorted + singular values. Default is `false`. + +# Examples + +```jldoctest +julia> res_matrix = pseudo_svd(5, 5) +5×5 Matrix{Float32}: + 0.306998 0.0 0.0 0.0 0.0 + 0.0 0.325977 0.0 0.0 0.0 + 0.0 0.0 0.549051 0.0 0.0 + 0.0 0.0 0.0 0.726199 0.0 + 0.0 0.0 0.0 0.0 1.0 +``` + +[^yang]: Yang, Cuili, et al. "_Design of polynomial echo state networks for time series prediction._" Neurocomputing 290 (2018): 148-160. 
+""" +function pseudo_svd(rng::AbstractRNG, ::Type{T}, dims::Integer...; + max_value::Number=T(1.0), sparsity::Number=0.1, sorted::Bool=true, + reverse_sort::Bool=false) where {T <: Number} + reservoir_matrix = create_diag(rng, T, dims[1], + max_value; + sorted=sorted, + reverse_sort=reverse_sort) + tmp_sparsity = get_sparsity(reservoir_matrix, dims[1]) + + while tmp_sparsity <= sparsity + reservoir_matrix *= create_qmatrix(rng, T, dims[1], + rand_range(rng, T, dims[1]), + rand_range(rng, T, dims[1]), + DeviceAgnostic.rand(rng, T) * T(2) - T(1)) + tmp_sparsity = get_sparsity(reservoir_matrix, dims[1]) + end + + return reservoir_matrix +end + +#hacky workaround for the moment +function rand_range(rng, T, n::Int) + return Int(1 + floor(DeviceAgnostic.rand(rng, T) * n)) +end + +function create_diag(rng::AbstractRNG, ::Type{T}, dim::Number, max_value::Number; + sorted::Bool=true, reverse_sort::Bool=false) where {T <: Number} + diagonal_matrix = DeviceAgnostic.zeros(rng, T, dim, dim) + if sorted == true + if reverse_sort == true + diagonal_values = sort( + DeviceAgnostic.rand(rng, T, dim) .* max_value; rev=true) + diagonal_values[1] = max_value + else + diagonal_values = sort(DeviceAgnostic.rand(rng, T, dim) .* max_value) + diagonal_values[end] = max_value + end + else + diagonal_values = DeviceAgnostic.rand(rng, T, dim) .* max_value + end + + for i in 1:dim + diagonal_matrix[i, i] = diagonal_values[i] + end + + return diagonal_matrix +end + +function create_qmatrix(rng::AbstractRNG, ::Type{T}, dim::Number, + coord_i::Number, coord_j::Number, theta::Number) where {T <: Number} + qmatrix = DeviceAgnostic.zeros(rng, T, dim, dim) + + for i in 1:dim + qmatrix[i, i] = 1.0 + end + + qmatrix[coord_i, coord_i] = cos(theta) + qmatrix[coord_j, coord_j] = cos(theta) + qmatrix[coord_i, coord_j] = -sin(theta) + qmatrix[coord_j, coord_i] = sin(theta) + return qmatrix +end + +function get_sparsity(M, dim) + return size(M[M .!= 0], 1) / (dim * dim - size(M[M .!= 0], 1)) #nonzero/zero elements +end + +### fallbacks +#fallbacks for initializers #eventually to remove once migrated to WeightInitializers.jl +for initializer in (:rand_sparse, :delay_line, :delay_line_backward, :cycle_jumps, + :simple_cycle, :pseudo_svd, + :scaled_rand, :weighted_init, :informed_init, :minimal_init) + @eval begin + function ($initializer)(dims::Integer...; kwargs...) + return $initializer(Utils.default_rng(), Float32, dims...; kwargs...) + end + function ($initializer)(rng::AbstractRNG, dims::Integer...; kwargs...) + return $initializer(rng, Float32, dims...; kwargs...) + end + function ($initializer)(::Type{T}, dims::Integer...; kwargs...) where {T <: Number} + return $initializer(Utils.default_rng(), T, dims...; kwargs...) + end + + # Partial application + function ($initializer)(rng::AbstractRNG; kwargs...) + return PartialFunction.Partial{Nothing}($initializer, rng, kwargs) + end + function ($initializer)(::Type{T}; kwargs...) where {T <: Number} + return PartialFunction.Partial{T}($initializer, nothing, kwargs) + end + function ($initializer)(rng::AbstractRNG, ::Type{T}; kwargs...) where {T <: Number} + return PartialFunction.Partial{T}($initializer, rng, kwargs) + end + function ($initializer)(; kwargs...) 
+ return PartialFunction.Partial{Nothing}($initializer, nothing, kwargs) + end + end +end diff --git a/src/esn/esn_input_layers.jl b/src/esn/esn_input_layers.jl deleted file mode 100644 index 04487f96..00000000 --- a/src/esn/esn_input_layers.jl +++ /dev/null @@ -1,240 +0,0 @@ -""" - scaled_rand(rng::AbstractRNG, ::Type{T}, dims::Integer...; scaling=T(0.1)) where {T <: Number} - -Create and return a matrix with random values, uniformly distributed within a range defined by `scaling`. This function is useful for initializing matrices, such as the layers of a neural network, with scaled random values. - -# Arguments - - - `rng`: An instance of `AbstractRNG` for random number generation. - - `T`: The data type for the elements of the matrix. - - `dims`: Dimensions of the matrix. It must be a 2-element tuple specifying the number of rows and columns (e.g., `(res_size, in_size)`). - - `scaling`: A scaling factor to define the range of the uniform distribution. The matrix elements will be randomly chosen from the range `[-scaling, scaling]`. Defaults to `T(0.1)`. - -# Returns - -A matrix of type with dimensions specified by `dims`. Each element of the matrix is a random number uniformly distributed between `-scaling` and `scaling`. - -# Example - -```julia -rng = Random.default_rng() -matrix = scaled_rand(rng, Float64, (100, 50); scaling=0.2) -``` -""" -function scaled_rand(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - scaling=T(0.1)) where {T <: Number} - res_size, in_size = dims - layer_matrix = (DeviceAgnostic.rand(rng, T, res_size, in_size) .- T(0.5)) .* - (T(2) * scaling) - return layer_matrix -end - -""" - weighted_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; scaling=T(0.1)) where {T <: Number} - -Create and return a matrix representing a weighted input layer for Echo State Networks (ESNs). This initializer generates a weighted input matrix with random non-zero elements distributed uniformly within the range [-`scaling`, `scaling`], inspired by the approach in [^Lu]. - -# Arguments - - - `rng`: An instance of `AbstractRNG` for random number generation. - - `T`: The data type for the elements of the matrix. - - `dims`: A 2-element tuple specifying the approximate reservoir size and input size (e.g., `(approx_res_size, in_size)`). - - `scaling`: The scaling factor for the weight distribution. Defaults to `T(0.1)`. - -# Returns - -A matrix representing the weighted input layer as defined in [^Lu2017]. The matrix dimensions will be adjusted to ensure each input unit connects to an equal number of reservoir units. - -# Example - -```julia -rng = Random.default_rng() -input_layer = weighted_init(rng, Float64, (3, 300); scaling=0.2) -``` - -# References - -[^Lu2017]: Lu, Zhixin, et al. - "Reservoir observers: Model-free inference of unmeasured variables in chaotic systems." - Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102. 
-""" -function weighted_init(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - scaling=T(0.1)) where {T <: Number} - approx_res_size, in_size = dims - res_size = Int(floor(approx_res_size / in_size) * in_size) - layer_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) - q = floor(Int, res_size / in_size) - - for i in 1:in_size - layer_matrix[((i - 1) * q + 1):((i) * q), i] = (DeviceAgnostic.rand(rng, T, q) .- - T(0.5)) .* (T(2) * scaling) - end - - return layer_matrix -end - -""" - informed_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; scaling=T(0.1), model_in_size, gamma=T(0.5)) where {T <: Number} - -Create a layer of a neural network. - -# Arguments - - - `rng::AbstractRNG`: The random number generator. - - `T::Type`: The data type. - - `dims::Integer...`: The dimensions of the layer. - - `scaling::T = T(0.1)`: The scaling factor for the input matrix. - - `model_in_size`: The size of the input model. - - `gamma::T = T(0.5)`: The gamma value. - -# Returns - - - `input_matrix`: The created input matrix for the layer. - -# Example - -```julia -rng = Random.default_rng() -dims = (100, 200) -model_in_size = 50 -input_matrix = informed_init(rng, Float64, dims; model_in_size=model_in_size) -``` -""" -function informed_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; - scaling=T(0.1), model_in_size, gamma=T(0.5)) where {T <: Number} - res_size, in_size = dims - state_size = in_size - model_in_size - - if state_size <= 0 - throw(DimensionMismatch("in_size must be greater than model_in_size")) - end - - input_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) - zero_connections = DeviceAgnostic.zeros(rng, T, in_size) - num_for_state = floor(Int, res_size * gamma) - num_for_model = floor(Int, res_size * (1 - gamma)) - - for i in 1:num_for_state - idxs = findall(Bool[zero_connections .== input_matrix[i, :] - for i in 1:size(input_matrix, 1)]) - random_row_idx = idxs[DeviceAgnostic.rand(rng, T, 1:end)] - random_clm_idx = range(1, state_size; step=1)[DeviceAgnostic.rand(rng, T, 1:end)] - input_matrix[random_row_idx, random_clm_idx] = (DeviceAgnostic.rand(rng, T) - - T(0.5)) .* (T(2) * scaling) - end - - for i in 1:num_for_model - idxs = findall(Bool[zero_connections .== input_matrix[i, :] - for i in 1:size(input_matrix, 1)]) - random_row_idx = idxs[DeviceAgnostic.rand(rng, T, 1:end)] - random_clm_idx = range(state_size + 1, in_size; step=1)[DeviceAgnostic.rand( - rng, T, 1:end)] - input_matrix[random_row_idx, random_clm_idx] = (DeviceAgnostic.rand(rng, T) - - T(0.5)) .* (T(2) * scaling) - end - - return input_matrix -end - -""" - irrational_sample_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; - weight = 0.1, - sampling = IrrationalSample(; irrational = pi, start = 1) - ) where {T <: Number} - -Create a layer matrix using the provided random number generator and sampling parameters. - -# Arguments - - - `rng::AbstractRNG`: The random number generator used to generate random numbers. - - `dims::Integer...`: The dimensions of the layer matrix. - - `weight`: The weight used to fill the layer matrix. Default is 0.1. - - `sampling`: The sampling parameters used to generate the input matrix. Default is IrrationalSample(irrational = pi, start = 1). - -# Returns - -The layer matrix generated using the provided random number generator and sampling parameters. 
- -# Example - -```julia -using Random -rng = Random.default_rng() -dims = (3, 2) -weight = 0.5 -layer_matrix = irrational_sample_init(rng, Float64, dims; weight=weight, - sampling=IrrationalSample(; irrational=sqrt(2), start=1)) -``` -""" -function minimal_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; - sampling_type::Symbol=:bernoulli, - weight::Number=T(0.1), - irrational::Real=pi, - start::Int=1, - p::Number=T(0.5)) where {T <: Number} - res_size, in_size = dims - if sampling_type == :bernoulli - layer_matrix = _create_bernoulli(p, res_size, in_size, weight, rng, T) - elseif sampling_type == :irrational - layer_matrix = _create_irrational(irrational, - start, - res_size, - in_size, - weight, - rng, - T) - else - error("Sampling type not allowed. Please use one of :bernoulli or :irrational") - end - return layer_matrix -end - -function _create_bernoulli(p::Number, - res_size::Int, - in_size::Int, - weight::Number, - rng::AbstractRNG, - ::Type{T}) where {T <: Number} - input_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) - for i in 1:res_size - for j in 1:in_size - if DeviceAgnostic.rand(rng, T) < p - input_matrix[i, j] = weight - else - input_matrix[i, j] = -weight - end - end - end - return input_matrix -end - -function _create_irrational(irrational::Irrational, - start::Int, - res_size::Int, - in_size::Int, - weight::Number, - rng::AbstractRNG, - ::Type{T}) where {T <: Number} - setprecision(BigFloat, Int(ceil(log2(10) * (res_size * in_size + start + 1)))) - ir_string = string(BigFloat(irrational)) |> collect - deleteat!(ir_string, findall(x -> x == '.', ir_string)) - ir_array = DeviceAgnostic.zeros(rng, T, length(ir_string)) - input_matrix = DeviceAgnostic.zeros(rng, T, res_size, in_size) - - for i in 1:length(ir_string) - ir_array[i] = parse(Int, ir_string[i]) - end - - for i in 1:res_size - for j in 1:in_size - random_number = DeviceAgnostic.rand(rng, T) - input_matrix[i, j] = random_number < 0.5 ? 
-weight : weight - end - end - - return T.(input_matrix) -end diff --git a/src/esn/esn_predict.jl b/src/esn/esn_predict.jl index ba6c80da..46e30e95 100644 --- a/src/esn/esn_predict.jl +++ b/src/esn/esn_predict.jl @@ -93,16 +93,16 @@ end function allocate_outpad(hesn::HybridESN, states_type, out) pad_length = length(out) + size(hesn.model.model_data[:, 1], 1) - out_tmp = Adapt.adapt(typeof(out), zeros(pad_length)) + out_tmp = adapt(typeof(out), zeros(pad_length)) return allocate_singlepadding(states_type, out_tmp) end function allocate_singlepadding(::AbstractPaddedStates, out) - Adapt.adapt(typeof(out), zeros(size(out, 1) + 1)) + adapt(typeof(out), zeros(size(out, 1) + 1)) end function allocate_singlepadding(::StandardStates, out) - Adapt.adapt(typeof(out), zeros(size(out, 1))) + adapt(typeof(out), zeros(size(out, 1))) end function allocate_singlepadding(::ExtendedStates, out) - Adapt.adapt(typeof(out), zeros(size(out, 1))) + adapt(typeof(out), zeros(size(out, 1))) end diff --git a/src/esn/esn_reservoir_drivers.jl b/src/esn/esn_reservoir_drivers.jl index 52c1a2c3..46e4cda8 100644 --- a/src/esn/esn_reservoir_drivers.jl +++ b/src/esn/esn_reservoir_drivers.jl @@ -1,31 +1,25 @@ abstract type AbstractReservoirDriver end """ - create_states( - reservoir_driver::AbstractReservoirDriver, - train_data, - washout, - reservoir_matrix, - input_matrix, - bias_vector - ) + create_states(reservoir_driver::AbstractReservoirDriver, train_data, washout, + reservoir_matrix, input_matrix, bias_vector) -Create and return the trained Echo State Network (ESN) states according to the specified reservoir driver. +Create and return the trained Echo State Network (ESN) states according to the +specified reservoir driver. # Arguments - - `reservoir_driver::AbstractReservoirDriver`: The reservoir driver that determines how the ESN states evolve over time. + - `reservoir_driver`: The reservoir driver that determines how the ESN states evolve + over time. - `train_data`: The training data used to train the ESN. - - `washout::Int`: The number of initial time steps to discard during training to allow the reservoir dynamics to wash out the initial conditions. - - `reservoir_matrix`: The reservoir matrix representing the dynamic, recurrent part of the ESN. - - `input_matrix`: The input matrix that defines the connections between input features and reservoir nodes. - - `bias_vector`: The bias vector to be added at each time step during the reservoir update. - -# Returns - - - A matrix of trained ESN states, where each column represents the state at a specific time step. - -This function is responsible for creating and returning the states of the ESN during training based on the provided training data and parameters. + - `washout`: The number of initial time steps to discard during training to allow the + reservoir dynamics to wash out the initial conditions. + - `reservoir_matrix`: The reservoir matrix representing the dynamic, recurrent part of + the ESN. + - `input_matrix`: The input matrix that defines the connections between input features + and reservoir nodes. + - `bias_vector`: The bias vector to be added at each time step during the reservoir + update. 
""" function create_states(reservoir_driver::AbstractReservoirDriver, train_data, @@ -36,9 +30,9 @@ function create_states(reservoir_driver::AbstractReservoirDriver, train_len = size(train_data, 2) - washout res_size = size(reservoir_matrix, 1) - states = Adapt.adapt(typeof(train_data), zeros(res_size, train_len)) + states = adapt(typeof(train_data), zeros(res_size, train_len)) tmp_array = allocate_tmp(reservoir_driver, typeof(train_data), res_size) - _state = Adapt.adapt(typeof(train_data), zeros(res_size, 1)) + _state = adapt(typeof(train_data), zeros(res_size, 1)) for i in 1:washout yv = @view train_data[:, i] @@ -65,9 +59,9 @@ function create_states(reservoir_driver::AbstractReservoirDriver, train_len = size(train_data, 2) - washout res_size = sum([size(reservoir_matrix[i], 1) for i in 1:length(reservoir_matrix)]) - states = Adapt.adapt(typeof(train_data), zeros(res_size, train_len)) + states = adapt(typeof(train_data), zeros(res_size, train_len)) tmp_array = allocate_tmp(reservoir_driver, typeof(train_data), res_size) - _state = Adapt.adapt(typeof(train_data), zeros(res_size)) + _state = adapt(typeof(train_data), zeros(res_size)) for i in 1:washout for j in 1:length(reservoir_matrix) @@ -99,7 +93,8 @@ end RNN(activation_function, leaky_coefficient) RNN(;activation_function=tanh, leaky_coefficient=1.0) -Returns a Recurrent Neural Network (RNN) initializer for the Echo State Network (ESN). +Returns a Recurrent Neural Network (RNN) initializer for +echo state networks (`ESN`). # Arguments @@ -108,13 +103,12 @@ Returns a Recurrent Neural Network (RNN) initializer for the Echo State Network # Keyword Arguments - - `activation_function`: The activation function used in the RNN. Defaults to `tanh`. - - `leaky_coefficient`: The leaky coefficient used in the RNN. Defaults to 1.0. - -This function creates an RNN object with the specified activation function and leaky coefficient, -which can be used as a reservoir driver in the ESN. + - `activation_function`: The activation function used in the RNN. + Defaults to `tanh_fast`. + - `leaky_coefficient`: The leaky coefficient used in the RNN. + Defaults to 1.0. """ -function RNN(; activation_function=NNlib.fast_act(tanh), leaky_coefficient=1.0) +function RNN(; activation_function=fast_act(tanh), leaky_coefficient=1.0) RNN(activation_function, leaky_coefficient) end @@ -148,7 +142,7 @@ function next_state!(out, rnn::RNN, x, y, W::Vector, W_in, b, tmp_array) end function allocate_tmp(::RNN, tmp_type, res_size) - return [Adapt.adapt(tmp_type, zeros(res_size, 1)) for i in 1:2] + return [adapt(tmp_type, zeros(res_size, 1)) for i in 1:2] end #multiple RNN driver @@ -163,25 +157,32 @@ end MRNN(;activation_function=[tanh, sigmoid], leaky_coefficient=1.0, scaling_factor=fill(leaky_coefficient, length(activation_function))) -Returns a Multiple RNN (MRNN) initializer for the Echo State Network (ESN), introduced in [^lun]. +Returns a Multiple RNN (MRNN) initializer for the Echo State Network (ESN), +introduced in [^Lun2015]. # Arguments - - `activation_function`: A vector of activation functions used in the MRNN. + - `activation_function`: A vector of activation functions used + in the MRNN. - `leaky_coefficient`: The leaky coefficient used in the MRNN. - - `scaling_factor`: A vector of scaling factors for combining activation functions. + - `scaling_factor`: A vector of scaling factors for combining activation + functions. # Keyword Arguments - - `activation_function`: A vector of activation functions used in the MRNN. Defaults to `[tanh, sigmoid]`. 
-  - `leaky_coefficient`: The leaky coefficient used in the MRNN. Defaults to 1.0.
-  - `scaling_factor`: A vector of scaling factors for combining activation functions. Defaults to an array of the same size as `activation_function` with all elements set to `leaky_coefficient`.
-
-This function creates an MRNN object with the specified activation functions, leaky coefficient, and scaling factors, which can be used as a reservoir driver in the ESN.
+  - `activation_function`: A vector of activation functions used in the MRNN.
+    Defaults to `[tanh, sigmoid]`.
+  - `leaky_coefficient`: The leaky coefficient used in the MRNN.
+    Defaults to 1.0.
+  - `scaling_factor`: A vector of scaling factors for combining activation functions.
+    Defaults to an array of the same size as `activation_function` with all
+    elements set to `leaky_coefficient`.
 
-# Reference:
+This function creates an MRNN object with the specified activation functions,
+leaky coefficient, and scaling factors, which can be used as a reservoir driver
+in the ESN.
 
-[^lun]: Lun, Shu-Xian, et al.
+[^Lun2015]: Lun, Shu-Xian, et al.
     "_A novel model of leaky integrator echo state network for time-series prediction._"
     Neurocomputing 159 (2015): 58-66.
 """
@@ -208,20 +209,11 @@ function next_state!(out, mrnn::MRNN, x, y, W, W_in, b, tmp_array)
     return out
 end
 
-#=
-function next_state!(out, mrnn::MRNN, x, y, W, W_in, b, tmp_array)
-    rnn_next_state = (1-mrnn.leaky_coefficient).*x
-    for i=1:length(mrnn.scaling_factor)
-        rnn_next_state += mrnn.scaling_factor[i]*mrnn.activation_function[i].((W*x).+(W_in*y).+b)
-    end
-    rnn_next_state
-end
-=#
-
 function allocate_tmp(::MRNN, tmp_type, res_size)
-    return [Adapt.adapt(tmp_type, zeros(res_size, 1)) for i in 1:2]
+    return [adapt(tmp_type, zeros(res_size, 1)) for i in 1:2]
 end
 
+abstract type AbstractGRUVariant end
 #GRU-based driver
 struct GRU{F, L, R, V, B} #not an abstractreservoirdriver
     activation_function::F
@@ -235,19 +227,15 @@ end
 """
     FullyGated()
 
-Returns a Fully Gated Recurrent Unit (FullyGated) initializer for the Echo State Network (ESN).
-
-This function creates a FullyGated object, which can be used as a reservoir driver in the ESN.
-The FullyGated variant is described in the literature reference [^cho].
+Returns a Fully Gated Recurrent Unit (FullyGated) initializer
+for the Echo State Network (ESN).
 
-# Returns
+This is the standard gated recurrent unit [^Cho2014], used as a driver for the
+echo state network (`ESN`).
 
-  - `FullyGated`: A FullyGated reservoir driver.
-
-# Reference
-
-[^cho]: Cho, Kyunghyun, et al.
-    "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._"
+[^Cho2014]: Cho, Kyunghyun, et al.
+    "_Learning phrase representations using RNN encoder-decoder
+    for statistical machine translation._"
     arXiv preprint arXiv:1406.1078 (2014).
 """
 struct FullyGated <: AbstractGRUVariant end
@@ -255,9 +243,10 @@ struct FullyGated <: AbstractGRUVariant end
 
 """
     Minimal()
 
-Returns a minimal GRU ESN initializer as described in [^Zhou].
+Returns a minimal GRU ESN initializer as described in [^Zhou2016].
 
-[^Zhou]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._"
+[^Zhou2016]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent
+    neural networks._"
    International Journal of Automation and Computing 13.3 (2016): 226-234.
""" struct Minimal <: AbstractGRUVariant end @@ -270,27 +259,28 @@ struct Minimal <: AbstractGRUVariant end bias = fill(DenseLayer(), 2), variant = FullyGated()) -Returns a Gated Recurrent Unit (GRU) reservoir driver for Echo State Networks (ESNs). This driver is based on the GRU architecture [^Cho], which is designed to capture temporal dependencies in data and is commonly used in various machine learning applications. +Returns a Gated Recurrent Unit (GRU) reservoir driver for Echo State Network (`ESN`). +This driver is based on the GRU architecture [^Cho2014]. # Arguments - - `activation_function`: An array of activation functions for the GRU layers. By default, it uses sigmoid activation functions for the update gate, reset gate, and tanh for the hidden state. - - `inner_layer`: An array of inner layers used in the GRU architecture. By default, it uses two dense layers. - - `reservoir`: An array of reservoir layers. By default, it uses two random sparse reservoirs. - - `bias`: An array of bias layers for the GRU. By default, it uses two dense layers. - - `variant`: The GRU variant to use. By default, it uses the "FullyGated" variant. - -# Returns - -A GRUParams object containing the parameters needed for the GRU-based reservoir driver. - -# References - -[^Cho]: Cho, Kyunghyun, et al. + - `activation_function`: An array of activation functions for the GRU layers. + By default, it uses sigmoid activation functions for the update gate, reset gate, + and tanh for the hidden state. + - `inner_layer`: An array of inner layers used in the GRU architecture. + By default, it uses two dense layers. + - `reservoir`: An array of reservoir layers. + By default, it uses two random sparse reservoirs. + - `bias`: An array of bias layers for the GRU. + By default, it uses two dense layers. + - `variant`: The GRU variant to use. + By default, it uses the "FullyGated" variant. + +[^Cho2014]: Cho, Kyunghyun, et al. "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._" arXiv preprint arXiv:1406.1078 (2014). """ -function GRU(; activation_function=[NNlib.sigmoid, NNlib.sigmoid, tanh], +function GRU(; activation_function=[sigmoid, sigmoid, tanh], inner_layer=fill(scaled_rand, 2), reservoir=fill(rand_sparse, 2), bias=fill(scaled_rand, 2), @@ -354,7 +344,7 @@ function next_state!(out, gru::GRUParams, x, y, W, W_in, b, tmp_array) end function allocate_tmp(::GRUParams, tmp_type, res_size) - return [Adapt.adapt(tmp_type, zeros(res_size, 1)) for i in 1:9] + return [adapt(tmp_type, zeros(res_size, 1)) for i in 1:9] end #W=U, W_in=W in papers. x=h, and y=x. I know, it's confusing. ( on the left our notation) diff --git a/src/esn/esn_reservoirs.jl b/src/esn/esn_reservoirs.jl deleted file mode 100644 index 36276099..00000000 --- a/src/esn/esn_reservoirs.jl +++ /dev/null @@ -1,304 +0,0 @@ -""" - rand_sparse(rng::AbstractRNG, ::Type{T}, dims::Integer...; radius=1.0, sparsity=0.1) - -Create and return a random sparse reservoir matrix for use in Echo State Networks (ESNs). The matrix will be of size specified by `dims`, with specified `sparsity` and scaled spectral radius according to `radius`. - -# Arguments - - - `rng`: An instance of `AbstractRNG` for random number generation. - - `T`: The data type for the elements of the matrix. - - `dims`: Dimensions of the reservoir matrix. - - `radius`: The desired spectral radius of the reservoir. Defaults to 1.0. - - `sparsity`: The sparsity level of the reservoir matrix, controlling the fraction of zero elements. Defaults to 0.1. 
- -# Returns - -A matrix representing the random sparse reservoir. - -# References - -This type of reservoir initialization is commonly used in ESNs for capturing temporal dependencies in data. -""" -function rand_sparse(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - radius=T(1.0), - sparsity=T(0.1), - std=T(1.0)) where {T <: Number} - lcl_sparsity = T(1) - sparsity #consistency with current implementations - reservoir_matrix = sparse_init(rng, T, dims...; sparsity=lcl_sparsity, std=std) - rho_w = maximum(abs.(eigvals(reservoir_matrix))) - reservoir_matrix .*= radius / rho_w - if Inf in unique(reservoir_matrix) || -Inf in unique(reservoir_matrix) - error("Sparsity too low for size of the matrix. Increase res_size or increase sparsity") - end - return reservoir_matrix -end - -""" - delay_line(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight=0.1) where {T <: Number} - -Create and return a delay line reservoir matrix for use in Echo State Networks (ESNs). A delay line reservoir is a deterministic structure where each unit is connected only to its immediate predecessor with a specified weight. This method is particularly useful for tasks that require specific temporal processing. - -# Arguments - - - `rng`: An instance of `AbstractRNG` for random number generation. This argument is not used in the current implementation but is included for consistency with other initialization functions. - - `T`: The data type for the elements of the matrix. - - `dims`: Dimensions of the reservoir matrix. Typically, this should be a tuple of two equal integers representing a square matrix. - - `weight`: The weight determines the absolute value of all connections in the reservoir. Defaults to 0.1. - -# Returns - -A delay line reservoir matrix with dimensions specified by `dims`. The matrix is initialized such that each element in the `i+1`th row and `i`th column is set to `weight`, and all other elements are zeros. - -# Example - -```julia -reservoir = delay_line(Float64, 100, 100; weight=0.2) -``` - -# References - -This type of reservoir initialization is described in: -Rodan, Ali, and Peter Tino. "Minimum complexity echo state network." IEEE Transactions on Neural Networks 22.1 (2010): 131-144. -""" -function delay_line(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - weight=T(0.1)) where {T <: Number} - reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) - @assert length(dims) == 2&&dims[1] == dims[2] "The dimensions must define a square matrix (e.g., (100, 100))" - - for i in 1:(dims[1] - 1) - reservoir_matrix[i + 1, i] = weight - end - - return reservoir_matrix -end - -""" - delay_line_backward(rng::AbstractRNG, ::Type{T}, dims::Integer...; - weight = T(0.1), fb_weight = T(0.2)) where {T <: Number} - -Create a delay line backward reservoir with the specified by `dims` and weights. Creates a matrix with backward connections -as described in [^Rodan2010]. The `weight` and `fb_weight` can be passed as either arguments or -keyword arguments, and they determine the absolute values of the connections in the reservoir. - -# Arguments - - - `rng::AbstractRNG`: Random number generator. - - `T::Type`: Type of the elements in the reservoir matrix. - - `dims::Integer...`: Dimensions of the reservoir matrix. - - `weight::T`: The weight determines the absolute value of forward connections in the reservoir, and is set to 0.1 by default. - - `fb_weight::T`: The `fb_weight` determines the absolute value of backward connections in the reservoir, and is set to 0.2 by default. 
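For a concrete picture of the structure these weights produce (a small illustrative call; the implementation follows below):

```julia
using ReservoirComputing
using Random

rng = Random.Xoshiro(0)
W = delay_line_backward(rng, Float64, 5, 5; weight=0.1, fb_weight=0.2)
# 0.1 sits on the sub-diagonal (forward chain, W[i + 1, i]) and 0.2 on the
# super-diagonal (feedback, W[i, i + 1]); all other entries are zero.
```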
- -# Returns - -Reservoir matrix with the dimensions specified by `dims` and weights. - -# References - -[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. -""" -function delay_line_backward(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - weight=T(0.1), - fb_weight=T(0.2)) where {T <: Number} - res_size = first(dims) - reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) - - for i in 1:(res_size - 1) - reservoir_matrix[i + 1, i] = weight - reservoir_matrix[i, i + 1] = fb_weight - end - - return reservoir_matrix -end - -""" - cycle_jumps(rng::AbstractRNG, ::Type{T}, dims::Integer...; - cycle_weight = T(0.1), jump_weight = T(0.1), jump_size = 3) where {T <: Number} - -Create a cycle jumps reservoir with the specified dimensions, cycle weight, jump weight, and jump size. - -# Arguments - - - `rng::AbstractRNG`: Random number generator. - - `T::Type`: Type of the elements in the reservoir matrix. - - `dims::Integer...`: Dimensions of the reservoir matrix. - - `cycle_weight::T = T(0.1)`: The weight of cycle connections. - - `jump_weight::T = T(0.1)`: The weight of jump connections. - - `jump_size::Int = 3`: The number of steps between jump connections. - -# Returns - -Reservoir matrix with the specified dimensions, cycle weight, jump weight, and jump size. - -# References - -[^Rodan2012]: Rodan, Ali, and Peter Tiňo. "Simple deterministically constructed cycle reservoirs - with regular jumps." Neural computation 24.7 (2012): 1822-1852. -""" -function cycle_jumps(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - cycle_weight::Number=T(0.1), - jump_weight::Number=T(0.1), - jump_size::Int=3) where {T <: Number} - res_size = first(dims) - reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) - - for i in 1:(res_size - 1) - reservoir_matrix[i + 1, i] = cycle_weight - end - - reservoir_matrix[1, res_size] = cycle_weight - - for i in 1:jump_size:(res_size - jump_size) - tmp = (i + jump_size) % res_size - if tmp == 0 - tmp = res_size - end - reservoir_matrix[i, tmp] = jump_weight - reservoir_matrix[tmp, i] = jump_weight - end - - return reservoir_matrix -end - -""" - simple_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; - weight = T(0.1)) where {T <: Number} - -Create a simple cycle reservoir with the specified dimensions and weight. - -# Arguments - - - `rng::AbstractRNG`: Random number generator. - - `T::Type`: Type of the elements in the reservoir matrix. - - `dims::Integer...`: Dimensions of the reservoir matrix. - - `weight::T = T(0.1)`: Weight of the connections in the reservoir matrix. - -# Returns - -Reservoir matrix with the dimensions specified by `dims` and weights. - -# References - -[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. -""" -function simple_cycle(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - weight=T(0.1)) where {T <: Number} - reservoir_matrix = DeviceAgnostic.zeros(rng, T, dims...) - - for i in 1:(dims[1] - 1) - reservoir_matrix[i + 1, i] = weight - end - - reservoir_matrix[1, dims[1]] = weight - return reservoir_matrix -end - -""" - pseudo_svd(rng::AbstractRNG, ::Type{T}, dims::Integer...; - max_value, sparsity, sorted = true, reverse_sort = false) where {T <: Number} - - Returns an initializer to build a sparse reservoir matrix with the given `sparsity` by using a pseudo-SVD approach as described in [^yang]. 
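A small illustrative call (keyword values are examples only; the construction itself is shown further below):

```julia
using ReservoirComputing
using Random

rng = Random.Xoshiro(0)
W = pseudo_svd(rng, Float64, 100, 100; max_value=1.0, sparsity=0.2, sorted=true)
# a diagonal of singular-value-like entries is multiplied by random rotation
# matrices until the requested sparsity level is exceeded
```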
- -# Arguments - - - `rng::AbstractRNG`: Random number generator. - - `T::Type`: Type of the elements in the reservoir matrix. - - `dims::Integer...`: Dimensions of the reservoir matrix. - - `max_value`: The maximum absolute value of elements in the matrix. - - `sparsity`: The desired sparsity level of the reservoir matrix. - - `sorted`: A boolean indicating whether to sort the singular values before creating the diagonal matrix. By default, it is set to `true`. - - `reverse_sort`: A boolean indicating whether to reverse the sorted singular values. By default, it is set to `false`. - -# Returns - -Reservoir matrix with the specified dimensions, max value, and sparsity. - -# References - -This reservoir initialization method, based on a pseudo-SVD approach, is inspired by the work in [^yang], which focuses on designing polynomial echo state networks for time series prediction. - -[^yang]: Yang, Cuili, et al. "_Design of polynomial echo state networks for time series prediction._" Neurocomputing 290 (2018): 148-160. -""" -function pseudo_svd(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - max_value::Number=T(1.0), - sparsity::Number=0.1, - sorted::Bool=true, - reverse_sort::Bool=false) where {T <: Number} - reservoir_matrix = create_diag(rng, T, dims[1], - max_value; - sorted=sorted, - reverse_sort=reverse_sort) - tmp_sparsity = get_sparsity(reservoir_matrix, dims[1]) - - while tmp_sparsity <= sparsity - reservoir_matrix *= create_qmatrix(rng, T, dims[1], - rand_range(rng, T, dims[1]), - rand_range(rng, T, dims[1]), - DeviceAgnostic.rand(rng, T) * T(2) - T(1)) - tmp_sparsity = get_sparsity(reservoir_matrix, dims[1]) - end - - return reservoir_matrix -end - -#hacky workaround for the moment -function rand_range(rng, T, n::Int) - return Int(1 + floor(DeviceAgnostic.rand(rng, T) * n)) -end - -function create_diag(rng::AbstractRNG, ::Type{T}, dim::Number, max_value::Number; - sorted::Bool=true, reverse_sort::Bool=false) where {T <: Number} - diagonal_matrix = DeviceAgnostic.zeros(rng, T, dim, dim) - if sorted == true - if reverse_sort == true - diagonal_values = sort( - DeviceAgnostic.rand(rng, T, dim) .* max_value; rev=true) - diagonal_values[1] = max_value - else - diagonal_values = sort(DeviceAgnostic.rand(rng, T, dim) .* max_value) - diagonal_values[end] = max_value - end - else - diagonal_values = DeviceAgnostic.rand(rng, T, dim) .* max_value - end - - for i in 1:dim - diagonal_matrix[i, i] = diagonal_values[i] - end - - return diagonal_matrix -end - -function create_qmatrix(rng::AbstractRNG, ::Type{T}, dim::Number, - coord_i::Number, - coord_j::Number, - theta::Number) where {T <: Number} - qmatrix = DeviceAgnostic.zeros(rng, T, dim, dim) - - for i in 1:dim - qmatrix[i, i] = 1.0 - end - - qmatrix[coord_i, coord_i] = cos(theta) - qmatrix[coord_j, coord_j] = cos(theta) - qmatrix[coord_i, coord_j] = -sin(theta) - qmatrix[coord_j, coord_i] = sin(theta) - return qmatrix -end - -function get_sparsity(M, dim) - return size(M[M .!= 0], 1) / (dim * dim - size(M[M .!= 0], 1)) #nonzero/zero elements -end diff --git a/src/esn/hybridesn.jl b/src/esn/hybridesn.jl index 13c8b642..129b4f8f 100644 --- a/src/esn/hybridesn.jl +++ b/src/esn/hybridesn.jl @@ -24,8 +24,8 @@ end """ KnowledgeModel(prior_model, u0, tspan, datasize) -Constructs a `Hybrid` variation of Echo State Networks (ESNs) integrating a knowledge-based model -(`prior_model`) with ESNs for advanced training and prediction in chaotic systems. 
+Constructs a `Hybrid` variation of Echo State Networks (ESNs) [^Pathak2018] +integrating a knowledge-based model (`prior_model`) with ESNs. # Parameters @@ -34,15 +34,7 @@ Constructs a `Hybrid` variation of Echo State Networks (ESNs) integrating a know - `tspan`: Time span as a tuple, indicating the duration for model operation. - `datasize`: The size of the data to be processed. -# Returns - - - A `Hybrid` struct instance representing the combined ESN and knowledge-based model. - -This method is effective for chaotic processes as highlighted in [^Pathak]. - -Reference: - -[^Pathak]: Jaideep Pathak et al. +[^Pathak2018]: Jaideep Pathak et al. "Hybrid Forecasting of Chaotic Processes: Using Machine Learning in Conjunction with a Knowledge-Based Model" (2018). """ @@ -59,11 +51,7 @@ end HybridESN(model, train_data, in_size, res_size; kwargs...) Construct a Hybrid Echo State Network (ESN) model that integrates -traditional Echo State Networks with a predefined knowledge model for -enhanced performance on chaotic systems or complex datasets. This -constructor allows for the creation of a customized ESN architecture by -specifying the reservoir size, input size, and various other parameters that -influence the network's behavior and learning capacity. +traditional Echo State Networks with a predefined knowledge model [^Pathak2018]. # Parameters @@ -79,41 +67,29 @@ influence the network's behavior and learning capacity. # Optional Keyword Arguments - - `input_layer`: A function to initialize the input matrix. Default is `scaled_rand`. - - `reservoir`: A function to initialize the reservoir matrix. Default is `rand_sparse`. - - `bias`: A function to initialize the bias vector. Default is `zeros64`. - - `reservoir_driver`: The driving system for the reservoir. Default is an RNN model. + - `input_layer`: A function to initialize the input matrix. + Default is `scaled_rand`. + - `reservoir`: A function to initialize the reservoir matrix. + Default is `rand_sparse`. + - `bias`: A function to initialize the bias vector. + Default is `zeros32`. + - `reservoir_driver`: The driving system for the reservoir. + Default is an RNN model. - `nla_type`: The type of non-linear activation used in the reservoir. Default is `NLADefault()`. - - `states_type`: Defines the type of states used in the ESN (e.g., standard states). - Default is `StandardStates()`. - - `washout`: The number of initial timesteps to be discarded in the ESN's training phase. - Default is 0. - - `rng`: Random number generator used for initializing weights. Default is the package's - default random number generator. - - `T`: The data type for the matrices (e.g., `Float32`). Influences computational - efficiency and precision. - - `matrix_type`: The type of matrix used for storing the training data. Default is - inferred from `train_data`. - -# Returns - - - A `HybridESN` instance configured according to the provided parameters and - suitable for further training and prediction tasks. - -# Example - -```julia -# Define a KnowledgeModel -km = KnowledgeModel(prior_model_function, u0, (0, 100), 1000) - -# Create a HybridESN -hesn = HybridESN(km, train_data, 10, 100; washout=100) - -# Train and predict -train(hesn, target_data) -prediction = hesn(prediction_object, output_layer) -``` + - `states_type`: Defines the type of states used in the + ESN. Default is `StandardStates()`. + - `washout`: The number of initial timesteps to be + discarded in the ESN's training phase. Default is 0. 
+ - `rng`: Random number generator used for initializing weights. + Default is `Utils.default_rng()`. + - `T`: The data type for the matrices (e.g., `Float32`). + - `matrix_type`: The type of matrix used for storing the training data. + Default is inferred from `train_data`. + +[^Pathak2018]: Jaideep Pathak et al. + "Hybrid Forecasting of Chaotic Processes: + Using Machine Learning in Conjunction with a Knowledge-Based Model" (2018). """ function HybridESN(model, train_data, @@ -133,7 +109,7 @@ function HybridESN(model, if states_type isa AbstractPaddedStates in_size = size(train_data, 1) + 1 - train_data = vcat(Adapt.adapt(matrix_type, ones(1, size(train_data, 2))), + train_data = vcat(adapt(matrix_type, ones(1, size(train_data, 2))), train_data) else in_size = size(train_data, 1) diff --git a/src/predict.jl b/src/predict.jl index 46b89076..ca0f782c 100644 --- a/src/predict.jl +++ b/src/predict.jl @@ -1,3 +1,71 @@ +abstract type AbstractOutputLayer end +abstract type AbstractPrediction end + +#general output layer struct +struct OutputLayer{T, I, S, L} <: AbstractOutputLayer + training_method::T + output_matrix::I + out_size::S + last_value::L +end + +function Base.show(io::IO, ol::OutputLayer) + print(io, "OutputLayer successfully trained with output size: ", ol.out_size) +end + +#prediction types +""" + Generative(prediction_len) + +A prediction strategy that enables models to generate autonomous multi-step +forecasts by recursively feeding their own outputs back as inputs for +subsequent prediction steps. + +# Parameters + + - `prediction_len`: The number of future steps to predict. + +# Description + +The `Generative` prediction method allows a model to perform multi-step +forecasting by using its own previous predictions as inputs for future predictions. + +At each step, the model takes the current input, generates a prediction, +and then incorporates that prediction into the input for the next step. +This recursive process continues until the specified +number of prediction steps (`prediction_len`) is reached. +""" +struct Generative{T} <: AbstractPrediction + prediction_len::T +end + +struct Predictive{I, T} <: AbstractPrediction + prediction_data::I + prediction_len::T +end + +""" + Predictive(prediction_data) + +A prediction strategy for supervised learning tasks, +where a model predicts labels based on a provided set +of input features (`prediction_data`). + +# Parameters + + - `prediction_data`: The input data used for prediction, `feature` x `sample` + +# Description + +The `Predictive` prediction method uses the provided input data +(`prediction_data`) to produce corresponding labels or outputs based +on the learned relationships in the model. +""" +function Predictive(prediction_data) + prediction_len = size(prediction_data, 2) + Predictive(prediction_data, prediction_len) +end + function obtain_prediction(rc::AbstractReservoirComputer, prediction::Generative, x, @@ -48,7 +116,7 @@ end #single matrix for other training methods function output_storing(training_method, out_size, prediction_len, storing_type) - return Adapt.adapt(storing_type, zeros(out_size, prediction_len)) + return adapt(storing_type, zeros(out_size, prediction_len)) end #general storing -> single matrix
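Putting the prediction types above together with the hybrid model from `hybridesn.jl`, a usage sketch might look like the following. The placeholder names `prior_model_function`, `u0`, `train_data`, `target_data`, and `test_features` follow the example that this diff removes and are assumed to be defined by the user; the sketch also assumes `train` returns the fitted `OutputLayer`, as the removed example implies.

```julia
using ReservoirComputing

km = KnowledgeModel(prior_model_function, u0, (0, 100), 1000)
hesn = HybridESN(km, train_data, 10, 100; washout=100)

output_layer = train(hesn, target_data)

# autonomous forecast: 500 steps, each prediction fed back as the next input
forecast = hesn(Generative(500), output_layer)

# one-shot prediction on fixed features (a feature x sample matrix)
labels = hesn(Predictive(test_features), output_layer)
```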