diff --git a/Jenkinsfile b/Jenkinsfile index c80ce66253..b423eb5590 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,4 +1,4 @@ @Library('pipeline-library')_ VitisLibPipeline (branch: 'master', libname: 'xf_blas', - email: 'amr@xilinx.com', devtest: 'RunBLAS.sh', TOOLVERSION: '2019.1_release') + email: 'amr@xilinx.com', devtest: 'RunBLAS.sh', TOOLVERSION: '2019.2_released') diff --git a/L1/include/hw/xf_blas/gemm.hpp b/L1/include/hw/xf_blas/gemm.hpp new file mode 100644 index 0000000000..4fdc7d4bc2 --- /dev/null +++ b/L1/include/hw/xf_blas/gemm.hpp @@ -0,0 +1,108 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef XF_BLAS_GEMM_HPP +#define XF_BLAS_GEMM_HPP + +#ifndef __cplusplus +#error "BLAS Library only works with C++." 
+#endif + +#include "ap_int.h" +#include "hls_stream.h" +#include "xf_blas/helpers.hpp" +#include "scal.hpp" +#include "axpy.hpp" + +namespace xf { + +namespace blas { + +template +class SystolicArray { + public: + static void process_dsp(unsigned int p_k, + hls::stream >& p_As, + hls::stream >& p_Bs, + hls::stream >& p_sum, + unsigned int p_multi = 1) { +#ifndef __SYNTHESIS__ + assert(p_k >= t_M + t_N); +#endif + + WideType l_winA[t_M]; +#pragma HLS ARRAY_PARTITION variable = l_winA dim = 0 complete + WideType l_winB[t_N]; +#pragma HLS ARRAY_PARTITION variable = l_winB dim = 0 complete + + WideType l_C[t_M]; +#pragma HLS ARRAY_PARTITION variable = l_C dim = 0 complete + WideType l_Co[t_M]; +#pragma HLS ARRAY_PARTITION variable = l_Co dim = 0 complete + + for (int k = 0, l = 0; l < p_multi * p_k + t_M + t_N; l++, k++) { +#pragma HLS PIPELINE + if (k == p_k) { + k = 0; + } + + if (l > p_k && k >= t_N && k < t_M + t_N) { + p_sum.write(l_Co[k - t_N]); + } + + WideType l_A = 0; + WideType l_B = 0; + + if (l < p_multi * p_k) { + l_A = p_As.read(); + l_B = p_Bs.read(); + } + + for (int j = 0; j < t_M; j++) l_winA[j].shift(l_A[j]); + for (int j = 0; j < t_N; j++) l_winB[j].shift(l_B[j]); + for (int m = 0; m < t_M; m++) { + for (int n = 0; n < t_N; n++) { + int l_id = m + n; + if (l_id == k) { + l_Co[m][n] = l_C[m][n]; + l_C[m][n] = 0; + } + l_C[m][n] += l_winA[m][l_id] * l_winB[n][l_id]; + } + } + } + } +}; + +template +void gemm(const unsigned int p_k, + hls::stream >& p_A, + hls::stream >& p_B, + hls::stream >& p_C, + const unsigned int p_r = 1) { +#pragma HLS DATAFLOW + SystolicArray::process_dsp(p_k, p_A, p_B, p_C, p_r); +} + +} // end namespace blas + +} // end namespace xf + +#endif diff --git a/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp b/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp index 3e6abfa992..5c7ba40924 100644 --- a/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp +++ b/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp 
@@ -92,30 +92,54 @@ template void transpMatBlocks(unsigned int p_blocks, hls::stream >& p_in, hls::stream >& p_out) { - t_DataType l_buf[t_ParEntries][t_ParEntries]; + t_DataType l_buf[2][t_ParEntries][t_ParEntries]; #pragma HLS ARRAY_PARTITION variable = l_buf complete dim = 0 - for (unsigned int l_block = 0; l_block < p_blocks; ++l_block) { - // shuffle and store - for (unsigned int i = 0; i < t_ParEntries; ++i) { + + for (int i = 0; i < t_ParEntries; ++i) { #pragma HLS PIPELINE - WideType l_val; + WideType l_val; #pragma HLS ARRAY_PARTITION variable = l_val complete - l_val = p_in.read(); - for (unsigned int j = 0; j < t_ParEntries; ++j) { - l_buf[i][j] = l_val[j]; - } + l_val = p_in.read(); + for (int j = 0; j < t_ParEntries; ++j) { + l_buf[0][i][j] = l_val[j]; } + } - for (unsigned int i = 0; i < t_ParEntries; ++i) { + for (unsigned int l_block = 1; l_block < p_blocks; ++l_block) { + int jIn = 0, jOut = 0; + do { #pragma HLS PIPELINE - WideType l_val; -#pragma HLS ARRAY_PARTITION variable = l_val complete - for (unsigned int j = 0; j < t_ParEntries; ++j) { - l_val[j] = l_buf[j][i]; + WideType l_valIn; +#pragma HLS ARRAY_PARTITION variable = l_valIn complete + WideType l_valOut; +#pragma HLS ARRAY_PARTITION variable = l_valOut complete + if (p_in.read_nb(l_valIn)) { + for (int k = 0; k < t_ParEntries; ++k) { + l_buf[l_block % 2][jIn][k] = l_valIn[k]; + } + jIn++; } - p_out.write(l_val); - } + for (int k = 0; k < t_ParEntries; ++k) { + l_valOut[k] = l_buf[(l_block - 1) % 2][k][jOut]; + } + if (jOut < t_ParEntries) { + p_out.write(l_valOut); + jOut++; + } + } while ((jIn < t_ParEntries) || (jOut < t_ParEntries)); } + + int i = 0; + do { +#pragma HLS PIPELINE + WideType l_valOut; +#pragma HLS ARRAY_PARTITION variable = l_valOut complete + for (int j = 0; j < t_ParEntries; ++j) { + l_valOut[j] = l_buf[(p_blocks - 1) % 2][j][i]; + } + p_out.write(l_valOut); + i++; + } while (i < t_ParEntries); } template diff --git 
a/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp b/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp index dae62dc15e..b6f2d34530 100644 --- a/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp +++ b/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp @@ -110,6 +110,7 @@ void readVec2Stream(t_DataType* p_in, unsigned int p_n, hls::stream l_bitConv; WideType l_val; +#pragma HLS ARRAY_PARTITION variable = l_val complete for (unsigned int j = 0; j < t_ParEntries; ++j) { l_val[j] = p_in[i * t_ParEntries + j]; } diff --git a/L1/include/hw/xf_blas/helpers/utils/types.hpp b/L1/include/hw/xf_blas/helpers/utils/types.hpp index 44de080017..4a5d0da94a 100644 --- a/L1/include/hw/xf_blas/helpers/utils/types.hpp +++ b/L1/include/hw/xf_blas/helpers/utils/types.hpp @@ -45,66 +45,197 @@ class WideType { static const unsigned int FLOAT_WIDTH = 7; public: + static const unsigned int t_TypeWidth = t_Width * t_DataWidth; + typedef ap_uint t_TypeInt; typedef T DataType; static const unsigned int t_WidthS = t_Width; static const unsigned int t_per4k = t_4k / t_DataWidth / t_Width * 8; public: - T& getVal(unsigned int i) { return (m_Val[i]); } - T& operator[](unsigned int p_Idx) { return (m_Val[p_Idx]); } + T& getVal(unsigned int i) { +#ifndef __SYNTHESIS__ + assert(i < t_Width); +#endif + return (m_Val[i]); + } + T& operator[](unsigned int p_Idx) { +#ifndef __SYNTHESIS__ + assert(p_Idx < t_Width); +#endif + return (m_Val[p_Idx]); + } + const T& operator[](unsigned int p_Idx) const { +#ifndef __SYNTHESIS__ + assert(p_Idx < t_Width); +#endif + return (m_Val[p_Idx]); + } T* getValAddr() { return (&m_Val[0]); } - WideType() {} - WideType(T p_initScalar) { -#pragma HLS inline self + + WideType() { +#pragma HLS ARRAY_PARTITION variable = m_Val complete dim = 1 + } + + WideType(const WideType& wt) { +#pragma HLS ARRAY_PARTITION variable = m_Val complete dim = 1 + for (int i = 0; i < t_Width; i++) +#pragma HLS UNROLL + m_Val[i] = wt[i]; + } + + WideType(const t_TypeInt& 
p_val) { +#pragma HLS ARRAY_PARTITION variable = m_Val complete dim = 1 + for (int i = 0; i < t_Width; ++i) { +#pragma HLS UNROLL + ap_uint l_val = p_val.range(t_DataWidth * (1 + i) - 1, t_DataWidth * i); + m_Val[i] = *reinterpret_cast(&l_val); + } + } + + WideType(const T p_initScalar) { +#pragma HLS ARRAY_PARTITION variable = m_Val complete dim = 1 + for (int i = 0; i < t_Width; ++i) { +#pragma HLS UNROLL + m_Val[i] = p_initScalar; + } + } + + operator const t_TypeInt() { +#pragma HLS ARRAY_PARTITION variable = m_Val complete dim = 1 + t_TypeInt l_fVal; for (int i = 0; i < t_Width; ++i) { #pragma HLS UNROLL - getVal(i) = p_initScalar; + T l_v = m_Val[i]; + ap_uint l_val = *reinterpret_cast*>(&l_v); + l_fVal.range(t_DataWidth * (1 + i) - 1, t_DataWidth * i) = l_val; } + return l_fVal; } + T shift(T p_ValIn) { -#pragma HLS inline self T l_valOut = m_Val[t_Width - 1]; WIDE_TYPE_SHIFT: for (int i = t_Width - 1; i > 0; --i) { +#pragma HLS UNROLL T l_val = m_Val[i - 1]; m_Val[i] = l_val; } m_Val[0] = p_ValIn; return (l_valOut); } + T shift() { -#pragma HLS inline self T l_valOut = m_Val[t_Width - 1]; - WIDE_TYPE_SHIFT: for (int i = t_Width - 1; i > 0; --i) { +#pragma HLS UNROLL T l_val = m_Val[i - 1]; m_Val[i] = l_val; } return (l_valOut); } + T unshift() { -#pragma HLS inline self T l_valOut = m_Val[0]; - WIDE_TYPE_SHIFT: for (int i = 0; i < t_Width - 1; ++i) { +#pragma HLS UNROLL T l_val = m_Val[i + 1]; m_Val[i] = l_val; } return (l_valOut); } + + T unshift(const T p_val) { + T l_valOut = m_Val[0]; + for (int i = 0; i < t_Width - 1; ++i) { +#pragma HLS UNROLL + T l_val = m_Val[i + 1]; + m_Val[i] = l_val; + } + m_Val[t_Width - 1] = p_val; + return (l_valOut); + } + static const WideType zero() { WideType l_zero; for (int i = 0; i < t_Width; ++i) { +#pragma HLS UNROLL l_zero[i] = 0; } return (l_zero); } + static unsigned int per4k() { return (t_per4k); } void print(std::ostream& os) { for (int i = 0; i < t_Width; ++i) { - os << std::setw(FLOAT_WIDTH) << getVal(i) << " 
"; + os << std::setw(FLOAT_WIDTH) << m_Val[i] << " "; } } + + friend std::ostream& operator<<(std::ostream& os, WideType& p_Val) { + p_Val.print(os); + return (os); + } +}; + +template +class WideType { + private: + T m_Val; + static const unsigned int t_4k = 4096; + static const unsigned int FLOAT_WIDTH = 7; + + public: + static const unsigned int t_TypeWidth = t_DataWidth; + typedef T t_TypeInt; + typedef T DataType; + static const unsigned int t_WidthS = 1; + static const unsigned int t_per4k = t_4k / t_DataWidth * 8; + + public: + T& operator[](unsigned int p_Idx) { +#ifndef __SYNTHESIS__ + assert(p_Idx == 0); +#endif + return m_Val; + } + + const T& operator[](unsigned int p_Idx) const { +#ifndef __SYNTHESIS__ + assert(p_Idx == 0); +#endif + return m_Val; + } + + T* getValAddr() { return (&m_Val); } + + WideType() {} + + WideType(const WideType& wt) { m_Val = wt[0]; } + + WideType(const T p_initScalar) { m_Val = p_initScalar; } + + operator const t_TypeInt() { return m_Val; } + + T shift(T p_ValIn) { + T l_valOut = m_Val; + m_Val = p_ValIn; + return l_valOut; + } + T shift() { return m_Val; } + + T unshift() { return m_Val; } + + T unshift(T p_ValIn) { + T l_valOut = m_Val; + m_Val = p_ValIn; + return l_valOut; + } + + static const WideType zero() { return WideType(0); } + + static unsigned int per4k() { return (t_per4k); } + void print(std::ostream& os) { os << std::setw(FLOAT_WIDTH) << m_Val << " "; } + friend std::ostream& operator<<(std::ostream& os, WideType& p_Val) { p_Val.print(os); return (os); @@ -251,6 +382,44 @@ inline double BitConv::toType(BitConv::BitsType p_Val) { return (u.f); } +template +ap_uint convWideVal2Bits(WideType p_val) { +#pragma HLS inline +#ifndef __SYNTHESIS__ + assert((t_Bits > t_Width) && (t_Bits % t_Width == 0)); +#endif + const unsigned int t_DataBits = sizeof(t_DataType) * 8; + const unsigned int t_ResEntryBits = t_Bits / t_Width; + ap_uint l_res; + for (unsigned int i = 0; i < t_Width; ++i) { +#pragma HLS UNROLL + BitConv 
l_bitConv; + ap_uint l_datBits = l_bitConv.toBits(p_val[i]); + ap_uint l_resEntry = l_datBits; + l_res.range((i + 1) * t_ResEntryBits - 1, i * t_ResEntryBits) = l_resEntry; + } + return l_res; +} + +template +WideType convBits2WideType(ap_uint p_bits) { +#pragma HLS inline +#ifndef __SYNTHESIS__ + assert((t_Bits > t_Width) && (t_Bits % t_Width == 0)); +#endif + const unsigned int t_DataBits = sizeof(t_DataType) * 8; + const unsigned int t_InEntryBits = t_Bits / t_Width; + WideType l_res; + for (unsigned int i = 0; i < t_Width; ++i) { +#pragma HLS UNROLL + BitConv l_bitConv; + ap_uint l_inDatBits = p_bits.range((i + 1) * t_InEntryBits - 1, i * t_InEntryBits); + ap_uint l_datBits = l_inDatBits; + t_DataType l_val = l_bitConv.toType(l_datBits); + l_res[i] = l_val; + } + return l_res; +} // Type converter - for vectors of different lengths and types template class WideConv { diff --git a/L1/tests/sw/include/utils.hpp b/L1/tests/sw/include/utils.hpp index 610e69176f..b83718b8cd 100644 --- a/L1/tests/sw/include/utils.hpp +++ b/L1/tests/sw/include/utils.hpp @@ -43,7 +43,7 @@ bool compare(double x, double ref) { template <> bool compare(float x, float ref) { bool l_exactMatch; - return isClose(2e-3, 3e-6, x, ref, l_exactMatch); + return isClose(1e-3, 3e-6, x, ref, l_exactMatch); } template @@ -64,4 +64,31 @@ bool compare(unsigned int n, T* x, T* ref) { } return l_ret; } + +template +bool compare(unsigned int n, T* x, T* ref, int& err) { + bool l_ret = true; + try { + if (ref == nullptr) { + if (x == nullptr) return true; + for (int i = 0; i < n; i++) { + if (!compare(x[i], (T)0)) { + err++; + l_ret = false; + } + } + } else { + for (int i = 0; i < n; i++) { + if (!compare(x[i], ref[i])) { + l_ret = false; + err++; + } + } + } + } catch (exception& e) { + std::cout << "Exception happend: " << e.what() << std::endl; + return false; + } + return l_ret; +} #endif diff --git a/L1/tests/sw/python/run_test.py b/L1/tests/sw/python/run_test.py index 7e40a1ca64..c35f90aae8 100755 
--- a/L1/tests/sw/python/run_test.py +++ b/L1/tests/sw/python/run_test.py @@ -97,6 +97,11 @@ def main(profileList, args): else: statPath = os.path.join(os.getcwd(),"statistics_%d.rpt"%args.id) list2File(statList, statPath) + failures = [k for k in statList if k['Status'] == 'Failed'] + if len(failures) !=0 : + sys.exit(1) + else: + sys.exit(0) if __name__== "__main__": parser = argparse.ArgumentParser(description='Generate random vectors and run test.') diff --git a/L3/README.md b/L3/README.md index c405f8eb4d..58fc10c936 100644 --- a/L3/README.md +++ b/L3/README.md @@ -1,2 +1,2 @@ -# Level 3: Scout software APIs -This directory contains software libraries and APIs for Sount software users. +# Level 3: Vitis software APIs +This directory contains software libraries and APIs for Vitis software users. diff --git a/L3/benchmarks/bench_helper.hpp b/L3/benchmarks/bench_helper.hpp index d025f27066..e6c5bc0636 100644 --- a/L3/benchmarks/bench_helper.hpp +++ b/L3/benchmarks/bench_helper.hpp @@ -62,7 +62,7 @@ float getBoardFreqMHz(string xclbin) { } } if (l_freq == -1) { - // if xbutil does not work, user could put the XOCC achieved kernel frequcy here + // if xbutil does not work, user could put the Vitis achieved kernel frequency here l_freq = 250; std::cout << "INFO: Failed to get board frequency by xclbinutil. 
This is normal for cpu and hw emulation, " "using 250 MHz for reporting.\n"; diff --git a/L3/benchmarks/gemm/data/float/matA_in_1024_1024.bin b/L3/benchmarks/gemm/data/float/matA_in_1024_1024.bin index f7b65812a9..bc739fef9d 100644 Binary files a/L3/benchmarks/gemm/data/float/matA_in_1024_1024.bin and b/L3/benchmarks/gemm/data/float/matA_in_1024_1024.bin differ diff --git a/L3/benchmarks/gemm/data/float/matA_in_256_256.bin b/L3/benchmarks/gemm/data/float/matA_in_256_256.bin index 6cd5b91dbb..d58f0d3265 100644 Binary files a/L3/benchmarks/gemm/data/float/matA_in_256_256.bin and b/L3/benchmarks/gemm/data/float/matA_in_256_256.bin differ diff --git a/L3/benchmarks/gemm/data/float/matA_in_512_512.bin b/L3/benchmarks/gemm/data/float/matA_in_512_512.bin index d809dcac41..1d3f387513 100644 Binary files a/L3/benchmarks/gemm/data/float/matA_in_512_512.bin and b/L3/benchmarks/gemm/data/float/matA_in_512_512.bin differ diff --git a/L3/benchmarks/gemm/data/float/matB_in_1024_1024.bin b/L3/benchmarks/gemm/data/float/matB_in_1024_1024.bin index f7b65812a9..bc739fef9d 100644 Binary files a/L3/benchmarks/gemm/data/float/matB_in_1024_1024.bin and b/L3/benchmarks/gemm/data/float/matB_in_1024_1024.bin differ diff --git a/L3/benchmarks/gemm/data/float/matB_in_256_256.bin b/L3/benchmarks/gemm/data/float/matB_in_256_256.bin index 6cd5b91dbb..d58f0d3265 100644 Binary files a/L3/benchmarks/gemm/data/float/matB_in_256_256.bin and b/L3/benchmarks/gemm/data/float/matB_in_256_256.bin differ diff --git a/L3/benchmarks/gemm/data/float/matB_in_512_512.bin b/L3/benchmarks/gemm/data/float/matB_in_512_512.bin index d809dcac41..1d3f387513 100644 Binary files a/L3/benchmarks/gemm/data/float/matB_in_512_512.bin and b/L3/benchmarks/gemm/data/float/matB_in_512_512.bin differ diff --git a/L3/benchmarks/gemm/data/float/matC_in_1024_1024.bin b/L3/benchmarks/gemm/data/float/matC_in_1024_1024.bin index f7b65812a9..bc739fef9d 100644 Binary files a/L3/benchmarks/gemm/data/float/matC_in_1024_1024.bin and 
b/L3/benchmarks/gemm/data/float/matC_in_1024_1024.bin differ diff --git a/L3/benchmarks/gemm/data/float/matC_in_256_256.bin b/L3/benchmarks/gemm/data/float/matC_in_256_256.bin index 6cd5b91dbb..d58f0d3265 100644 Binary files a/L3/benchmarks/gemm/data/float/matC_in_256_256.bin and b/L3/benchmarks/gemm/data/float/matC_in_256_256.bin differ diff --git a/L3/benchmarks/gemm/data/float/matC_in_512_512.bin b/L3/benchmarks/gemm/data/float/matC_in_512_512.bin index d809dcac41..1d3f387513 100644 Binary files a/L3/benchmarks/gemm/data/float/matC_in_512_512.bin and b/L3/benchmarks/gemm/data/float/matC_in_512_512.bin differ diff --git a/L3/benchmarks/gemm/data/float/matC_out_256_256.bin b/L3/benchmarks/gemm/data/float/matC_out_256_256.bin index 3461c3bc55..a4509a7547 100644 Binary files a/L3/benchmarks/gemm/data/float/matC_out_256_256.bin and b/L3/benchmarks/gemm/data/float/matC_out_256_256.bin differ diff --git a/L3/benchmarks/gemm/data/float/matC_out_512_512.bin b/L3/benchmarks/gemm/data/float/matC_out_512_512.bin index f62f7a2d77..15cb2c2767 100644 Binary files a/L3/benchmarks/gemm/data/float/matC_out_512_512.bin and b/L3/benchmarks/gemm/data/float/matC_out_512_512.bin differ diff --git a/L3/benchmarks/gemm/gemm_bench.cpp b/L3/benchmarks/gemm/gemm_bench.cpp index 9783b8bccc..0df5f02517 100644 --- a/L3/benchmarks/gemm/gemm_bench.cpp +++ b/L3/benchmarks/gemm/gemm_bench.cpp @@ -50,19 +50,13 @@ void readBin(char* mat, unsigned int row, unsigned int col, string dataDir, stri int main(int argc, char** argv) { if (argc < 3) { cerr << " usage: \n" - << " gemm_bench.exe gemx.xclbin config_info.dat m k n data_dir iteration\n" + << " gemm_bench.exe gemx.xclbin config_info.dat m k n data_dir\n" << " gemm_bench.exe gemx.xclbin config_info.dat\n"; return EXIT_FAILURE; } unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string l_configFile(argv[l_argIdx++]); - string l_logFile; - - ofstream logFile("xrt_report.txt"); - logFile.close(); - l_logFile = "xrt_report.txt"; - int 
m = 256; int k = 256; int n = 256; @@ -91,12 +85,6 @@ int main(int argc, char** argv) { #endif } - int iteration = 5; - if (argc >= 9) { - iteration = stoi(argv[l_argIdx++]); - cout << "Read custom iteration: " << iteration << endl; - } - int i, j; // i-row l_numKernel -1 ,j- column l_numKernel -1 vector goldenC; @@ -112,8 +100,7 @@ int main(int argc, char** argv) { TimePointType l_tp_create_time; l_tp_start_time = chrono::high_resolution_clock::now(); xfblasEngine_t engineName = XFBLAS_ENGINE_GEMM; - xfblasStatus_t status = - xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + xfblasStatus_t status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); showTimeData("xfblasCreate", l_tp_start_time, l_tp_create_time); @@ -122,94 +109,83 @@ int main(int argc, char** argv) { TimePointType l_tp_loop[3]; chrono::duration l_timeApiSum = chrono::seconds(0); - for (int i = 0; i < iteration; i++) { - vector a, b, c; - - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { - XFBLAS_dataType *tmp_a, *tmp_b, *tmp_c; - posix_memalign((void**)&tmp_a, 4096, m * k * sizeof(XFBLAS_dataType)); - memset(tmp_a, 0, m * k); - posix_memalign((void**)&tmp_b, 4096, k * n * sizeof(XFBLAS_dataType)); - memset(tmp_b, 0, k * n); - posix_memalign((void**)&tmp_c, 4096, m * n * sizeof(XFBLAS_dataType)); - memset(tmp_c, 0, m * n); - readBin((char*)tmp_a, m, k, data_dir, "matA_in_", sizeof(XFBLAS_dataType)); - readBin((char*)tmp_b, k, n, data_dir, "matB_in_", sizeof(XFBLAS_dataType)); - readBin((char*)tmp_c, m, n, data_dir, "matC_in_", sizeof(XFBLAS_dataType)); - a.push_back(tmp_a); - b.push_back(tmp_b); - c.push_back(tmp_c); - } + vector a, b, c; - unsigned int l_tpIdx = 0; - l_tp_loop[l_tpIdx] = chrono::high_resolution_clock::now(); - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { - status = xfblasMallocRestricted(m, k, sizeof(XFBLAS_dataType), a[kernelIndex], k, kernelIndex); - status = 
xfblasMallocRestricted(k, n, sizeof(XFBLAS_dataType), b[kernelIndex], n, kernelIndex); - status = xfblasMallocRestricted(m, n, sizeof(XFBLAS_dataType), c[kernelIndex], n, kernelIndex); -#ifdef XFBLAS_LAUNCH_ASYNC - xfblasSetMatrixRestrictedAsync(a[kernelIndex], kernelIndex); - xfblasSetMatrixRestrictedAsync(b[kernelIndex], kernelIndex); - xfblasSetMatrixRestrictedAsync(c[kernelIndex], kernelIndex); -#else - status = xfblasSetMatrixRestricted(a[kernelIndex], kernelIndex); - status = xfblasSetMatrixRestricted(b[kernelIndex], kernelIndex); - status = xfblasSetMatrixRestricted(c[kernelIndex], kernelIndex); -#endif - } -#ifdef XFBLAS_LAUNCH_ASYNC - xfblasKernelSynchronize(); -#endif - showTimeData("copyToFpga", l_tp_loop[l_tpIdx], l_tp_loop[l_tpIdx + 1]); - l_tpIdx++; + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + XFBLAS_dataType *tmp_a, *tmp_b, *tmp_c; + posix_memalign((void**)&tmp_a, 4096, m * k * sizeof(XFBLAS_dataType)); + memset(tmp_a, 0, m * k); + posix_memalign((void**)&tmp_b, 4096, k * n * sizeof(XFBLAS_dataType)); + memset(tmp_b, 0, k * n); + posix_memalign((void**)&tmp_c, 4096, m * n * sizeof(XFBLAS_dataType)); + memset(tmp_c, 0, m * n); + readBin((char*)tmp_a, m, k, data_dir, "matA_in_", sizeof(XFBLAS_dataType)); + readBin((char*)tmp_b, k, n, data_dir, "matB_in_", sizeof(XFBLAS_dataType)); + readBin((char*)tmp_c, m, n, data_dir, "matC_in_", sizeof(XFBLAS_dataType)); + a.push_back(tmp_a); + b.push_back(tmp_b); + c.push_back(tmp_c); + } - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { - status = xfblasGemm(XFBLAS_OP_N, XFBLAS_OP_N, m, n, k, 1, a[kernelIndex], k, b[kernelIndex], n, 1, - c[kernelIndex], n, kernelIndex); - } + unsigned int l_tpIdx = 0; + l_tp_loop[l_tpIdx] = chrono::high_resolution_clock::now(); + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + status = xfblasMallocRestricted(m, k, sizeof(XFBLAS_dataType), a[kernelIndex], k, kernelIndex); + status = xfblasMallocRestricted(k, 
n, sizeof(XFBLAS_dataType), b[kernelIndex], n, kernelIndex); + status = xfblasMallocRestricted(m, n, sizeof(XFBLAS_dataType), c[kernelIndex], n, kernelIndex); + status = xfblasSetMatrixRestricted(a[kernelIndex], kernelIndex); + status = xfblasSetMatrixRestricted(b[kernelIndex], kernelIndex); + status = xfblasSetMatrixRestricted(c[kernelIndex], kernelIndex); + } + + showTimeData("copyToFpga", l_tp_loop[l_tpIdx], l_tp_loop[l_tpIdx + 1]); + l_tpIdx++; + + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + status = xfblasGemm(XFBLAS_OP_N, XFBLAS_OP_N, m, n, k, 1, a[kernelIndex], k, b[kernelIndex], n, 1, + c[kernelIndex], n, kernelIndex); + } - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { #ifdef XFBLAS_LAUNCH_ASYNC - xfblasGetMatrixRestrictedAsync(c[kernelIndex], kernelIndex); + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + xfblasGetMatrixRestrictedAsync(c[kernelIndex], kernelIndex); + } + xfblasKernelSynchronize(); + #else - status = xfblasGetMatrixRestricted(c[kernelIndex], kernelIndex); -#endif - } -#ifdef XFBLAS_LAUNCH_ASYNC - xfblasKernelSynchronize(); + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + status = xfblasGetMatrixRestricted(c[kernelIndex], kernelIndex); + } #endif - showTimeData("copyFromFpga", l_tp_loop[l_tpIdx], l_tp_loop[l_tpIdx + 1]); - l_tpIdx++; + showTimeData("copyFromFpga", l_tp_loop[l_tpIdx], l_tp_loop[l_tpIdx + 1]); + l_tpIdx++; - if (i == iteration - 1) { - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { - XFBLAS_dataType* tmp_c; - posix_memalign((void**)&tmp_c, 4096, m * n * sizeof(XFBLAS_dataType)); - memcpy(tmp_c, c[kernelIndex], m * n * sizeof(XFBLAS_dataType)); - resultC.push_back(tmp_c); - } - } + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + XFBLAS_dataType* tmp_c; + posix_memalign((void**)&tmp_c, 4096, m * n * sizeof(XFBLAS_dataType)); + memcpy(tmp_c, c[kernelIndex], m * n * 
sizeof(XFBLAS_dataType)); + resultC.push_back(tmp_c); + } - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { - xfblasFree(a[kernelIndex], kernelIndex); - xfblasFree(b[kernelIndex], kernelIndex); - xfblasFree(c[kernelIndex], kernelIndex); - } - xfblasFreeInstr(); - chrono::duration l_timeApiLoop = l_tp_loop[l_tpIdx] - l_tp_loop[0]; - l_timeApiSum = l_timeApiSum + l_timeApiLoop; - - for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { - free(a[kernelIndex]); - free(b[kernelIndex]); - free(c[kernelIndex]); - } - a.clear(); - b.clear(); - c.clear(); + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + xfblasFree(a[kernelIndex], kernelIndex); + xfblasFree(b[kernelIndex], kernelIndex); + xfblasFree(c[kernelIndex], kernelIndex); } - chrono::duration l_timeApi = l_timeApiSum / iteration; + chrono::duration l_timeApiLoop = l_tp_loop[l_tpIdx] - l_tp_loop[0]; + l_timeApiSum = l_timeApiSum + l_timeApiLoop; + + for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { + free(a[kernelIndex]); + free(b[kernelIndex]); + free(c[kernelIndex]); + } + a.clear(); + b.clear(); + c.clear(); + + chrono::duration l_timeApi = l_timeApiSum; double l_timeMs = l_timeApi.count() * 1e3; cout << "Api time is " << fixed << setprecision(6) << l_timeMs << " msec\n"; @@ -238,12 +214,6 @@ int main(int argc, char** argv) { } else { cout << "Test failed!\n"; } - for (i = 0; i < 10; i++) { - for (j = 0; j < 10; j++) { - cout << (resultC[kernelIndex][IDX2R(i, j, k)]) << " "; - } - cout << "\n"; - } } for (int kernelIndex = 0; kernelIndex < l_numKernel; kernelIndex++) { diff --git a/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_gen_bin.cpp b/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_gen_bin.cpp index ae1d03d60b..85528d748d 100644 --- a/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_gen_bin.cpp +++ b/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_gen_bin.cpp @@ -81,7 +81,7 @@ int main(int argc, char** argv) { #endif // Generating Golden Output - GEMM_MKL(m, k, n, alpha, 
beta, a, b, c); + GEMM_MKL(m, n, k, alpha, beta, a, b, c); #ifdef USE_SHORT for (int i = 0; i < m * n; i++) c_short[i] = (short)c[i]; diff --git a/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_helper.hpp b/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_helper.hpp index ea5504ed3a..a1fc297692 100644 --- a/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_helper.hpp +++ b/L3/benchmarks/gemm/gemm_mkl/gemm_mkl_helper.hpp @@ -17,6 +17,8 @@ #ifndef GEMM_MKL_HELPER_HPP #define GEMM_MKL_HELPER_HPP +#define IDX2R(i, j, ld) (((i) * (ld)) + (j)) + #include #include #include @@ -63,10 +65,12 @@ XFBLAS_dataType* createMat(int p_rows, int p_cols, bool is_zero = false) { return mat; } -// TODO, implement random input void initMat(XFBLAS_dataType* mat, int p_rows, int p_cols, bool is_zero) { srand(time(NULL)); - for (int j = 0; j < p_rows; j++) - for (int i = 0; i < p_cols; i++) mat[i + (size_t)j * (size_t)p_cols] = (!is_zero & (i == j)) ? 1 : 0; + for (int j = 0; j < p_rows; j++) { + for (int i = 0; i < p_cols; i++) { + mat[IDX2R(j, i, p_cols)] = 1; + } + } } #endif diff --git a/L3/benchmarks/gemm/run_gemm_bench.sh b/L3/benchmarks/gemm/run_gemm_bench.sh index be763aa706..a81082612e 100755 --- a/L3/benchmarks/gemm/run_gemm_bench.sh +++ b/L3/benchmarks/gemm/run_gemm_bench.sh @@ -28,7 +28,7 @@ else while [ $n -le 8192 ]; do date echo "############# $n ################" - nice ./bin/gemm_bench.exe $1 $2 $n $n $n ./data/$dataType/ $numKernels 1 | tee log-$n.txt + nice ./bin/gemm_bench.exe $1 $2 $n $n $n ./data/$dataType/ $numKernels | tee log-$n.txt logs="$logs log-$n.txt" n=`expr $n \* 2` done diff --git a/L3/benchmarks/gemv/gemv_bench.cpp b/L3/benchmarks/gemv/gemv_bench.cpp index 570f40f8c2..73def94080 100644 --- a/L3/benchmarks/gemv/gemv_bench.cpp +++ b/L3/benchmarks/gemv/gemv_bench.cpp @@ -44,12 +44,6 @@ int main(int argc, char** argv) { unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string l_configFile(argv[l_argIdx++]); - string l_logFile; - - ofstream logFile("xrt_report.txt"); - 
logFile.close(); - l_logFile = "xrt_report.txt"; - int l_numKernel = 1; int m = 256; int n = 256; @@ -122,8 +116,7 @@ int main(int argc, char** argv) { l_tp[l_tpIdx] = chrono::high_resolution_clock::now(); xfblasEngine_t engineName = XFBLAS_ENGINE_GEMV; - xfblasStatus_t status = - xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + xfblasStatus_t status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); showTimeData("xfblasCreate", l_tp[l_tpIdx], l_tp[l_tpIdx + 1]); l_tpIdx++; diff --git a/L3/examples/gemm/Makefile b/L3/examples/gemm/Makefile index dd378a9583..6350e4f1b9 100644 --- a/L3/examples/gemm/Makefile +++ b/L3/examples/gemm/Makefile @@ -120,20 +120,10 @@ $(EXE_FILE): $(SRCS) | check_xrt .PHONY: host host: check_xrt $(EXE_FILE) -short: gemm_example.exe - -float: gemm_example_float.exe - common: gemm_common_example.exe pre_allocated: gemm_pre_allocated_example.exe -gemm_example.exe: gemm_example.cpp - $(CC) -D XFBLAS_dataType=short -o $@ $^ $(CXXFLAGS) $(LDFLAGS) - -gemm_example_float.exe: gemm_example.cpp - $(CC) -D XFBLAS_dataType=float -o $@ $^ $(CXXFLAGS) $(LDFLAGS) - gemm_common_example.exe: gemm_common_example.cpp $(CC) -D XFBLAS_dataType=$(XFBLAS_dataType) -o $@ $^ $(CXXFLAGS) $(LDFLAGS) diff --git a/L3/examples/gemm/gemm_common_example.cpp b/L3/examples/gemm/gemm_common_example.cpp index 5ca8dc9c9c..7a05f76fb0 100644 --- a/L3/examples/gemm/gemm_common_example.cpp +++ b/L3/examples/gemm/gemm_common_example.cpp @@ -33,12 +33,6 @@ int main(int argc, char** argv) { unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string l_configFile(argv[l_argIdx++]); - string l_logFile; - - ofstream logFile("xrt_report.txt"); - logFile.close(); - l_logFile = "xrt_report.txt"; - int l_numKernel = 1; if (argc == 4) { @@ -80,7 +74,7 @@ int main(int argc, char** argv) { xfblasEngine_t engineName = XFBLAS_ENGINE_GEMM; xfblasStatus_t status = XFBLAS_STATUS_SUCCESS; - status = 
xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; @@ -124,7 +118,7 @@ int main(int argc, char** argv) { status = xfblasGetMatrix(m, n, sizeof(*c), d_c, c, n, l_numKernel - 1); if (status != XFBLAS_STATUS_SUCCESS) { - cout << "Get Matirx failed with error code: " << status << "\n"; + cout << "Get Matrix failed with error code: " << status << "\n"; return EXIT_FAILURE; } diff --git a/L3/examples/gemm/gemm_example.cpp b/L3/examples/gemm/gemm_example.cpp index 695c4d8020..b054de0f55 100644 --- a/L3/examples/gemm/gemm_example.cpp +++ b/L3/examples/gemm/gemm_example.cpp @@ -77,10 +77,6 @@ int main(int argc, char** argv) { unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string l_configFile(argv[l_argIdx++]); - string l_logFile; - ofstream logFile("xrt_report.txt"); - logFile.close(); - l_logFile = "xrt_report.txt"; int l_numKernel = 1; if (argc == 4) { @@ -89,8 +85,7 @@ int main(int argc, char** argv) { } xfblasEngine_t engineName = XFBLAS_ENGINE_GEMM; - xfblasStatus_t status = - xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + xfblasStatus_t status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; @@ -165,7 +160,7 @@ int main(int argc, char** argv) { status = xfblasGetMatrixRestricted(c, l_numKernel - 1); if (status != XFBLAS_STATUS_SUCCESS) { - cout << "Get Matirx failed with error code: " << status << "\n"; + cout << "Get Matrix failed with error code: " << status << "\n"; xfblasDestroy(); return EXIT_FAILURE; } diff --git a/L3/examples/gemm/gemm_pre_allocated_example.cpp 
b/L3/examples/gemm/gemm_pre_allocated_example.cpp index 9fbb005116..7a3fb2fa22 100644 --- a/L3/examples/gemm/gemm_pre_allocated_example.cpp +++ b/L3/examples/gemm/gemm_pre_allocated_example.cpp @@ -32,11 +32,6 @@ int main(int argc, char** argv) { unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string l_configFile(argv[l_argIdx++]); - string l_logFile; - - ofstream logFile("xrt_report.txt"); - logFile.close(); - l_logFile = "xrt_report.txt"; int i, j; // i-row index ,j- column index @@ -49,7 +44,7 @@ int main(int argc, char** argv) { xfblasEngine_t engineName = XFBLAS_ENGINE_GEMM; xfblasStatus_t status = XFBLAS_STATUS_SUCCESS; - status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName); + status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; diff --git a/L3/examples/gemv/Makefile b/L3/examples/gemv/Makefile index 5337ece761..0b9b16f20e 100644 --- a/L3/examples/gemv/Makefile +++ b/L3/examples/gemv/Makefile @@ -120,18 +120,8 @@ $(EXE_FILE): $(SRCS) | check_xrt .PHONY: host host: check_xrt $(EXE_FILE) -short: gemv_example.exe - -float: gemv_example_float.exe - common: gemv_common_example.exe -gemv_example.exe: gemv_example.cpp - $(CC) -D XFBLAS_dataType=short -o $@ $^ $(CXXFLAGS) $(LDFLAGS) - -gemv_example_float.exe: gemv_example.cpp - $(CC) -D XFBLAS_dataType=float -o $@ $^ $(CXXFLAGS) $(LDFLAGS) - gemv_common_example.exe: gemv_common_example.cpp $(CC) -D XFBLAS_dataType=$(XFBLAS_dataType) -o $@ $^ $(CXXFLAGS) $(LDFLAGS) diff --git a/L3/examples/gemv/gemv_common_example.cpp b/L3/examples/gemv/gemv_common_example.cpp index f9e0187e60..f75082944c 100644 --- a/L3/examples/gemv/gemv_common_example.cpp +++ b/L3/examples/gemv/gemv_common_example.cpp @@ -32,11 +32,6 @@ int main(int argc, char** argv) { unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string 
l_configFile(argv[l_argIdx++]); - string l_logFile; - - ofstream logFile("xrt_report.txt"); - logFile.close(); - l_logFile = "xrt_report.txt"; int l_numKernel = 1; @@ -76,7 +71,7 @@ int main(int argc, char** argv) { xfblasEngine_t engineName = XFBLAS_ENGINE_GEMV; xfblasStatus_t status = XFBLAS_STATUS_SUCCESS; - status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; @@ -120,7 +115,7 @@ int main(int argc, char** argv) { status = xfblasGetVector(m, sizeof(*y), d_y, y, 1, l_numKernel - 1); if (status != XFBLAS_STATUS_SUCCESS) { - cout << "Get Matirx failed with error code: " << status << "\n"; + cout << "Get Matrix failed with error code: " << status << "\n"; return EXIT_FAILURE; } diff --git a/L3/examples/gemv/gemv_example.cpp b/L3/examples/gemv/gemv_example.cpp index 17c55b27f3..7554e39930 100644 --- a/L3/examples/gemv/gemv_example.cpp +++ b/L3/examples/gemv/gemv_example.cpp @@ -71,10 +71,6 @@ int main(int argc, char** argv) { unsigned int l_argIdx = 1; string l_xclbinFile(argv[l_argIdx++]); string l_configFile(argv[l_argIdx++]); - string l_logFile; - ofstream logFile("xrt_report.txt"); - logFile.close(); - l_logFile = "xrt_report.txt"; int l_numKernel = 1; if (argc == 4) { @@ -83,8 +79,7 @@ int main(int argc, char** argv) { } xfblasEngine_t engineName = XFBLAS_ENGINE_GEMV; - xfblasStatus_t status = - xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + xfblasStatus_t status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; @@ -156,7 +151,7 @@ int main(int argc, char** argv) { status = xfblasGetVectorRestricted(y, 
l_numKernel - 1); if (status != XFBLAS_STATUS_SUCCESS) { - cout << "Get Matirx failed with error code: " << status << "\n"; + cout << "Get Matrix failed with error code: " << status << "\n"; xfblasDestroy(); return EXIT_FAILURE; } diff --git a/L3/include/sw/xf_blas/api.hpp b/L3/include/sw/xf_blas/api.hpp new file mode 100644 index 0000000000..f1996f7bfb --- /dev/null +++ b/L3/include/sw/xf_blas/api.hpp @@ -0,0 +1,95 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef XF_BLAS_API_HPP +#define XF_BLAS_API_HPP + +extern "C" { + +bool xfblasCreate(char* xclbin, char* engineName, unsigned int kernelNumber, unsigned int deviceIndex); +bool xfblasSend(void* A, unsigned long long numElem, int elemSize, unsigned int kernelIndex, unsigned int deviceIndex); +bool xfblasGet(void* A, unsigned int kernelIndex, unsigned int deviceIndex); +void xfblasFreeInstr(unsigned int kernelIndex, unsigned int deviceIndex); +void xfblasDestroy(unsigned int kernelNumber, unsigned int deviceIndex); +void xfblasFree(void* A, unsigned int kernelIndex, unsigned int deviceIndex); +bool xfblasGemm(int m, + int n, + int k, + int alpha, + void* A, + int lda, + void* B, + int ldb, + int beta, + void* C, + int ldc, + unsigned int kernelIndex, + unsigned int deviceIndex); +bool xfblasGemv(int m, + int n, + int alpha, + void* A, + int lda, + void* x, + int incx, + int beta, + void* y, + int incy, + unsigned int kernelIndex, + unsigned int deviceIndex); +bool xfblasFcn(int m, + int n, + int k, + int alpha, + void* A, + int lda, + void* B, + int ldb, + int beta, + void* C, + int ldc, + void* X, + int ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha, + unsigned int kernelIndex, + unsigned int deviceIndex); +bool xfblasFcnByAddress(unsigned int l_aOff, + unsigned int l_bOff, + unsigned int l_cOff, + unsigned int l_xOff, + unsigned int p_m, + unsigned int p_n, + unsigned int p_k, + unsigned int p_lda, + unsigned int p_ldb, + unsigned int p_ldc, + unsigned int p_ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha, + unsigned int kernelIndex, + unsigned int deviceIndex); +bool xfblasGetByAddress( + void* A, unsigned long long p_bufSize, unsigned int offset, unsigned int kernelIndex, unsigned int deviceIndex); +void xfblasExecuteAsync(unsigned int numkernels, unsigned int deviceIndex); +bool xfblasExecute(unsigned int kernelIndex, unsigned int deviceIndex); +} + +#endif \ No newline at end 
of file diff --git a/L3/include/sw/xf_blas/gemm_host.hpp b/L3/include/sw/xf_blas/gemm_host.hpp index ad3593e528..a7460fcd70 100644 --- a/L3/include/sw/xf_blas/gemm_host.hpp +++ b/L3/include/sw/xf_blas/gemm_host.hpp @@ -62,12 +62,8 @@ class GEMMHost : public BLASHost { GEMMHost() = delete; virtual ~GEMMHost() {} GEMMHost(const GEMMHost&) = delete; - GEMMHost(const char* p_xclbin, - const char* p_logFile, - xfblasStatus_t* p_status, - unsigned int p_kernelIndex, - unsigned int p_deviceIndex) - : BLASHost(p_xclbin, p_logFile, p_status, p_kernelIndex, p_deviceIndex) {} + GEMMHost(const char* p_xclbin, xfblasStatus_t* p_status, unsigned int p_kernelIndex, unsigned int p_deviceIndex) + : BLASHost(p_xclbin, p_status, p_kernelIndex, p_deviceIndex) {} virtual xfblasStatus_t addGEMMOp(void* p_a, void* p_b, diff --git a/L3/include/sw/xf_blas/gemv_host.hpp b/L3/include/sw/xf_blas/gemv_host.hpp index 15d8c1ef08..ccd05f4aef 100644 --- a/L3/include/sw/xf_blas/gemv_host.hpp +++ b/L3/include/sw/xf_blas/gemv_host.hpp @@ -51,12 +51,8 @@ class GEMVHost : public BLASHost { GEMVHost() = delete; virtual ~GEMVHost() {} GEMVHost(const GEMVHost&) = delete; - GEMVHost(const char* p_xclbin, - const char* p_logFile, - xfblasStatus_t* p_status, - unsigned int p_kernelIndex, - unsigned int p_deviceIndex) - : BLASHost(p_xclbin, p_logFile, p_status, p_kernelIndex, p_deviceIndex) {} + GEMVHost(const char* p_xclbin, xfblasStatus_t* p_status, unsigned int p_kernelIndex, unsigned int p_deviceIndex) + : BLASHost(p_xclbin, p_status, p_kernelIndex, p_deviceIndex) {} virtual xfblasStatus_t addGEMVOp( void* p_a, void* p_b, void* p_c, unsigned int p_m, unsigned int p_n, unsigned int p_lda) { diff --git a/L3/include/sw/xf_blas/helpers/funcs/fcn_host.hpp b/L3/include/sw/xf_blas/helpers/funcs/fcn_host.hpp new file mode 100644 index 0000000000..280f31410a --- /dev/null +++ b/L3/include/sw/xf_blas/helpers/funcs/fcn_host.hpp @@ -0,0 +1,161 @@ +/* + * Copyright 2019 Xilinx, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef XF_BLAS_FCN_HOST_HPP +#define XF_BLAS_FCN_HOST_HPP + +#include "handle.hpp" +#include "host.hpp" + +namespace xf { + +namespace blas { + +class FcnArgs : public BLASArgs { + public: + virtual ~FcnArgs() {} + FcnArgs() = delete; + FcnArgs(unsigned int p_aOffset, + unsigned int p_bOffset, + unsigned int p_cOffset, + unsigned int p_xOffset, + unsigned int p_m, + unsigned int p_k, + unsigned int p_n, + unsigned int p_lda, + unsigned int p_ldb, + unsigned int p_ldc, + unsigned int p_ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha) + : m_fcn_args({OpFcn, p_aOffset, p_bOffset, p_cOffset, p_xOffset, p_m, p_k, p_n, p_lda, p_ldb, p_ldc, p_ldx, 0, + 0, 0, 0}) { + m_fcn_args.m_postScaleVal = (p_postScale << 8) | (p_postShift & 0x000000ff); + m_fcn_args.m_PReLUVal = (p_preluScale << 6) | (p_preluAlpha & 0x003f); + } + size_t sizeInBytes() { return sizeof(m_fcn_args); } + char* asByteArray() { return reinterpret_cast(&m_fcn_args); } + + protected: + struct { + int m_optype; + unsigned int m_aOffset, m_bOffset, m_cOffset, m_xOffset, m_m, m_k, m_n, m_lda, m_ldb, m_ldc, m_ldx; + int m_postScaleVal; + short m_PReLUVal; + short m_shortEmpty; + int m_empty[2]; + } m_fcn_args; +}; + +class FCNHost : public BLASHost { + public: + FCNHost() = delete; + virtual ~FCNHost() {} + FCNHost(const FCNHost&) = delete; + FCNHost(const char* p_xclbin, xfblasStatus_t* p_status, unsigned int 
p_kernelIndex, unsigned int p_deviceIndex) + : BLASHost(p_xclbin, p_status, p_kernelIndex, p_deviceIndex) {} + + virtual xfblasStatus_t addFCNOp(void* p_a, + void* p_b, + void* p_c, + void* p_bias, + unsigned int p_m, + unsigned int p_n, + unsigned int p_k, + unsigned int p_lda, + unsigned int p_ldb, + unsigned int p_ldc, + unsigned int p_ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha) { + if (this->m_bufHandle.find(p_a) == this->m_bufHandle.end() || + this->m_bufHandle.find(p_b) == this->m_bufHandle.end() || + this->m_bufHandle.find(p_c) == this->m_bufHandle.end() || + this->m_bufHandle.find(p_bias) == this->m_bufHandle.end()) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + + unsigned long long handle_A, handle_B, handle_C, handle_bias; + auto& l_devPtr = this->m_bufHandle; + + handle_A = l_devPtr[p_a]; + handle_B = l_devPtr[p_b]; + handle_C = l_devPtr[p_c]; + handle_bias = l_devPtr[p_bias]; + + xclBOProperties p; + uint64_t address_A = !xclGetBOProperties(this->m_fpga->m_handle, handle_A, &p) ? p.paddr : -1; + uint64_t address_B = !xclGetBOProperties(this->m_fpga->m_handle, handle_B, &p) ? p.paddr : -1; + uint64_t address_C = !xclGetBOProperties(this->m_fpga->m_handle, handle_C, &p) ? p.paddr : -1; + uint64_t address_bias = !xclGetBOProperties(this->m_fpga->m_handle, handle_bias, &p) ? 
p.paddr : -1; + + unsigned long long l_aOff, l_bOff, l_cOff, l_xOff; + l_aOff = (unsigned long long)address_A; + l_bOff = (unsigned long long)address_B; + l_cOff = (unsigned long long)address_C; + l_xOff = (unsigned long long)address_bias; + + l_aOff -= this->m_fpga->m_baseAddress[this->m_cuIndex]; + l_bOff -= this->m_fpga->m_baseAddress[this->m_cuIndex]; + l_cOff -= this->m_fpga->m_baseAddress[this->m_cuIndex]; + l_xOff -= this->m_fpga->m_baseAddress[this->m_cuIndex]; + + l_aOff /= this->PAGE_SIZE; + l_bOff /= this->PAGE_SIZE; + l_cOff /= this->PAGE_SIZE; + l_xOff /= this->PAGE_SIZE; + + FcnArgs args(l_aOff, l_bOff, l_cOff, l_xOff, p_m, p_k, p_n, p_lda, p_ldb, p_ldc, p_ldx, p_postScale, + p_postShift, p_preluScale, p_preluAlpha); + this->addInstr(&args); + this->enableRun(); + + return XFBLAS_STATUS_SUCCESS; + } + + virtual xfblasStatus_t addFCNOpByAddress(unsigned int l_aOff, + unsigned int l_bOff, + unsigned int l_cOff, + unsigned int l_xOff, + unsigned int p_m, + unsigned int p_n, + unsigned int p_k, + unsigned int p_lda, + unsigned int p_ldb, + unsigned int p_ldc, + unsigned int p_ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha) { + FcnArgs args(l_aOff, l_bOff, l_cOff, l_xOff, p_m, p_k, p_n, p_lda, p_ldb, p_ldc, p_ldx, p_postScale, + p_postShift, p_preluScale, p_preluAlpha); + this->addInstr(&args); + this->enableRun(); + + return XFBLAS_STATUS_SUCCESS; + } +}; + +} // namespace blas + +} // namespace xf + +#endif diff --git a/L3/include/sw/xf_blas/host.hpp b/L3/include/sw/xf_blas/host.hpp index e4bb436546..a3bea3ccef 100644 --- a/L3/include/sw/xf_blas/host.hpp +++ b/L3/include/sw/xf_blas/host.hpp @@ -1,424 +1,440 @@ -/* - * Copyright 2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef XF_BLAS_HOST_HPP -#define XF_BLAS_HOST_HPP - -#include -#include -#include -#include -#include -#include - -#include "ert.h" -#include "xclhal2.h" -#include "xclbin.h" - -#include "../utility/utility.hpp" -#include "helper.hpp" -#include "gemxkernel_hw.hpp" - -#define IDX2R(i, j, ld) (((i) * (ld)) + (j)) - -using namespace std; - -namespace xf { - -namespace blas { - -class XFpga { - public: - xclDeviceHandle m_handle; - uuid_t m_xclbinId; - vector m_mem; - vector m_baseAddress; - vector m_execHandles; - bool m_init = false; - - XFpga() = delete; - XFpga(const char* p_xclbin, const char* p_logFile, int* p_err, unsigned int deviceIndex = 0) { - if (deviceIndex >= xclProbe()) { - *p_err = 1; - return; - } - m_handle = xclOpen(deviceIndex, p_logFile, XCL_INFO); - if (xclLockDevice(m_handle)) { - *p_err = 1; - return; - } - ifstream l_stream(p_xclbin); - l_stream.seekg(0, l_stream.end); - int l_size = l_stream.tellg(); - l_stream.seekg(0, l_stream.beg); - - char* l_header = new char[l_size]; - l_stream.read(l_header, l_size); - - const xclBin* l_blob = (const xclBin*)l_header; - if (xclLoadXclBin(m_handle, l_blob)) { - *p_err = 1; - return; - } - // cout << "Finished downloading bitstream " << p_xclbin << "\n"; - const axlf* l_top = (const axlf*)l_header; - - auto l_topo = xclbin::get_axlf_section(l_top, MEM_TOPOLOGY); - struct mem_topology* l_topology = (mem_topology*)(l_header + l_topo->m_sectionOffset); - - for (int i = 0; i < l_topology->m_count; ++i) { - if (l_topology->m_mem_data[i].m_used) { - 
m_baseAddress.push_back(l_topology->m_mem_data[i].m_base_address); - int l_mem = i; - m_mem.push_back(l_mem); - } - } - - uuid_copy(m_xclbinId, l_top->m_header.uuid); - delete[] l_header; - } - - ~XFpga() {} - - bool openContext(unsigned int p_cuIndex) { - if (xclOpenContext(m_handle, m_xclbinId, p_cuIndex, true)) { - return false; - } - return true; - } - - unsigned int createBuf(void* p_ptr, size_t p_szBytes, unsigned int p_kernelIndex) { - return xclAllocUserPtrBO(m_handle, p_ptr, p_szBytes, m_mem[p_kernelIndex]); - } - - bool copyToFpga(unsigned int p_bufHandle, size_t p_szBytes) { - if (xclSyncBO(m_handle, p_bufHandle, XCL_BO_SYNC_BO_TO_DEVICE, p_szBytes, 0)) { - return false; - } - return true; - } - - bool copyFromFpga(unsigned int p_bufHandle, size_t p_szBytes) { - if (xclSyncBO(m_handle, p_bufHandle, XCL_BO_SYNC_BO_FROM_DEVICE, p_szBytes, 0)) { - return false; - } - return true; - } - - bool execKernel(unsigned int p_kernelIndex) { - unsigned int m_execHandle = xclAllocBO(m_handle, 4096 + 4096, xclBOKind(0), (1 << 31)); - void* execData = xclMapBO(m_handle, m_execHandle, true); - auto ecmd = reinterpret_cast(execData); - auto rsz = XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRWR_M_VAL_DATA / 4 + 2; // regmap array size - memset(ecmd, 0, (sizeof *ecmd) + rsz); - ecmd->state = ERT_CMD_STATE_NEW; - ecmd->opcode = ERT_START_CU; - ecmd->count = 1 + rsz; - ecmd->cu_mask = 0x1 << p_kernelIndex; - ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_AP_CTRL] = 0x0; // ap_start - ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRRD_M_VAL_DATA / 4] = m_baseAddress[p_kernelIndex]; - ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRWR_M_VAL_DATA / 4] = m_baseAddress[p_kernelIndex]; - ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRRD_M_VAL_DATA / 4 + 1] = - m_baseAddress[p_kernelIndex] >> 32; - ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRWR_M_VAL_DATA / 4 + 1] = - m_baseAddress[p_kernelIndex] >> 32; - - if (xclExecBuf(m_handle, 
m_execHandle)) { - return false; - } - - while (xclExecWait(m_handle, 1) == 0) - ; - - m_execHandles.push_back(m_execHandle); - - return true; - } -}; - -class XFpgaHold { - public: - unordered_map > m_xFpgaPtr; - static XFpgaHold& instance() { - static XFpgaHold theInstance; - return theInstance; - } - - protected: - XFpgaHold() {} -}; - -class XHost { - protected: - static const unsigned int PAGE_SIZE = 4096; - static const unsigned int INSTR_BUF_SIZE = PAGE_SIZE; - static const unsigned int KERN_DBG_BUF_SIZE = PAGE_SIZE; - unordered_map m_hostMat; - unordered_map m_bufHandle; - unordered_map m_hostMatSz; - // shared_ptr m_fpga = XFpgaHold::instance().m_xFpgaPtr; - shared_ptr m_fpga; - vector m_ddrDeviceBaseAddr; - char* m_progBuf; - char* m_instrBuf; - unsigned int m_instrOffset; - unsigned int m_instrBufHandle; - unsigned int m_cuIndex; - - public: - XHost() = delete; - XHost(const char* p_xclbin, - const char* p_logFile, - xfblasStatus_t* p_status, - unsigned int p_kernelIndex, - unsigned int p_deviceIndex) { - m_fpga = XFpgaHold::instance().m_xFpgaPtr[p_deviceIndex]; - m_cuIndex = p_kernelIndex; - if (!m_fpga->openContext(m_cuIndex)) { - *p_status = XFBLAS_STATUS_NOT_INITIALIZED; - return; - } - void* l_alignedMem = nullptr; - int l_memAllocStatus = posix_memalign(&l_alignedMem, PAGE_SIZE, INSTR_BUF_SIZE); - if (l_memAllocStatus) { - *p_status = XFBLAS_STATUS_ALLOC_FAILED; - } - m_instrBuf = (char*)l_alignedMem; - m_progBuf = (char*)l_alignedMem; - memset(m_instrBuf, 0, INSTR_BUF_SIZE); - m_instrOffset = 0; - m_instrBufHandle = m_fpga->createBuf(m_instrBuf, INSTR_BUF_SIZE + KERN_DBG_BUF_SIZE, m_cuIndex); - } - - bool addMatRestricted(void* p_hostHandle, void* p_matPtr, unsigned long long p_bufSize) { - auto& l_hostPtr = m_hostMat; - auto& l_hostSzPtr = m_hostMatSz; - if (((unsigned long)p_matPtr & (PAGE_SIZE - 1)) != 0) { - void* l_matPtr; - posix_memalign((void**)&l_matPtr, 4096, p_bufSize); - memcpy(l_matPtr, p_matPtr, p_bufSize); - if 
(l_hostPtr.find(p_hostHandle) == l_hostPtr.end()) { - l_hostPtr[p_hostHandle] = l_matPtr; - l_hostSzPtr[p_hostHandle] = p_bufSize; - return true; - } - } else { - if (l_hostPtr.find(p_hostHandle) == l_hostPtr.end()) { - l_hostPtr[p_hostHandle] = p_matPtr; - l_hostSzPtr[p_hostHandle] = p_bufSize; - return true; - } - } - return false; - } - - xfblasStatus_t allocMatRestricted(void* p_hostHandle, void* p_matPtr, unsigned long long p_bufSize) { - if (!addMatRestricted(p_hostHandle, p_matPtr, p_bufSize)) { - return XFBLAS_STATUS_ALLOC_FAILED; - } - auto& l_hostPtr = m_hostMat; - auto& l_devPtr = m_bufHandle; - auto& l_hostSzPtr = m_hostMatSz; - if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { - return XFBLAS_STATUS_ALLOC_FAILED; - } else { - l_devPtr[p_hostHandle] = m_fpga->createBuf(l_hostPtr[p_hostHandle], l_hostSzPtr[p_hostHandle], m_cuIndex); - return XFBLAS_STATUS_SUCCESS; - } - } - - template - xfblasStatus_t allocMat(t_dataType* p_devPtr, size_t p_bufSize) { - auto& l_devPtr = m_bufHandle; - auto& l_hostSzPtr = m_hostMatSz; - if (l_devPtr.find(*p_devPtr) != l_devPtr.end()) { - return XFBLAS_STATUS_ALLOC_FAILED; - } else { - unsigned int l_deviceHandle = - xclAllocBO(m_fpga->m_handle, p_bufSize, XCL_BO_DEVICE_RAM, m_fpga->m_mem[m_cuIndex]); - *p_devPtr = (t_dataType)xclMapBO(m_fpga->m_handle, l_deviceHandle, true); - memset(*p_devPtr, 0, p_bufSize); - l_hostSzPtr[*p_devPtr] = p_bufSize; - l_devPtr[*p_devPtr] = l_deviceHandle; - return XFBLAS_STATUS_SUCCESS; - } - } - - template - xfblasStatus_t setMatToFPGA( - void* p_hostHandle, int p_rows, int p_lda, int p_paddedLda, t_dataType& p_hostPtr, t_dataType& p_devPtr) { - auto& l_devPtr = m_bufHandle; - auto& l_hostSzPtr = m_hostMatSz; - if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { - for (int i = 0; i < p_rows; i++) { - for (int j = 0; j < p_lda; j++) { - p_devPtr[IDX2R(i, j, p_paddedLda)] = p_hostPtr[IDX2R(i, j, p_lda)]; - } - } - if (!m_fpga->copyToFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) 
{ - return XFBLAS_STATUS_ALLOC_FAILED; - } - } else { - return XFBLAS_STATUS_ALLOC_FAILED; - } - return XFBLAS_STATUS_SUCCESS; - } - - xfblasStatus_t setMatToFPGARestricted(void* p_hostHandle) { - auto& l_devPtr = m_bufHandle; - auto& l_hostSzPtr = m_hostMatSz; - if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { - if (!m_fpga->copyToFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { - return XFBLAS_STATUS_ALLOC_FAILED; - } - } else { - return XFBLAS_STATUS_ALLOC_FAILED; - } - return XFBLAS_STATUS_SUCCESS; - } - - void addInstr(BLASArgs* p_args) { - char* l_instr = p_args->asByteArray(); - char* l_currPos = &m_progBuf[m_instrOffset]; - memcpy(l_currPos, l_instr, p_args->sizeInBytes()); - m_instrOffset += p_args->sizeInBytes(); - } - - template - xfblasStatus_t getMat( - void* p_hostHandle, int p_rows, int p_lda, int p_paddedLda, t_dataType& p_hostPtr, t_dataType& p_devPtr) { - auto& l_hostSzPtr = m_hostMatSz; - auto& l_devPtr = m_bufHandle; - if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { - if (!m_fpga->copyFromFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { - return XFBLAS_STATUS_ALLOC_FAILED; - } - for (int i = 0; i < p_rows; i++) { - for (int j = 0; j < p_lda; j++) { - p_hostPtr[IDX2R(i, j, p_lda)] = p_devPtr[IDX2R(i, j, p_paddedLda)]; - } - } - } else { - return XFBLAS_STATUS_ALLOC_FAILED; - } - return XFBLAS_STATUS_SUCCESS; - } - - xfblasStatus_t deviceSync() { - for (auto& l_devPtr : m_bufHandle) { - if (!m_fpga->copyToFpga(l_devPtr.second, m_hostMatSz[l_devPtr.first])) { - return XFBLAS_STATUS_ALLOC_FAILED; - } - } - return XFBLAS_STATUS_SUCCESS; - } - - xfblasStatus_t getMatManaged() { - for (auto& l_devPtr : m_bufHandle) { - if (!m_fpga->copyFromFpga(l_devPtr.second, m_hostMatSz[l_devPtr.first])) { - return XFBLAS_STATUS_ALLOC_FAILED; - } - } - return XFBLAS_STATUS_SUCCESS; - } - - xfblasStatus_t getMatRestricted(void* p_hostHandle, void* p_matPtr) { - auto& l_hostPtr = m_hostMat; - auto& l_hostSzPtr = m_hostMatSz; - auto& 
l_devPtr = m_bufHandle; - if (l_hostPtr.find(p_hostHandle) != l_hostPtr.end()) { - if (!m_fpga->copyFromFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { - return XFBLAS_STATUS_ALLOC_FAILED; - } - if (((unsigned long)p_matPtr & (PAGE_SIZE - 1)) != 0) { - memcpy(p_matPtr, l_hostPtr[p_hostHandle], l_hostSzPtr[p_hostHandle]); - } - } else { - return XFBLAS_STATUS_ALLOC_FAILED; - } - return XFBLAS_STATUS_SUCCESS; - } - - void clearInstrBuf() { - memset(this->m_progBuf, 0, PAGE_SIZE); - this->m_instrOffset = 0; - } - - xfblasStatus_t freeMat(void* p_hostHandle) { - auto& l_devPtr = m_bufHandle; - if (l_devPtr.find(p_hostHandle) == l_devPtr.end()) { - return XFBLAS_STATUS_ALLOC_FAILED; - } else { - xclFreeBO(m_fpga->m_handle, l_devPtr[p_hostHandle]); - this->m_bufHandle.erase(p_hostHandle); - this->m_hostMatSz.erase(p_hostHandle); - if (!m_hostMat.empty()) { - this->m_hostMat.erase(p_hostHandle); - } - return XFBLAS_STATUS_SUCCESS; - } - } - - xfblasStatus_t closeContext(unsigned int p_kernelIndex) { - free(m_progBuf); - xclFreeBO(m_fpga->m_handle, m_instrBufHandle); - if (p_kernelIndex < (unsigned int)m_fpga->m_execHandles.size()) { - xclFreeBO(m_fpga->m_handle, m_fpga->m_execHandles[p_kernelIndex]); - } - xclCloseContext(m_fpga->m_handle, m_fpga->m_xclbinId, this->m_cuIndex); - return XFBLAS_STATUS_SUCCESS; - } - void closeDevice() { xclClose(m_fpga->m_handle); } -}; - -class BLASHost : public XHost { - private: - bool m_execControl = true; - - public: - BLASHost() = delete; - virtual ~BLASHost() {} - BLASHost(const BLASHost&) = delete; - - BLASHost(const char* p_xclbin, - const char* p_logFile, - xfblasStatus_t* p_status, - unsigned int p_kernelIndex, - unsigned int p_deviceIndex) - : XHost(p_xclbin, p_logFile, p_status, p_kernelIndex, p_deviceIndex) {} - - xfblasStatus_t execute() { - xfblasStatus_t l_status = XFBLAS_STATUS_SUCCESS; - if (m_execControl) { - if (!this->m_fpga->copyToFpga(this->m_instrBufHandle, this->INSTR_BUF_SIZE + this->KERN_DBG_BUF_SIZE)) 
{ - l_status = XFBLAS_STATUS_ALLOC_FAILED; - } - if (!this->m_fpga->execKernel(this->m_cuIndex)) { - l_status = XFBLAS_STATUS_ALLOC_FAILED; - } - m_execControl = false; - } - return l_status; - } - - void enableRun() { m_execControl = true; } -}; - -} // namespace blas - -} // namespace xf - +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef XF_BLAS_HOST_HPP +#define XF_BLAS_HOST_HPP + +#include +#include +#include +#include +#include +#include + +#include "ert.h" +#include "xclhal2.h" +#include "xclbin.h" + +#include "../utility/utility.hpp" +#include "helper.hpp" +#include "gemxkernel_hw.hpp" + +#define IDX2R(i, j, ld) (((i) * (ld)) + (j)) + +using namespace std; + +namespace xf { + +namespace blas { + +class XFpga { + public: + xclDeviceHandle m_handle; + uuid_t m_xclbinId; + vector m_mem; + vector m_baseAddress; + + bool m_init = false; + + XFpga() = delete; + XFpga(const char* p_xclbin, int* p_err, unsigned int deviceIndex = 0) { + if (deviceIndex >= xclProbe()) { + *p_err = 1; + return; + } + m_handle = xclOpen(deviceIndex, NULL, XCL_INFO); + if (xclLockDevice(m_handle)) { + *p_err = 1; + return; + } + ifstream l_stream(p_xclbin); + l_stream.seekg(0, l_stream.end); + int l_size = l_stream.tellg(); + l_stream.seekg(0, l_stream.beg); + + char* l_header = new char[l_size]; + l_stream.read(l_header, l_size); + + const xclBin* l_blob = (const xclBin*)l_header; + if (xclLoadXclBin(m_handle, l_blob)) { + 
*p_err = 1; + return; + } + // cout << "Finished downloading bitstream " << p_xclbin << "\n"; + const axlf* l_top = (const axlf*)l_header; + + auto l_topo = xclbin::get_axlf_section(l_top, MEM_TOPOLOGY); + struct mem_topology* l_topology = (mem_topology*)(l_header + l_topo->m_sectionOffset); + + for (int i = 0; i < l_topology->m_count; ++i) { + if (l_topology->m_mem_data[i].m_used) { + m_baseAddress.push_back(l_topology->m_mem_data[i].m_base_address); + int l_mem = i; + m_mem.push_back(l_mem); + } + } + + uuid_copy(m_xclbinId, l_top->m_header.uuid); + delete[] l_header; + } + + ~XFpga() {} + + bool openContext(unsigned int p_cuIndex) { + if (xclOpenContext(m_handle, m_xclbinId, p_cuIndex, true)) { + return false; + } + return true; + } + + unsigned int createBuf(void* p_ptr, size_t p_szBytes, unsigned int p_kernelIndex) { + return xclAllocUserPtrBO(m_handle, p_ptr, p_szBytes, m_mem[p_kernelIndex]); + } + + bool copyToFpga(unsigned int p_bufHandle, size_t p_szBytes) { + if (xclSyncBO(m_handle, p_bufHandle, XCL_BO_SYNC_BO_TO_DEVICE, p_szBytes, 0)) { + return false; + } + return true; + } + + bool copyFromFpga(unsigned int p_bufHandle, size_t p_szBytes) { + if (xclSyncBO(m_handle, p_bufHandle, XCL_BO_SYNC_BO_FROM_DEVICE, p_szBytes, 0)) { + return false; + } + return true; + } + + unsigned int execKernel(unsigned int p_kernelIndex) { + unsigned int m_execHandle = xclAllocBO(m_handle, 4096 + 4096, xclBOKind(0), (1 << 31)); + void* execData = xclMapBO(m_handle, m_execHandle, true); + auto ecmd = reinterpret_cast(execData); + auto rsz = XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRWR_M_VAL_DATA / 4 + 2; // regmap array size + memset(ecmd, 0, (sizeof *ecmd) + rsz); + ecmd->state = ERT_CMD_STATE_NEW; + ecmd->opcode = ERT_START_CU; + ecmd->count = 1 + rsz; + ecmd->cu_mask = 0x1 << p_kernelIndex; + ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_AP_CTRL] = 0x0; // ap_start + ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRRD_M_VAL_DATA / 4] = 
m_baseAddress[p_kernelIndex]; + ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRWR_M_VAL_DATA / 4] = m_baseAddress[p_kernelIndex]; + ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRRD_M_VAL_DATA / 4 + 1] = + m_baseAddress[p_kernelIndex] >> 32; + ecmd->data[XGEMXKERNEL_0_GEMXKERNEL_0_CONTROL_ADDR_P_DDRWR_M_VAL_DATA / 4 + 1] = + m_baseAddress[p_kernelIndex] >> 32; + + if (xclExecBuf(m_handle, m_execHandle)) { + return 0; + } + //while (ecmd->state == 1){ + while (xclExecWait(m_handle, 1) == 0); + + //} + return m_execHandle; + } +}; + +class XFpgaHold { + public: + unordered_map > m_xFpgaPtr; + static XFpgaHold& instance() { + static XFpgaHold theInstance; + return theInstance; + } + + protected: + XFpgaHold() {} +}; + +class XHost { + protected: + static const unsigned int PAGE_SIZE = 4096; + static const unsigned int INSTR_BUF_SIZE = PAGE_SIZE; + static const unsigned int KERN_DBG_BUF_SIZE = PAGE_SIZE; + unordered_map m_hostMat; + unordered_map m_bufHandle; + unordered_map m_hostMatSz; + shared_ptr m_fpga; + vector m_ddrDeviceBaseAddr; + vector m_execHandles; + char* m_progBuf; + char* m_instrBuf; + unsigned int m_instrOffset; + unsigned int m_instrBufHandle; + unsigned int m_cuIndex; + + public: + XHost() = delete; + XHost(const char* p_xclbin, xfblasStatus_t* p_status, unsigned int p_kernelIndex, unsigned int p_deviceIndex) { + m_fpga = XFpgaHold::instance().m_xFpgaPtr[p_deviceIndex]; + m_cuIndex = p_kernelIndex; + if (!m_fpga->openContext(m_cuIndex)) { + *p_status = XFBLAS_STATUS_NOT_INITIALIZED; + return; + } + void* l_alignedMem = nullptr; + int l_memAllocStatus = posix_memalign(&l_alignedMem, PAGE_SIZE, INSTR_BUF_SIZE); + if (l_memAllocStatus) { + *p_status = XFBLAS_STATUS_ALLOC_FAILED; + } + m_instrBuf = (char*)l_alignedMem; + m_progBuf = (char*)l_alignedMem; + memset(m_instrBuf, 0, INSTR_BUF_SIZE); + m_instrOffset = 0; + m_instrBufHandle = m_fpga->createBuf(m_instrBuf, INSTR_BUF_SIZE + KERN_DBG_BUF_SIZE, m_cuIndex); + } + + bool 
addMatRestricted(void* p_hostHandle, void* p_matPtr, unsigned long long p_bufSize) { + auto& l_hostPtr = m_hostMat; + auto& l_hostSzPtr = m_hostMatSz; + if (((unsigned long)p_matPtr & (PAGE_SIZE - 1)) != 0) { + void* l_matPtr; + posix_memalign((void**)&l_matPtr, 4096, p_bufSize); + memcpy(l_matPtr, p_matPtr, p_bufSize); + if (l_hostPtr.find(p_hostHandle) == l_hostPtr.end()) { + l_hostPtr[p_hostHandle] = l_matPtr; + l_hostSzPtr[p_hostHandle] = p_bufSize; + return true; + } else if (m_hostMatSz[p_hostHandle] != p_bufSize) { + l_hostPtr[p_hostHandle] = l_matPtr; + l_hostSzPtr[p_hostHandle] = p_bufSize; + this->m_bufHandle.erase(p_hostHandle); + return true; + } + } else { + if (l_hostPtr.find(p_hostHandle) == l_hostPtr.end()) { + l_hostPtr[p_hostHandle] = p_matPtr; + l_hostSzPtr[p_hostHandle] = p_bufSize; + return true; + } else if (m_hostMatSz[p_hostHandle] != p_bufSize) { + l_hostPtr[p_hostHandle] = p_matPtr; + l_hostSzPtr[p_hostHandle] = p_bufSize; + this->m_bufHandle.erase(p_hostHandle); + return true; + } + } + return false; + } + + xfblasStatus_t allocMatRestricted(void* p_hostHandle, void* p_matPtr, unsigned long long p_bufSize) { + addMatRestricted(p_hostHandle, p_matPtr, p_bufSize); + auto& l_hostPtr = m_hostMat; + auto& l_devPtr = m_bufHandle; + auto& l_hostSzPtr = m_hostMatSz; + if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { + xclFreeBO(m_fpga->m_handle, l_devPtr[p_hostHandle]); + if (((unsigned long)p_matPtr & (PAGE_SIZE - 1)) != 0) { + void* l_matPtr; + posix_memalign((void**)&l_matPtr, 4096, p_bufSize); + memcpy(l_matPtr, p_matPtr, p_bufSize); + l_hostPtr[p_hostHandle] = l_matPtr; + l_devPtr[p_hostHandle] = m_fpga->createBuf(l_hostPtr[p_hostHandle], l_hostSzPtr[p_hostHandle], m_cuIndex); + } else { + l_hostPtr[p_hostHandle] = p_matPtr; + l_devPtr[p_hostHandle] = m_fpga->createBuf(l_hostPtr[p_hostHandle], l_hostSzPtr[p_hostHandle], m_cuIndex); + } + } else { + l_devPtr[p_hostHandle] = m_fpga->createBuf(l_hostPtr[p_hostHandle], 
l_hostSzPtr[p_hostHandle], m_cuIndex); + } + return XFBLAS_STATUS_SUCCESS; + } + + template + xfblasStatus_t allocMat(t_dataType* p_devPtr, size_t p_bufSize) { + auto& l_devPtr = m_bufHandle; + auto& l_hostSzPtr = m_hostMatSz; + if (l_devPtr.find(*p_devPtr) != l_devPtr.end()) { + return XFBLAS_STATUS_ALLOC_FAILED; + } else { + unsigned int l_deviceHandle = + xclAllocBO(m_fpga->m_handle, p_bufSize, XCL_BO_DEVICE_RAM, m_fpga->m_mem[m_cuIndex]); + *p_devPtr = (t_dataType)xclMapBO(m_fpga->m_handle, l_deviceHandle, true); + memset(*p_devPtr, 0, p_bufSize); + l_hostSzPtr[*p_devPtr] = p_bufSize; + l_devPtr[*p_devPtr] = l_deviceHandle; + return XFBLAS_STATUS_SUCCESS; + } + } + + template + xfblasStatus_t setMatToFPGA( + void* p_hostHandle, int p_rows, int p_lda, int p_paddedLda, t_dataType& p_hostPtr, t_dataType& p_devPtr) { + auto& l_devPtr = m_bufHandle; + auto& l_hostSzPtr = m_hostMatSz; + if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { + for (int i = 0; i < p_rows; i++) { + for (int j = 0; j < p_lda; j++) { + p_devPtr[IDX2R(i, j, p_paddedLda)] = p_hostPtr[IDX2R(i, j, p_lda)]; + } + } + if (!m_fpga->copyToFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + } else { + return XFBLAS_STATUS_ALLOC_FAILED; + } + return XFBLAS_STATUS_SUCCESS; + } + + xfblasStatus_t setMatToFPGARestricted(void* p_hostHandle) { + auto& l_devPtr = m_bufHandle; + auto& l_hostSzPtr = m_hostMatSz; + if (!m_fpga->copyToFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + + return XFBLAS_STATUS_SUCCESS; + } + + void addInstr(BLASArgs* p_args) { + char* l_instr = p_args->asByteArray(); + char* l_currPos = &m_progBuf[m_instrOffset]; + memcpy(l_currPos, l_instr, p_args->sizeInBytes()); + m_instrOffset += p_args->sizeInBytes(); + } + + template + xfblasStatus_t getMat( + void* p_hostHandle, int p_rows, int p_lda, int p_paddedLda, t_dataType& p_hostPtr, t_dataType& p_devPtr) { + auto& l_hostSzPtr = 
m_hostMatSz; + auto& l_devPtr = m_bufHandle; + if (l_devPtr.find(p_hostHandle) != l_devPtr.end()) { + if (!m_fpga->copyFromFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + for (int i = 0; i < p_rows; i++) { + for (int j = 0; j < p_lda; j++) { + p_hostPtr[IDX2R(i, j, p_lda)] = p_devPtr[IDX2R(i, j, p_paddedLda)]; + } + } + } else { + return XFBLAS_STATUS_ALLOC_FAILED; + } + return XFBLAS_STATUS_SUCCESS; + } + + xfblasStatus_t deviceSync() { + for (auto& l_devPtr : m_bufHandle) { + if (!m_fpga->copyToFpga(l_devPtr.second, m_hostMatSz[l_devPtr.first])) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + } + return XFBLAS_STATUS_SUCCESS; + } + + xfblasStatus_t getMatManaged() { + for (auto& l_devPtr : m_bufHandle) { + if (!m_fpga->copyFromFpga(l_devPtr.second, m_hostMatSz[l_devPtr.first])) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + } + return XFBLAS_STATUS_SUCCESS; + } + + xfblasStatus_t getMatRestricted(void* p_hostHandle, void* p_matPtr) { + auto& l_hostPtr = m_hostMat; + auto& l_hostSzPtr = m_hostMatSz; + auto& l_devPtr = m_bufHandle; + if (l_hostPtr.find(p_hostHandle) != l_hostPtr.end()) { + if (!m_fpga->copyFromFpga(l_devPtr[p_hostHandle], l_hostSzPtr[p_hostHandle])) { + return XFBLAS_STATUS_ALLOC_FAILED; + } + if (((unsigned long)p_matPtr & (PAGE_SIZE - 1)) != 0) { + memcpy(p_matPtr, l_hostPtr[p_hostHandle], l_hostSzPtr[p_hostHandle]); + } + } else { + return XFBLAS_STATUS_ALLOC_FAILED; + } + return XFBLAS_STATUS_SUCCESS; + } + + xfblasStatus_t getMatByAddress(void* p_matPtr, unsigned long long p_bufSize, unsigned int offset) { + uint64_t l_address = offset*PAGE_SIZE+m_fpga->m_baseAddress[m_cuIndex]; + if (xclUnmgdPread(m_fpga->m_handle,0,p_matPtr,p_bufSize,l_address) < 0){ + return XFBLAS_STATUS_ALLOC_FAILED; + } + return XFBLAS_STATUS_SUCCESS; + } + + void clearInstrBuf() { + memset(this->m_progBuf, 0, PAGE_SIZE); + this->m_instrOffset = 0; + } + + xfblasStatus_t freeMat(void* p_hostHandle) { + auto& l_devPtr = 
m_bufHandle; + if (l_devPtr.find(p_hostHandle) == l_devPtr.end()) { + return XFBLAS_STATUS_ALLOC_FAILED; + } else { + xclFreeBO(m_fpga->m_handle, l_devPtr[p_hostHandle]); + this->m_bufHandle.erase(p_hostHandle); + this->m_hostMatSz.erase(p_hostHandle); + if (!m_hostMat.empty()) { + this->m_hostMat.erase(p_hostHandle); + } + return XFBLAS_STATUS_SUCCESS; + } + } + + xfblasStatus_t closeContext(unsigned int p_kernelIndex) { + free(m_progBuf); + xclFreeBO(m_fpga->m_handle, m_instrBufHandle); + for (unsigned int i=0;im_handle, m_execHandles[i]); + } + xclCloseContext(m_fpga->m_handle, m_fpga->m_xclbinId, this->m_cuIndex); + return XFBLAS_STATUS_SUCCESS; + } + void closeDevice() { xclClose(m_fpga->m_handle); } +}; + +class BLASHost : public XHost { + private: + bool m_execControl = true; + + public: + BLASHost() = delete; + virtual ~BLASHost() {} + BLASHost(const BLASHost&) = delete; + + BLASHost(const char* p_xclbin, xfblasStatus_t* p_status, unsigned int p_kernelIndex, unsigned int p_deviceIndex) + : XHost(p_xclbin, p_status, p_kernelIndex, p_deviceIndex) {} + + xfblasStatus_t execute() { + xfblasStatus_t l_status = XFBLAS_STATUS_SUCCESS; + if (m_execControl) { + if (!this->m_fpga->copyToFpga(this->m_instrBufHandle, this->INSTR_BUF_SIZE + this->KERN_DBG_BUF_SIZE)) { + l_status = XFBLAS_STATUS_ALLOC_FAILED; + } + unsigned int m_execHandle = this->m_fpga->execKernel(this->m_cuIndex); + if (!m_execHandle) { + l_status = XFBLAS_STATUS_ALLOC_FAILED; + } else { + this-> m_execHandles.push_back(m_execHandle); + } + m_execControl = false; + } + return l_status; + } + + void enableRun() { m_execControl = true; } +}; + +} // namespace blas + +} // namespace vitis + #endif \ No newline at end of file diff --git a/L3/include/sw/xf_blas/wrapper.hpp b/L3/include/sw/xf_blas/wrapper.hpp index 222abef2db..c621c140c7 100644 --- a/L3/include/sw/xf_blas/wrapper.hpp +++ b/L3/include/sw/xf_blas/wrapper.hpp @@ -30,7 +30,6 @@ namespace blas { * prior to any other XFBLAS library calls. 
* @param xclbin file path to FPGA bitstream * @param configFile file path to config_info.dat file - * @param logFile file path to log file * @param engineName XFBLAS engine to run * @param kernelNumber number of kernels that is being used, default is 1 * @param deviceIndex index of device that is being used, default is 0 @@ -41,7 +40,6 @@ namespace blas { */ xfblasStatus_t xfblasCreate(const char* xclbin, string configFile, - const char* logFile, xfblasEngine_t engineName, unsigned int kernelNumber = 1, unsigned int deviceIndex = 0) { @@ -52,9 +50,7 @@ xfblasStatus_t xfblasCreate(const char* xclbin, int l_err = 0; - // XFpgaHold::instance().m_xFpgaPtr = shared_ptr(new XFpga(xclbin, logFile, &l_err)); - - shared_ptr l_xFpga(new XFpga(xclbin, logFile, &l_err, deviceIndex)); + shared_ptr l_xFpga(new XFpga(xclbin, &l_err, deviceIndex)); XFpgaHold::instance().m_xFpgaPtr[deviceIndex] = l_xFpga; if (l_err != 0) { @@ -68,22 +64,17 @@ xfblasStatus_t xfblasCreate(const char* xclbin, for (unsigned int i = 0; i < kernelNumber; i++) { BLASHostHandle::instance().m_handlePtr[deviceIndex].push_back( - shared_ptr(new GEMMHost(xclbin, logFile, &l_status, i, deviceIndex))); + shared_ptr(new GEMMHost(xclbin, &l_status, i, deviceIndex))); } return l_status; } else if (engineName == XFBLAS_ENGINE_GEMV) { if (ConfigDict::instance().m_dict["GEMX_runGemv"] != "1") { return XFBLAS_STATUS_INVALID_VALUE; } - // int l_err = 0; - // XFpgaHold::instance().m_xFpgaPtr = shared_ptr(new XFpga(xclbin, logFile, &l_err)); - // if (l_err != 0) { - // return XFBLAS_STATUS_NOT_INITIALIZED; - //} for (unsigned int i = 0; i < kernelNumber; i++) { BLASHostHandle::instance().m_handlePtr[deviceIndex].push_back( - shared_ptr(new GEMVHost(xclbin, logFile, &l_status, i, deviceIndex))); + shared_ptr(new GEMVHost(xclbin, &l_status, i, deviceIndex))); } return l_status; diff --git a/L3/include/sw/xf_blas/wrapper_async.hpp b/L3/include/sw/xf_blas/wrapper_async.hpp index 1d9cc8d163..e85bf49713 100644 --- 
a/L3/include/sw/xf_blas/wrapper_async.hpp +++ b/L3/include/sw/xf_blas/wrapper_async.hpp @@ -26,96 +26,8 @@ namespace xf { namespace blas { -vector concurrentKernels; vector > fuStatus; -/** - * @brief This asynchronous function copies a matrix in host memory to FPGA device memory. xfblasMalloc() need to be - * called prior to this function. - * @param rows number of rows in the matrix - * @param cols number of cols in the matrix that is being used - * @param elemSize number of bytes required to store each element in the matrix - * @param A pointer to the matrix array in the host memory - * @param lda leading dimension of the matrix that indicates the total number of cols in the matrix - * @param d_A pointer to mapped memory - * @param kernelIndex index of kernel that is being used, default is 0 - * @param deviceIndex index of device that is being used, default is 0 - */ -void xfblasSetMatrixAsync(int rows, - int cols, - int elemSize, - short* A, - int lda, - short* d_A, - unsigned int kernelIndex = 0, - unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); - fuStatus.push_back(async( - launch::async, [&] { return xfblasSetMatrix(rows, cols, elemSize, A, lda, d_A, kernelIndex, deviceIndex); })); -} - -void xfblasSetMatrixAsync(int rows, - int cols, - int elemSize, - float* A, - int lda, - float* d_A, - unsigned int kernelIndex = 0, - unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); - fuStatus.push_back(async( - launch::async, [&] { return xfblasSetMatrix(rows, cols, elemSize, A, lda, d_A, kernelIndex, deviceIndex); })); -} - -/** - * @brief This asynchronous function copies a vector in host memory to FPGA device memory. xfblasMalloc() need to be - * called prior to this function. 
- * @param n number of elements in vector - * @param elemSize number of bytes required to store each element in the vector - * @param x pointer to the vector in the host memory - * @param incx the storage spacing between consecutive elements of vector x - * @param d_x pointer to mapped memory - * @param kernelIndex index of kernel that is being used, default is 0 - * @param deviceIndex index of device that is being used, default is 0 - */ -void xfblasSetVectorAsync( - int n, int elemSize, short* x, int incx, short* d_x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); - fuStatus.push_back( - async(launch::async, [&] { return xfblasSetVector(n, elemSize, x, incx, d_x, kernelIndex, deviceIndex); })); -} - -void xfblasSetVectorAsync( - int n, int elemSize, float* x, int incx, float* d_x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); - fuStatus.push_back( - async(launch::async, [&] { return xfblasSetVector(n, elemSize, x, incx, d_x, kernelIndex, deviceIndex); })); -} - -/** - * @brief This asynchronous function copies a matrix in host memory to FPGA device memory. xfblasMallocRestricted() need - * to be called prior to this function. - * @param A pointer to the matrix array in the host memory - * @param kernelIndex index of kernel that is being used, default is 0 - * @param deviceIndex index of device that is being used, default is 0 - */ -void xfblasSetMatrixRestrictedAsync(void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); - fuStatus.push_back(async(launch::async, xfblasSetMatrixRestricted, A, kernelIndex, deviceIndex)); -} - -/** - * @brief This asynchronous function copies a vector in host memory to FPGA device memory. xfblasMallocRestricted() need - * to be called prior to this function. 
- * @param x pointer to the vector in the host memory - * @param kernelIndex index of kernel that is being used, default is 0 - * @param deviceIndex index of device that is being used, default is 0 - */ -void xfblasSetVectorRestrictedAsync(void* x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); - fuStatus.push_back(async(launch::async, xfblasSetVectorRestricted, x, kernelIndex, deviceIndex)); -} - /** * @brief This asynchronous function copies a matrix in FPGA device memory to host memory * @param rows number of rows in the matrix @@ -135,7 +47,6 @@ void xfblasGetMatrixAsync(int rows, int lda, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); fuStatus.push_back(async( launch::async, [&] { return xfblasGetMatrix(rows, cols, elemSize, d_A, A, lda, kernelIndex, deviceIndex); })); } @@ -148,7 +59,6 @@ void xfblasGetMatrixAsync(int rows, int lda, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); fuStatus.push_back(async( launch::async, [&] { return xfblasGetMatrix(rows, cols, elemSize, d_A, A, lda, kernelIndex, deviceIndex); })); } @@ -165,14 +75,12 @@ void xfblasGetMatrixAsync(int rows, */ void xfblasGetVectorAsync( int n, int elemSize, short* d_x, short* x, int incx, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); fuStatus.push_back( async(launch::async, [&] { return xfblasGetVector(n, elemSize, d_x, x, incx, kernelIndex, deviceIndex); })); } void xfblasGetVectorAsync( int n, int elemSize, float* d_x, float* x, int incx, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); fuStatus.push_back( async(launch::async, [&] { return xfblasGetVector(n, elemSize, d_x, x, incx, kernelIndex, deviceIndex); })); } @@ -184,7 +92,6 @@ void xfblasGetVectorAsync( * @param deviceIndex index of device that is 
being used, default is 0 */ void xfblasGetMatrixRestrictedAsync(void* A, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); fuStatus.push_back(async(launch::async, xfblasGetMatrixRestricted, A, kernelIndex, deviceIndex)); } @@ -195,7 +102,6 @@ void xfblasGetMatrixRestrictedAsync(void* A, unsigned int kernelIndex = 0, unsig * @param deviceIndex index of device that is being used, default is 0 */ void xfblasGetVectorRestrictedAsync(void* x, unsigned int kernelIndex = 0, unsigned int deviceIndex = 0) { - concurrentKernels.push_back(kernelIndex); fuStatus.push_back(async(launch::async, xfblasGetVectorRestricted, x, kernelIndex, deviceIndex)); } @@ -204,14 +110,6 @@ void xfblasKernelSynchronize() { fu.wait(); } fuStatus.clear(); - concurrentKernels.clear(); -} - -// TODO, potential bugs, probably not needed -void xfblasKernelSynchronize(unsigned int kernelIndex) { - for (unsigned int i = 0; i < concurrentKernels.size(); i++) { - if (concurrentKernels[i] == kernelIndex) fuStatus[i].wait(); - } } } // namespace blas diff --git a/L3/overlay/README.md b/L3/overlay/README.md index 77d60d76d6..7c65f4c3cb 100644 --- a/L3/overlay/README.md +++ b/L3/overlay/README.md @@ -1 +1,3 @@ -This directory contains link to download xclbins +The .xclbin files used for running L3 benchmark, examples and tests can be downloaded from the following link. 
+ +[xclbin files for Nimbix U200 and U250 Nodes](https://www.xilinx.com/bin/public/openDownload?filename=vitis_BLAS_library_r1.0_xclbin.tar) diff --git a/L3/overlay/u200_xdma_201830_2/gemm_float_1kernel/config_info.dat b/L3/overlay/u200_xdma_201830_2/gemm_float_1kernel/config_info.dat deleted file mode 100644 index f4b9d5dfde..0000000000 --- a/L3/overlay/u200_xdma_201830_2/gemm_float_1kernel/config_info.dat +++ /dev/null @@ -1,26 +0,0 @@ -GEMX_gemmMBlocks=4 -GEMX_gemmKBlocks=4 -GEMX_gemmNBlocks=4 -GEMX_splitMesh=0 -GEMX_keepMacBits=0 -GEMX_macBits=48 -GEMX_XdataType=float -GEMX_XddrWidth=16 -TEST_MEMCPY=0 -GEMX_instructionSizeBytes=64 -GEMX_dataType=float -GEMX_dataEqIntType=int -GEMX_ddrWidth=16 -GEMX_argInstrWidth=1 -GEMX_numInstr=16 -GEMX_argPipeline=2 -GEMX_part=u200 -GEMX_runTransp=0 -GEMX_runGemv=0 -GEMX_runGemm=1 -GEMX_runSpmv=0 -GEMX_runUspmv=0 -GEMX_runFcn=0 -GEMX_numKernels=1 -GEMX_systolicArray=1 -GEMX_fpgaDdrBanks=XCL_MEM_DDR_BANK0,XCL_MEM_DDR_BANK3,XCL_MEM_DDR_BANK1,XCL_MEM_DDR_BANK2 diff --git a/L3/overlay/u200_xdma_201830_2/gemm_float_1kernel/gemx.xclbin b/L3/overlay/u200_xdma_201830_2/gemm_float_1kernel/gemx.xclbin deleted file mode 100644 index 354e2b4608..0000000000 Binary files a/L3/overlay/u200_xdma_201830_2/gemm_float_1kernel/gemx.xclbin and /dev/null differ diff --git a/L3/overlay/u200_xdma_201830_2/gemm_float_2kernel/config_info.dat b/L3/overlay/u200_xdma_201830_2/gemm_float_2kernel/config_info.dat deleted file mode 100644 index e5dd1b5ab3..0000000000 --- a/L3/overlay/u200_xdma_201830_2/gemm_float_2kernel/config_info.dat +++ /dev/null @@ -1,26 +0,0 @@ -GEMX_gemmMBlocks=4 -GEMX_gemmKBlocks=4 -GEMX_gemmNBlocks=4 -GEMX_splitMesh=0 -GEMX_keepMacBits=0 -GEMX_macBits=48 -GEMX_XdataType=float -GEMX_XddrWidth=16 -TEST_MEMCPY=0 -GEMX_instructionSizeBytes=64 -GEMX_dataType=float -GEMX_dataEqIntType=int -GEMX_ddrWidth=16 -GEMX_argInstrWidth=1 -GEMX_numInstr=16 -GEMX_argPipeline=2 -GEMX_part=u200 -GEMX_runTransp=0 -GEMX_runGemv=0 -GEMX_runGemm=1 
-GEMX_runSpmv=0 -GEMX_runUspmv=0 -GEMX_runFcn=0 -GEMX_numKernels=2 -GEMX_systolicArray=1 -GEMX_fpgaDdrBanks=XCL_MEM_DDR_BANK0,XCL_MEM_DDR_BANK3,XCL_MEM_DDR_BANK1,XCL_MEM_DDR_BANK2 diff --git a/L3/overlay/u200_xdma_201830_2/gemm_float_2kernel/gemx.xclbin b/L3/overlay/u200_xdma_201830_2/gemm_float_2kernel/gemx.xclbin deleted file mode 100644 index d8050599f9..0000000000 Binary files a/L3/overlay/u200_xdma_201830_2/gemm_float_2kernel/gemx.xclbin and /dev/null differ diff --git a/L3/overlay/u200_xdma_201830_2/gemm_short_2kernel/config_info.dat b/L3/overlay/u200_xdma_201830_2/gemm_short_2kernel/config_info.dat deleted file mode 100644 index 42e897697b..0000000000 --- a/L3/overlay/u200_xdma_201830_2/gemm_short_2kernel/config_info.dat +++ /dev/null @@ -1,25 +0,0 @@ -GEMX_gemmMBlocks=4 -GEMX_gemmKBlocks=4 -GEMX_gemmNBlocks=4 -GEMX_splitMesh=1 -GEMX_keepMacBits=1 -GEMX_macBits=48 -GEMX_XdataType=short -GEMX_XddrWidth=32 -TEST_MEMCPY=0 -GEMX_instructionSizeBytes=64 -GEMX_dataType=short -GEMX_dataEqIntType=short -GEMX_ddrWidth=32 -GEMX_argInstrWidth=1 -GEMX_numInstr=16 -GEMX_argPipeline=2 -GEMX_part=u200 -GEMX_runTransp=0 -GEMX_runGemv=0 -GEMX_runGemm=1 -GEMX_runSpmv=0 -GEMX_runUspmv=0 -GEMX_runFcn=0 -GEMX_numKernels=2 -GEMX_fpgaDdrBanks=XCL_MEM_DDR_BANK0,XCL_MEM_DDR_BANK3,XCL_MEM_DDR_BANK1,XCL_MEM_DDR_BANK2 diff --git a/L3/overlay/u200_xdma_201830_2/gemm_short_2kernel/gemx.xclbin b/L3/overlay/u200_xdma_201830_2/gemm_short_2kernel/gemx.xclbin deleted file mode 100644 index 805af3d419..0000000000 Binary files a/L3/overlay/u200_xdma_201830_2/gemm_short_2kernel/gemx.xclbin and /dev/null differ diff --git a/L3/overlay/vcu1525_dynamic_5_1/gemm_float_1kernel/config_info.dat b/L3/overlay/vcu1525_dynamic_5_1/gemm_float_1kernel/config_info.dat deleted file mode 100644 index 4a29db4944..0000000000 --- a/L3/overlay/vcu1525_dynamic_5_1/gemm_float_1kernel/config_info.dat +++ /dev/null @@ -1,25 +0,0 @@ -GEMX_gemmMBlocks=4 -GEMX_gemmKBlocks=4 -GEMX_gemmNBlocks=4 -GEMX_splitMesh=1 
-GEMX_keepMacBits=0 -GEMX_macBits=48 -GEMX_XdataType=float -GEMX_XddrWidth=16 -TEST_MEMCPY=0 -GEMX_instructionSizeBytes=64 -GEMX_dataType=float -GEMX_dataEqIntType=int -GEMX_ddrWidth=16 -GEMX_argInstrWidth=1 -GEMX_numInstr=16 -GEMX_argPipeline=2 -GEMX_part=vcu1525 -GEMX_runTransp=0 -GEMX_runGemv=0 -GEMX_runGemm=1 -GEMX_runSpmv=0 -GEMX_runUspmv=0 -GEMX_runFcn=0 -GEMX_numKernels=1 -GEMX_fpgaDdrBanks=XCL_MEM_DDR_BANK0,XCL_MEM_DDR_BANK3,XCL_MEM_DDR_BANK1,XCL_MEM_DDR_BANK2 diff --git a/L3/overlay/vcu1525_dynamic_5_1/gemm_float_1kernel/gemx.xclbin b/L3/overlay/vcu1525_dynamic_5_1/gemm_float_1kernel/gemx.xclbin deleted file mode 100644 index 0f87162a1d..0000000000 Binary files a/L3/overlay/vcu1525_dynamic_5_1/gemm_float_1kernel/gemx.xclbin and /dev/null differ diff --git a/L3/overlay/vcu1525_dynamic_5_1/gemm_short_1kernel/config_info.dat b/L3/overlay/vcu1525_dynamic_5_1/gemm_short_1kernel/config_info.dat deleted file mode 100644 index c252be327f..0000000000 --- a/L3/overlay/vcu1525_dynamic_5_1/gemm_short_1kernel/config_info.dat +++ /dev/null @@ -1,24 +0,0 @@ -GEMX_gemmMBlocks=4 -GEMX_gemmKBlocks=4 -GEMX_gemmNBlocks=4 -GEMX_splitMesh=1 -GEMX_keepMacBits=1 -GEMX_macBits=48 -GEMX_XdataType=short -GEMX_XddrWidth=32 -TEST_MEMCPY=0 -GEMX_dataType=short -GEMX_dataEqIntType=short -GEMX_ddrWidth=32 -GEMX_argInstrWidth=1 -GEMX_numInstr=16 -GEMX_argPipeline=2 -GEMX_part=vcu1525 -GEMX_runTransp=0 -GEMX_runGemv=0 -GEMX_runGemm=1 -GEMX_runSpmv=0 -GEMX_runUspmv=0 -GEMX_runFcn=0 -GEMX_numKernels=1 -GEMX_fpgaDdrBanks=XCL_MEM_DDR_BANK0,XCL_MEM_DDR_BANK3,XCL_MEM_DDR_BANK1,XCL_MEM_DDR_BANK2 diff --git a/L3/overlay/vcu1525_dynamic_5_1/gemm_short_1kernel/gemx.xclbin b/L3/overlay/vcu1525_dynamic_5_1/gemm_short_1kernel/gemx.xclbin deleted file mode 100644 index 1679c22064..0000000000 Binary files a/L3/overlay/vcu1525_dynamic_5_1/gemm_short_1kernel/gemx.xclbin and /dev/null differ diff --git a/L3/overlay/vcu1525_dynamic_5_1/gemv_short_1kernel/config_info.dat 
b/L3/overlay/vcu1525_dynamic_5_1/gemv_short_1kernel/config_info.dat deleted file mode 100644 index eb51d0d52a..0000000000 --- a/L3/overlay/vcu1525_dynamic_5_1/gemv_short_1kernel/config_info.dat +++ /dev/null @@ -1,21 +0,0 @@ -GEMX_gemvkVectorBlocks=512 -GEMX_gemvmVectorBlocks=512 -GEMX_gemvmGroups=1 -GEMX_transpBlocks=8 -TEST_MEMCPY=0 -GEMX_instructionSizeBytes=64 -GEMX_dataType=short -GEMX_dataEqIntType=short -GEMX_ddrWidth=32 -GEMX_argInstrWidth=1 -GEMX_numInstr=16 -GEMX_argPipeline=2 -GEMX_part=vcu1525 -GEMX_runTransp=0 -GEMX_runGemv=1 -GEMX_runGemm=0 -GEMX_runSpmv=0 -GEMX_runUspmv=0 -GEMX_runFcn=0 -GEMX_numKernels=1 -GEMX_fpgaDdrBanks=XCL_MEM_DDR_BANK0,XCL_MEM_DDR_BANK2,XCL_MEM_DDR_BANK3,XCL_MEM_DDR_BANK1 diff --git a/L3/overlay/vcu1525_dynamic_5_1/gemv_short_1kernel/gemx.xclbin b/L3/overlay/vcu1525_dynamic_5_1/gemv_short_1kernel/gemx.xclbin deleted file mode 100644 index e656734230..0000000000 Binary files a/L3/overlay/vcu1525_dynamic_5_1/gemv_short_1kernel/gemx.xclbin and /dev/null differ diff --git a/L3/src/sw/api.cpp b/L3/src/sw/api.cpp new file mode 100644 index 0000000000..08faf07d22 --- /dev/null +++ b/L3/src/sw/api.cpp @@ -0,0 +1,239 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "handle.hpp" +#include "gemm_host.hpp" +#include "gemv_host.hpp" +#include "helpers/funcs/fcn_host.hpp" +#include "api.hpp" + +using namespace xf::blas; + +bool xfblasCreate(char* xclbin, char* engineName, unsigned int kernelNumber, unsigned int deviceIndex) { + int l_err = 0; + shared_ptr l_xFpga(new XFpga(xclbin, &l_err, deviceIndex)); + XFpgaHold::instance().m_xFpgaPtr[deviceIndex] = l_xFpga; + + if (l_err != 0) { + return false; + } + xfblasStatus_t l_status = XFBLAS_STATUS_SUCCESS; + + if (strcmp(engineName, "Gemm") == 0) { + for (unsigned int i = 0; i < kernelNumber; i++) { + BLASHostHandle::instance().m_handlePtr[deviceIndex].push_back( + shared_ptr(new GEMMHost(xclbin, &l_status, i, deviceIndex))); + } + return true; + } else if (strcmp(engineName, "Gemv") == 0) { + for (unsigned int i = 0; i < kernelNumber; i++) { + BLASHostHandle::instance().m_handlePtr[deviceIndex].push_back( + shared_ptr(new GEMVHost(xclbin, &l_status, i, deviceIndex))); + } + return true; + } else if (strcmp(engineName, "Fcn") == 0) { + for (unsigned int i = 0; i < kernelNumber; i++) { + BLASHostHandle::instance().m_handlePtr[deviceIndex].push_back( + shared_ptr(new FCNHost(xclbin, &l_status, i, deviceIndex))); + } + return true; + } else { + return false; + } +} + +bool xfblasSend(void* A, unsigned long long numElem, int elemSize, unsigned int kernelIndex, unsigned int deviceIndex) { + unsigned long long l_bufSize = numElem * elemSize; + xfblasStatus_t l_status = + BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->allocMatRestricted(A, A, l_bufSize); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + l_status = BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->setMatToFPGARestricted(A); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; +} + +bool xfblasGet(void* A, unsigned int kernelIndex, unsigned int deviceIndex) { + xfblasStatus_t l_status = 
BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->execute(); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + l_status = BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->getMatRestricted(A, A); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; +} + +bool xfblasGetByAddress( + void* A, unsigned long long p_bufSize, unsigned int offset, unsigned int kernelIndex, unsigned int deviceIndex) { + xfblasStatus_t l_status = + BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->getMatByAddress(A, p_bufSize, offset); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; +} + +bool xfblasExecute(unsigned int kernelIndex, unsigned int deviceIndex) { + xfblasStatus_t l_status = BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->execute(); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; +} + +void xfblasExecuteAsync(unsigned int numkernels, unsigned int deviceIndex) { +#pragma omp parallel + { + omp_set_dynamic(0); + omp_set_num_threads(numkernels); +#pragma omp for + for (int i = 0; i < numkernels; i++) { + BLASHostHandle::instance().m_handlePtr[deviceIndex][i]->execute(); + } + } +} + +void xfblasFreeInstr(unsigned int kernelIndex, unsigned int deviceIndex) { + BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->clearInstrBuf(); +} + +void xfblasFree(void* A, unsigned int kernelIndex, unsigned int deviceIndex) { + BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex]->freeMat(A); +} + +void xfblasDestroy(unsigned int kernelNumber, unsigned int deviceIndex) { + for (unsigned int i = 0; i < kernelNumber; i++) { + BLASHostHandle::instance().m_handlePtr[deviceIndex][i]->clearInstrBuf(); + BLASHostHandle::instance().m_handlePtr[deviceIndex][i]->closeContext(i); + } + BLASHostHandle::instance().m_handlePtr[deviceIndex][0]->closeDevice(); + XFpgaHold::instance().m_xFpgaPtr.clear(); + 
BLASHostHandle::instance().m_handlePtr.clear(); +} + +bool xfblasGemm(int m, + int n, + int k, + int alpha, + void* A, + int lda, + void* B, + int ldb, + int beta, + void* C, + int ldc, + unsigned int kernelIndex, + unsigned int deviceIndex) { + if (alpha == 1 && beta == 1) { + GEMMHost* l_gemmPtr = + static_cast(BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex].get()); + xfblasStatus_t l_status = l_gemmPtr->addGEMMOp(A, B, C, C, m, n, k, lda, ldb, ldc, ldc, 1, 0); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; + } else { + return false; + } +} + +bool xfblasGemv(int m, + int n, + int alpha, + void* A, + int lda, + void* x, + int incx, + int beta, + void* y, + int incy, + unsigned int kernelIndex, + unsigned int deviceIndex) { + if (alpha == 1 && beta == 1 && incx == 1 && incy == 1) { + GEMVHost* l_gemvPtr = + static_cast(BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex].get()); + xfblasStatus_t l_status = l_gemvPtr->addGEMVOp(A, x, y, m, n, lda); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; + } else { + return false; + } +} + +bool xfblasFcn(int m, + int n, + int k, + int alpha, + void* A, + int lda, + void* B, + int ldb, + int beta, + void* C, + int ldc, + void* X, + int ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha, + unsigned int kernelIndex, + unsigned int deviceIndex) { + if (alpha == 1 && beta == 1) { + FCNHost* l_fcnPtr = + static_cast(BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex].get()); + xfblasStatus_t l_status = l_fcnPtr->addFCNOp(A, B, C, X, m, n, k, lda, ldb, ldc, ldx, p_postScale, p_postShift, + p_preluScale, p_preluAlpha); + if (l_status != XFBLAS_STATUS_SUCCESS) { + return false; + } + return true; + } else { + return false; + } +} + +bool xfblasFcnByAddress(unsigned int l_aOff, + unsigned int l_bOff, + unsigned int l_cOff, + unsigned int l_xOff, + unsigned int p_m, + unsigned int p_n, + unsigned 
int p_k, + unsigned int p_lda, + unsigned int p_ldb, + unsigned int p_ldc, + unsigned int p_ldx, + int p_postScale, + int p_postShift, + short p_preluScale, + short p_preluAlpha, + unsigned int kernelIndex, + unsigned int deviceIndex) { + FCNHost* l_fcnPtr = static_cast(BLASHostHandle::instance().m_handlePtr[deviceIndex][kernelIndex].get()); + xfblasStatus_t l_status = + l_fcnPtr->addFCNOpByAddress(l_aOff, l_bOff, l_cOff, l_xOff, p_m, p_n, p_k, p_lda, p_ldb, p_ldc, p_ldx, + p_postScale, p_postShift, p_preluScale, p_preluAlpha); + return true; +} diff --git a/L3/src/sw/python_api/Makefile b/L3/src/sw/python_api/Makefile new file mode 100644 index 0000000000..5929a1cb54 --- /dev/null +++ b/L3/src/sw/python_api/Makefile @@ -0,0 +1,152 @@ + +# +# Copyright 2019 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# ----------------------------------------------------------------------------- +# project common settings + +MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +CUR_DIR := $(patsubst %/,%,$(dir $(MK_PATH))) + +.SECONDEXPANSION: + +# ----------------------------------------------------------------------------- +# common tool setup + + +.PHONY: help + +help:: + @echo "" + @echo "Makefile Usage:" + @echo "" + @echo " make api " + @echo " Command to build shared library file." + @echo "" + @echo " make clean " + @echo " Command to remove the generated non-hardware files." 
+ @echo "" + @echo " make cleanall" + @echo " Command to remove all the generated files." + @echo "" + +ifeq (,$(XILINX_XRT)) +XILINX_XRT = /opt/xilinx/xrt +endif +export XILINX_XRT +.PHONY: check_xrt +check_xrt: +ifeq (,$(wildcard $(XILINX_XRT)/lib/libxilinxopencl.so)) + @echo "Cannot locate XRT installation. Please set XILINX_XRT variable." && false +endif + +export PATH := $(XILINX_XRT)/bin:$(PATH) + +ifeq (,$(LD_LIBRARY_PATH)) +LD_LIBRARY_PATH := $(XILINX_XRT)/lib +else +LD_LIBRARY_PATH := $(XILINX_XRT)/lib:$(LD_LIBRARY_PATH) +endif + +# ----------------------------------------------------------------------------- +# BEGIN_XF_MK_USER_SECTION +# ----------------------------------------------------------------------------- + +XF_PROJ_ROOT ?= $(CUR_DIR)/../../../.. +XFLIB_DIR := $(abspath $(XF_PROJ_ROOT)) + +XCLBIN_FILE := +KERNELS := + +# ----------------------------------------------------------------------------- + +SRC_DIR = $(CUR_DIR) + +EXE_NAME = +HOST_ARGS = + +SRCS = $(XFLIB_DIR)/L3/src/sw/api.cpp + +CXXFLAGS += -g -I$(XILINX_XRT)/include -I $(XFLIB_DIR)/L3/include/sw/xf_blas + +# ----------------------------------------------------------------------------- +# END_XF_MK_USER_SECTION +# ----------------------------------------------------------------------------- + +.PHONY: all +all: host + +OBJ_DIR_BASE ?= obj +BIN_DIR_BASE ?= bin + + +OBJ_DIR = $(CUR_DIR)/$(OBJ_DIR_BASE)$(BIN_DIR_SUFFIX) +BIN_DIR = $(CUR_DIR)/$(BIN_DIR_BASE)$(BIN_DIR_SUFFIX) + +RUN_ENV = +OBJ_FILES = $(CUR_DIR)/$(OBJ_DIR_BASE)/api.o +EXTRA_OBJS = + +CXX := /usr/bin/gcc +CC := /urs/bin/gcc + +CXXFLAGS += -O3 -std=c++11 -fPIC -fopenmp -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-result +CFLAGS += +LDFLAGS += -pthread -L$(XILINX_XRT)/lib -lxilinxopencl +LDFLAGS += -L$(XILINX_XRT)/lib -lz -lstdc++ -lrt -pthread -lxrt_core -ldl -luuid + +EXE_EXT ?= exe +EXE_FILE = + +API_OUT = xfblas.so + +.PHONY: host + +host: api + +api: check_xrt $(API_OUT) + +$(OBJ_FILES): $(SRCS) 
+ mkdir -p $(OBJ_DIR_BASE) + $(CXX) -shared -c -o $@ $< $(CXXFLAGS) + +$(API_OUT): $(OBJ_FILES) + mkdir -p lib + $(CXX) -shared -o lib/$@ $(OBJ_FILES) $(LDFLAGS) + chmod a+rx lib + + +# ----------------------------------------------------------------------------- +# clean up + +clean: +ifneq (,$(OBJ_DIR_BASE)) + rm -rf $(CUR_DIR)/$(OBJ_DIR_BASE)* +endif +ifneq (,$(BIN_DIR_BASE)) + rm -rf $(CUR_DIR)/$(BIN_DIR_BASE)* +endif + rm -rf lib + +cleanall: clean + +.PHONY: run + +run: api + +check: run + + diff --git a/L3/src/sw/python_api/keras_rt.py b/L3/src/sw/python_api/keras_rt.py new file mode 100644 index 0000000000..ef89d17d3e --- /dev/null +++ b/L3/src/sw/python_api/keras_rt.py @@ -0,0 +1,55 @@ + # Copyright 2019 Xilinx, Inc. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ +import numpy as np +import xfblas_L3 as xfblas +from xfblas_L3_rt import XfblasRT + +class KerasRT(XfblasRT): + def __init__(self, keras_model, xclbin_opts, wgt_scale, bias_scale, post_scale, relu_scale,idxKernel,idxDevice): + keras_w = keras_model.get_weights()[0::2] + keras_b = keras_model.get_weights()[1::2] + XfblasRT.__init__(self, xclbin_opts, keras_w, keras_b, wgt_scale, bias_scale, post_scale,relu_scale,idxKernel,idxDevice) + self.kmodel = keras_model + + def loadInstr(self): + xfblas.freeInstr(self.idxKernel,self.idxDevice) + for i,l in enumerate(self.kmodel.layers): + act = l.get_config()['activation'] + if self._qw[0].dtype == np.float32: + if act == 'relu': + xfblas.fcnOp( self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], 1, 0, 0, 0, self.idxKernel,self.idxDevice) + else: + xfblas.fcnOp( self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], 1, 0, 1, 0, self.idxKernel,self.idxDevice) + else: + if act == 'relu': + xfblas.fcnOp( self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], self.post_scale[i][0], self.post_scale[i][1], 0, 0, self.idxKernel,self.idxDevice) + else: + xfblas.fcnOp( self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], self.post_scale[i][0], self.post_scale[i][1], 1, 0, self.idxKernel,self.idxDevice) + + def loadInstrByAddress(self): + xfblas.freeInstr(self.idxKernel,self.idxDevice) + numLayers= len(self.kmodel.layers) + for i,l in enumerate(self.kmodel.layers): + act = l.get_config()['activation'] + if self._qw[0].dtype == np.float32: + if act == 'relu': + xfblas.fcnOpByAddress(self.offset_list[2*numLayers+i],self.offset_list[i],self.offset_list[2*numLayers+i+1],self.offset_list[numLayers+i], self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], 1, 0, 0, 0, self.idxKernel,self.idxDevice) + else: + xfblas.fcnOpByAddress(self.offset_list[2*numLayers+i],self.offset_list[i],self.offset_list[2*numLayers+i+1],self.offset_list[numLayers+i],self.fpga_buf[i], self._qw[i], 
self.fpga_buf[i+1], self._qb[i], 1, 0, 1, 0, self.idxKernel,self.idxDevice) + else: + if act == 'relu': + xfblas.fcnOpByAddress(self.offset_list[2*numLayers+i],self.offset_list[i],self.offset_list[2*numLayers+i+1],self.offset_list[numLayers+i],self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], self.post_scale[i][0], self.post_scale[i][1], 0, 0, self.idxKernel,self.idxDevice) + else: + xfblas.fcnOpByAddress(self.offset_list[2*numLayers+i],self.offset_list[i],self.offset_list[2*numLayers+i+1],self.offset_list[numLayers+i],self.fpga_buf[i], self._qw[i], self.fpga_buf[i+1], self._qb[i], self.post_scale[i][0], self.post_scale[i][1], 1, 0, self.idxKernel,self.idxDevice) \ No newline at end of file diff --git a/L3/src/sw/python_api/xfblas_L3.py b/L3/src/sw/python_api/xfblas_L3.py new file mode 100644 index 0000000000..ad014838cd --- /dev/null +++ b/L3/src/sw/python_api/xfblas_L3.py @@ -0,0 +1,418 @@ + # Copyright 2019 Xilinx, Inc. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ +from ctypes import * +import numpy as np +import sys +import argparse +import os + +class XFBLASManager: + def __init__(self,libFile): + self._lib = cdll.LoadLibrary(libFile) + self._lib.xfblasCreate.argtypes = [c_char_p,c_char_p,c_uint,c_uint] + self._lib.xfblasCreate.restype = c_bool + self._lib.xfblasSend.argtypes = [np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"),c_ulonglong,c_uint,c_uint,c_uint] + self._lib.xfblasSend.restype = c_bool + self._lib.xfblasGet.argtypes = [np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"),c_uint,c_uint] + self._lib.xfblasGet.restype = c_bool + self._lib.xfblasFreeInstr.argtypes = [c_uint,c_uint] + self._lib.xfblasDestroy.argtypes = [c_uint,c_uint] + self._lib.xfblasFree.argtypes = [np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"),c_uint,c_uint] + self._lib.xfblasGemm.argtypes = [c_uint,c_uint,c_uint,c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + c_uint,c_uint] + self._lib.xfblasGemm.restype = c_bool + self._lib.xfblasGemv.argtypes = [c_uint,c_uint,c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + c_uint,c_uint] + self._lib.xfblasGemv.restype = c_bool + self._lib.xfblasFcn.argtypes = [c_uint,c_uint,c_uint,c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"), c_uint, + c_int,c_int, + c_short,c_short, + c_uint,c_uint] + self._lib.xfblasFcnByAddress.argtypes = [c_uint,c_uint,c_uint,c_uint, + c_uint,c_uint,c_uint,c_uint, + c_uint,c_uint,c_uint, + c_int,c_int, + c_short,c_short, + c_uint,c_uint] + self._lib.xfblasGemm.restype = c_bool + self._lib.xfblasGetByAddress.argtypes = 
[np.ctypeslib.ndpointer(flags="C_CONTIGUOUS"),c_ulonglong,c_uint,c_uint,c_uint] + self._lib.xfblasGetByAddress.restype = c_bool + self._lib.xfblasExecuteAsync.argtypes = [c_uint,c_uint] + self._lib.xfblasExecute.argtypes = [c_uint,c_uint] + + def createGemm(self,xclbin,numKernel,idxDevice): + ''' + create Gemm Handle + + Parameters + ---------- + xclbin + file path for FPGA bitstream + numKernel + number of CUs in the xclbin + idxDeivce + index of local device to be used + ''' + b_xclbin = xclbin.encode('utf-8') + b_log = xclbin.encode('utf-8') + return self._lib.xfblasCreate(b_xclbin,b'Gemm',numKernel,idxDevice) + + def createGemv(self,xclbin,numKernel,idxDevice): + ''' + create Gemv Handle + + Parameters + ---------- + xclbin + file path for FPGA bitstream + numKernel + number of CUs in the xclbin + idxDeivce + index of local device to be used + ''' + b_xclbin = xclbin.encode('utf-8') + b_log = xclbin.encode('utf-8') + return self._lib.xfblasCreate(b_xclbin,b'Gemv',numKernel,idxDevice) + + def createFcn(self,xclbin,numKernel,idxDevice): + ''' + create Fcn Handle + + Parameters + ---------- + xclbin + file path for FPGA bitstream + numKernel + number of CUs in the xclbin + idxDeivce + index of local device to be used + ''' + b_xclbin = xclbin.encode('utf-8') + b_log = xclbin.encode('utf-8') + return self._lib.xfblasCreate(b_xclbin,b'Fcn',numKernel,idxDevice) + + def sendMat(self,A,idxKernel,idxDevice): + ''' + send mat from host to device + + Parameters + ---------- + A: ndarray + matrix in host memory + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasSend(A,c_ulonglong(A.size),c_uint(A.itemsize),idxKernel,idxDevice) + + def getMat(self,A,idxKernel,idxDevice): + ''' + get mat from device to host + + Parameters + ---------- + A: ndarray + matrix in host memory + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return 
self._lib.xfblasGet(A,idxKernel,idxDevice) + + def freeInstr(self,idxKernel,idxDevice): + ''' + free memory for instructions + + Parameters + ---------- + idxKernel + index of kernel to be used + idxDeivce + index of local device to be used + ''' + return self._lib.xfblasFreeInstr(idxKernel,idxDevice) + + def freeMat(self,A,idxKernel,idxDevice): + ''' + free device memory for mat A + + Parameters + ---------- + A: ndarray + matrix in host memory + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasFree(A,idxKernel,idxDevice) + + def destroy(self,numKernel,idxDevice): + ''' + release handle used by the XFBLAS library + + Parameters + ---------- + numKernel + number of CUs in the xclbin + idxDeivce + index of local device to be used + ''' + return self._lib.xfblasDestroy(numKernel,idxDevice) + + def gemmOp(self,A,B,C,idxKernel,idxDevice): + ''' + perform matrix-matrix multiplication of C=A*B + + Parameters + ---------- + A: ndarray + matrix in host memory + B: ndarray + matrix in host memory + C: ndarray + matrix in host memory + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasGemm(c_uint(A.shape[0]), c_uint(B.shape[1]), c_uint(A.shape[1]), 1, A, c_uint(A.shape[1]), B, c_uint(B.shape[1]), 1, C, c_uint(B.shape[1]),idxKernel,idxDevice) + + def gemvOp(self,A,x,y,idxKernel,idxDevice): + ''' + perform matrix-vector multiplication of y=A*x + + Parameters + ---------- + A: ndarray + matrix in host memory + x: ndarray + vector in host memory + y: ndarray + vector in host memory + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasGemv(c_uint(A.shape[0]), c_uint(A.shape[1]),1, A, c_uint(A.shape[1]), x, 1, y, 1, idxKernel,idxDevice) + + def fcnOp(self,A,B,C,X,postScale,postShift,preluScale,preluAlpha,idxKernel,idxDevice): + ''' + perform 
matrix-matrix multiplication with p_relu, and quantization support of C = relu ((A * B + X) * postScale >> postShift) + + Parameters + ---------- + A: ndarray + matrix in host memory + B: ndarray + matrix in host memory + C: ndarray + matrix in host memory + X: ndarray + matrix in host memory + postScale: int + multiply the output values with specific scalar + postShift: int + shift the output values with specific scalar + PReLUScale: int + multiply the output values with specific scalar when output values < 0 + PReLUAlpha: int + shift the output values with specific scalar when output values < 0 + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasFcn(c_uint(A.shape[0]), c_uint(B.shape[1]), c_uint(A.shape[1]), 1, A, c_uint(A.shape[1]), B, c_uint(B.shape[1]), 1, C, c_uint(C.shape[1]),X,c_uint(X.shape[1]),postScale,postShift,preluScale,preluAlpha,idxKernel,idxDevice) + + def fcnOpByAddress(self,a,b,c,x,A,B,C,X,postScale,postShift,preluScale,preluAlpha,idxKernel,idxDevice): + ''' + send intructions for matrix-matrix multiplication with p_relu, and quantization support of C = relu ((A * B + X) * postScale >> postShift) + Using this function, could avoid sending matrices that will be over-written in device, but users need to provide offsets + + Parameters + ---------- + a: int + offset of memory for A on device + b: int + offset of memory for B on device + c: int + offset of memory for C on device + A: ndarray + matrix in host memory + B: ndarray + matrix in host memory + C: ndarray + matrix in host memory + X: ndarray + matrix in host memory + postScale: int + multiply the output values with specific scalar + postShift: int + shift the output values with specific scalar + PReLUScale: int + multiply the output values with specific scalar when output values < 0 + PReLUAlpha: int + shift the output values with specific scalar when output values < 0 + idxKernel: int + index of kernel to be used + 
idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasFcnByAddress(c_uint(a),c_uint(b),c_uint(c),c_uint(x),c_uint(A.shape[0]), c_uint(B.shape[1]), c_uint(A.shape[1]), c_uint(A.shape[1]),c_uint(B.shape[1]),c_uint(C.shape[1]),c_uint(X.shape[1]),postScale,postShift,preluScale,preluAlpha,idxKernel,idxDevice) + + def getMatByAddress(self,A,offset,idxKernel,idxDevice): + ''' + get mat from device by offset + + Parameters + ---------- + A: ndarray + matrix in host memory + offset: int + offset of memory for A on device + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasGetByAddress(A,c_ulonglong(A.size*A.itemsize),offset,idxKernel,idxDevice) + + def executeAsync(self,numKernel,idxDevice): + ''' + run number of kernels async + + Parameters + ---------- + numKernel + number of CUs in the xclbin + idxDeivce + index of local device to be used + ''' + return self._lib.xfblasExecuteAsync(numKernel,idxDevice) + + def execute(self,idxKernel,idxDevice): + ''' + run ith kernel + + Parameters + ---------- + idxKernel: int + index of kernel to be used + idxDeivce: int + index of local device to be used + ''' + return self._lib.xfblasExecute(idxKernel,idxDevice) + +_xfblasManager = None + +def createGemm(args,xclbin_opts,numKernel=1,idxDevice=0): + if int(xclbin_opts['GEMX_runGemm'])!= 1: + raise Exception('The xclbin does not include gemm engine.') + createManager(args.lib) + return _xfblasManager.createGemm(args.xclbin,numKernel,idxDevice) + +def createGemv(args,xclbin_opts,numKernel=1,idxDevice=0): + if int(xclbin_opts['GEMX_runGemv'])!= 1: + raise Exception('The xclbin does not include gemv engine.') + createManager(args.lib) + return _xfblasManager.createGemv(args.xclbin,numKernel,idxDevice) + +def createFcn(args,xclbin_opts,numKernel=1,idxDevice=0): + if int(xclbin_opts['GEMX_runFcn'])!= 1: + raise Exception('The xclbin does not include fcn engine.') + createManager(args.lib) 
+ return _xfblasManager.createFcn(args.xclbin,numKernel,idxDevice) + +def sendMat(A,idxKernel=0,idxDevice=0): + return _xfblasManager.sendMat(A,idxKernel,idxDevice) + +def getMat(A,idxKernel=0,idxDevice=0): + return _xfblasManager.getMat(A,idxKernel,idxDevice) + +def freeInstr(idxKernel=0,idxDevice=0): + return _xfblasManager.freeInstr(idxKernel,idxDevice) + +def freeMat(A,idxKernel=0,idxDevice=0): + return _xfblasManager.freeMat(A,idxKernel,idxDevice) + +def destroy(numKernel=1,idxDevice=0): + return _xfblasManager.destroy(numKernel,idxDevice) + +def gemmOp(A,B,C,idxKernel=0,idxDevice=0): + return _xfblasManager.gemmOp(A,B,C,idxKernel,idxDevice) + +def gemvOp(A,x,y,idxKernel=0,idxDevice=0): + return _xfblasManager.gemvOp(A,x,y,idxKernel,idxDevice) + +def fcnOp(A,B,C,X,postScale=1,postShift=0,preluScale=1,preluAlpha=0,idxKernel=0,idxDevice=0): + return _xfblasManager.fcnOp(A,B,C,X,postScale,postShift,preluScale,preluAlpha,idxKernel,idxDevice) + +def fcnOpByAddress(a,b,c,x,A,B,C,X,postScale=1,postShift=0,preluScale=1,preluAlpha=0,idxKernel=0,idxDevice=0): + return _xfblasManager.fcnOpByAddress(a,b,c,x,A,B,C,X,postScale,postShift,preluScale,preluAlpha,idxKernel,idxDevice) + +def getMatByAddress(A,offset,idxKernel=0,idxDevice=0): + return _xfblasManager.getMatByAddress(A,offset,idxKernel,idxDevice) + +def executeAsync(numKernel=1,idxDevice=0): + return _xfblasManager.executeAsync(numKernel,idxDevice) + +def execute(idxKernel=0,idxDevice=0): + return _xfblasManager.execute(idxKernel,idxDevice) + +def createManager ( libFile ): + global _xfblasManager + if not _xfblasManager: + _xfblasManager = XFBLASManager(libFile) + return True + +def parse_cfg(filename): + myvars = {} + with open(filename) as myfile: + for line in myfile: + for word in line.split(): + name, var = word.split("=") + myvars[name.strip()] = var.rstrip() + return myvars + +def default_args(): + parser = argparse.ArgumentParser(description='xfblas') + parser.add_argument('--xclbin', required = True, 
help='file path to FPGA bitstream') + parser.add_argument('--lib', required = True, help='file path to xfblas shared library') + parser.add_argument('--cfg', required=True, help='file describing .xclbin properties') + return parser + +def processCommandLine(): + parser = default_args() + args = parser.parse_args() + xclbin_opts = parse_cfg ( args.cfg ) + return args, xclbin_opts \ No newline at end of file diff --git a/L3/src/sw/python_api/xfblas_L3_rt.py b/L3/src/sw/python_api/xfblas_L3_rt.py new file mode 100644 index 0000000000..43ee532caa --- /dev/null +++ b/L3/src/sw/python_api/xfblas_L3_rt.py @@ -0,0 +1,169 @@ + # Copyright 2019 Xilinx, Inc. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ +import numpy as np +import math +import xfblas_L3 as xfblas +import time + +class XfblasRT(): + + def __init__(self, xclbin_opts, wgt, bias, wgt_scale, bias_scale, post_scale,relu_scale,idxKernel,idxDevice): + ddrwidth = int(xclbin_opts["GEMX_ddrWidth"]) + self.min_m = ddrwidth * int(xclbin_opts["GEMX_gemmMBlocks"]) + self.min_k = ddrwidth * int(xclbin_opts["GEMX_gemmKBlocks"]) + self.min_n = ddrwidth * max (int(xclbin_opts["GEMX_gemmKBlocks"]), int(xclbin_opts["GEMX_gemmNBlocks"]) ) + if type (wgt) != list: + wgt = [wgt] + + if type(bias) != list: + bias = [bias] + + self._wshape = [] + self.offset_list = [2] + for w in wgt: + self._wshape.append(w.shape) + if xclbin_opts["GEMX_dataType"] == "float": + self._qw = wgt + self.bias = bias + else: + self._qw = [np.int16(np.around(a*b)) for a,b in zip(wgt, wgt_scale)] + self.bias = [np.int32(np.around(a*b)) for a,b in zip(bias, bias_scale)] + + for i,b in enumerate(self._qw): + self._qw[i] = self.format_for_fpga( b, self.min_m, self.min_k) + xfblas.sendMat(self._qw[i],idxKernel,idxDevice) + self.offset_list.append(self.get_offset(self._qw[i])) + + self.fpga_buf = [] + self._qb = [] + self.out_dim = None + self.post_scale = post_scale + self.relu_scale = relu_scale + self.xclbin_opts = xclbin_opts + self.idxKernel = idxKernel + self.idxDevice = idxDevice + + def get_offset(self,w): + return int(w.shape[0]*w.shape[1]*w.itemsize/4096+self.offset_list[-1]) + + def get_padded_shape ( self, shape, min_row, min_col): + row_padded = int( math.ceil( np.float32(shape[0]) / min_row ) * min_row ) + col_padded = int( math.ceil( np.float32(shape[1]) / min_col ) * min_col ) + return row_padded,col_padded + + def format_for_fpga ( self, nparr, min_row, min_col): + row_padded, col_padded = self.get_padded_shape ( nparr.shape, min_row, min_col) + padded_arr = np.zeros ( (row_padded, col_padded), dtype=nparr.dtype, order='C') + padded_arr[0:nparr.shape[0], 0:nparr.shape[1]] = nparr + return padded_arr + + def format_bias (self, b, 
dim, min_row, min_col): + if b.ndim == 1: + b = np.broadcast_to(b, (dim[0],dim[1]) ) + + b = self.format_for_fpga( b, min_row, min_col) + xfblas.sendMat(b,self.idxKernel,self.idxDevice) + self.offset_list.append(self.get_offset(b)) + return b + + def init_fpgabuf (self, in_shape ): + fpga_buf = [] + buf_dim = [in_shape] + + for i in self._wshape: + buf_dim.append( (in_shape[0], i[1]) ) + + self.out_dim = buf_dim[-1] + + for d in buf_dim: + d_padded = self.get_padded_shape(d, self.min_m, self.min_n) + inter_mat = np.zeros ( d_padded, dtype=self._qw[0].dtype, order='C') + fpga_buf.append(inter_mat) + + self.fpga_buf = fpga_buf + + formatted_bias = [] + for dim,b in zip (buf_dim[1:], self.bias): + b = self.format_bias (b, dim, self.min_m, self.min_n) + formatted_bias.append(b) + + self._qb = formatted_bias + + def loadInstr(self): + xfblas.freeInstr(self.idxKernel,self.idxDevice) + for i,(w_i,b_i) in enumerate( zip( self._qw, self._qb) ): + xfblas.fcnOp( w_i , self.fpga_buf[i], self.fpga_buf[i+1], b_i, self.post_scale[i][0], self.post_scale[i][1],1,0,self.idxKernel,self.idxDevice) + + def predict_old ( self, inp, in_scale): #keep this for debug + self.init_fpgabuf(inp.shape) + if self.xclbin_opts["GEMX_dataType"] == "float": + padded_arr = self.format_for_fpga(inp, self.min_k, self.min_n) + np.copyto(self.fpga_buf[0], padded_arr, casting='same_kind', where=True) + else: + padded_arr = self.format_for_fpga(inp * in_scale, self.min_k, self.min_n) + np.copyto(self.fpga_buf[0], np.int16(np.around(padded_arr)), casting='same_kind', where=True) + for i in self.fpga_buf: + xfblas.sendMat(i,self.idxKernel,self.idxDevice) + self.loadInstr() + xfblas.getMat (self.fpga_buf[-1],self.idxKernel,self.idxDevice) + for i in self.fpga_buf: + xfblas.freeMat(i,self.idxKernel,self.idxDevice) + for i in self._qb: + xfblas.freeMat(i,self.idxKernel,self.idxDevice) + return self.fpga_buf[-1][:self.out_dim[0],:self.out_dim[1]] + + def predict ( self, inp, in_scale): + 
self.init_fpgabuf(inp.shape) + if self.xclbin_opts["GEMX_dataType"] == "float": + padded_arr = self.format_for_fpga(inp, self.min_k, self.min_n) + np.copyto(self.fpga_buf[0], padded_arr, casting='same_kind', where=True) + else: + padded_arr = self.format_for_fpga(inp * in_scale, self.min_k, self.min_n) + np.copyto(self.fpga_buf[0], np.int16(np.around(padded_arr)), casting='same_kind', where=True) + for i in self.fpga_buf: + self.offset_list.append(self.get_offset(i)) + xfblas.sendMat(self.fpga_buf[0],self.idxKernel,self.idxDevice) + self.loadInstrByAddress() + xfblas.execute(self.idxKernel,self.idxDevice) + xfblas.getMatByAddress (self.fpga_buf[-1],self.offset_list[-2],self.idxKernel,self.idxDevice) + xfblas.freeMat(self.fpga_buf[0],self.idxKernel,self.idxDevice) + for i in self._qb: + xfblas.freeMat(i,self.idxKernel,self.idxDevice) + self.offset_list=self.offset_list[:len(self._qw)+1] # only keep the offset for weights + return self.fpga_buf[-1][:self.out_dim[0],:self.out_dim[1]] + + def send_matrices(self, inp, in_scale): + self.init_fpgabuf(inp.shape) + if self.xclbin_opts["GEMX_dataType"] == "float": + padded_arr = self.format_for_fpga(inp, self.min_k, self.min_n) + np.copyto(self.fpga_buf[0], padded_arr, casting='same_kind', where=True) + else: + padded_arr = self.format_for_fpga(inp * in_scale, self.min_k, self.min_n) + np.copyto(self.fpga_buf[0], np.int16(np.around(padded_arr)), casting='same_kind', where=True) + for i in self.fpga_buf: + self.offset_list.append(self.get_offset(i)) + xfblas.sendMat(self.fpga_buf[0],self.idxKernel,self.idxDevice) + self.loadInstrByAddress() + + def single_execute(self): + xfblas.execute(self.idxKernel,self.idxDevice) + + def get_result(self): + xfblas.getMatByAddress (self.fpga_buf[-1],self.offset_list[-2],self.idxKernel,self.idxDevice) + xfblas.freeMat(self.fpga_buf[0],self.idxKernel,self.idxDevice) + for i in self._qb: + xfblas.freeMat(i,self.idxKernel,self.idxDevice) + self.offset_list=self.offset_list[:len(self._qw)+1] # 
only keep the offset for weights + return self.fpga_buf[-1][:self.out_dim[0],:self.out_dim[1]] \ No newline at end of file diff --git a/L3/tests/jenkins_autorun.sh b/L3/tests/jenkins_autorun.sh deleted file mode 100755 index 0d608902e4..0000000000 --- a/L3/tests/jenkins_autorun.sh +++ /dev/null @@ -1,15 +0,0 @@ -#/bin/bash - -TEST_DIR=./out_test - -if [ -d $TEST_DIR ]; then - rm $TEST_DIR -rf -fi - -source set_env.sh - -PYTHON=python -PYTEST=./python/run_test.py - -$PYTHON $PYTEST --operator gemm gemv --shell vcu1525_dynamic_5_1 - diff --git a/L3/tests/run_test.py b/L3/tests/run_test.py index d55a804a9c..87f175be9c 120000 --- a/L3/tests/run_test.py +++ b/L3/tests/run_test.py @@ -1 +1 @@ -python/run_test.py \ No newline at end of file +sw/python/run_test.py \ No newline at end of file diff --git a/L3/tests/python/operation.py b/L3/tests/sw/python/operation.py similarity index 100% rename from L3/tests/python/operation.py rename to L3/tests/sw/python/operation.py diff --git a/L3/tests/python/run_test.py b/L3/tests/sw/python/run_test.py similarity index 100% rename from L3/tests/python/run_test.py rename to L3/tests/sw/python/run_test.py diff --git a/L3/tests/xf_blas/gemm/test.cpp b/L3/tests/xf_blas/gemm/test.cpp index 2bf321616d..b927a16d8f 100644 --- a/L3/tests/xf_blas/gemm/test.cpp +++ b/L3/tests/xf_blas/gemm/test.cpp @@ -32,10 +32,6 @@ int main(int argc, char** argv) { string l_configFile(argv[l_argIdx++]); int iterIndex = atoi(argv[l_argIdx++]); string l_dataDir(argv[l_argIdx++]); - string l_logFile; - ofstream logFile("out_test/xrt_report.txt"); - logFile.close(); - l_logFile = "out_test/xrt_report.txt"; int i, j; XFBLAS_dataType *a, *b, *c, *goldenC; @@ -77,8 +73,7 @@ int main(int argc, char** argv) { XFBLAS_dataType* d_c = NULL; xfblasEngine_t engineName = XFBLAS_ENGINE_GEMM; - xfblasStatus_t status = - xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + xfblasStatus_t status = xfblasCreate(l_xclbinFile.c_str(), 
l_configFile, engineName, l_numKernel); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; diff --git a/L3/tests/xf_blas/gemv/test.cpp b/L3/tests/xf_blas/gemv/test.cpp index d7e248a0eb..fa6b12e581 100644 --- a/L3/tests/xf_blas/gemv/test.cpp +++ b/L3/tests/xf_blas/gemv/test.cpp @@ -32,10 +32,6 @@ int main(int argc, char** argv) { string l_configFile(argv[l_argIdx++]); int iterIndex = atoi(argv[l_argIdx++]); string l_dataDir(argv[l_argIdx++]); - string l_logFile; - ofstream logFile("out_test/xrt_report.txt"); - logFile.close(); - l_logFile = "out_test/xrt_report.txt"; int i, j; XFBLAS_dataType *a, *x, *y, *goldenY; @@ -76,8 +72,7 @@ int main(int argc, char** argv) { XFBLAS_dataType* d_y = NULL; xfblasEngine_t engineName = XFBLAS_ENGINE_GEMV; - xfblasStatus_t status = - xfblasCreate(l_xclbinFile.c_str(), l_configFile, l_logFile.c_str(), engineName, l_numKernel); + xfblasStatus_t status = xfblasCreate(l_xclbinFile.c_str(), l_configFile, engineName, l_numKernel); if (status != XFBLAS_STATUS_SUCCESS) { cout << "Create Handle failed with error code: " << status << "\n"; return EXIT_FAILURE; diff --git a/docs/src/user_guide/L3/L3_benchmark.rst b/docs/src/user_guide/L3/L3_benchmark.rst index 67ca333d42..108e1ea617 100644 --- a/docs/src/user_guide/L3/L3_benchmark.rst +++ b/docs/src/user_guide/L3/L3_benchmark.rst @@ -18,13 +18,8 @@ ===================== L3 API benchmark ===================== -For benchmark code references please follow the link below. - -**1. XFBLAS L3 benchmark** - -Please see XFBLAS L3 benchmark folder for more benchmark cases. .. toctree:: - :maxdepth: 2 + :maxdepth: 2 L3_benchmark_gemm.rst diff --git a/docs/src/user_guide/L3/L3_benchmark_gemm.rst b/docs/src/user_guide/L3/L3_benchmark_gemm.rst index b76aa685e7..77764435e3 100644 --- a/docs/src/user_guide/L3/L3_benchmark_gemm.rst +++ b/docs/src/user_guide/L3/L3_benchmark_gemm.rst @@ -15,26 +15,58 @@ .. 
_benchmark_gemm_l3: -====================== L3 API GEMM benchmark ====================== -1. Intel® Math Kernel Library (MKL) ------------------------------------- +1. Benchmarking Intel® Math Kernel Library (MKL) +------------------------------------------------ 1.1 Introduction -^^^^^^^^^^^^^^^^^ +---------------- + Intel® Math Kernel Library provides performance improvement of math functions, e.g. GEMM, when running with Intel processors. To compare with Xilinx's XFBLAS library, you can use our run-script (run_gemm_mkl.sh) to generate the data and performance benchmark. -1.2 Prerequisites -^^^^^^^^^^^^^^^^^^ +.. _MKL_benchmark: + +1.2 Benchmarking Steps +---------------------- + +1.2.1 Access Nimbix cloud +------------------------- + +- Follow the user guide `Vitis On Nimbix`_ to login to your Nimbix account +- Launch application "Xilinx SDAccel Development 2019.1" and select "Desktop Mode with FPGA" +- Choose machine type "16 core, 128 GB RAM, Xilinx Alveo U250 FPGA (nx5u_xdma_201830_2)" +- Copy the L3/benchmarks/gemm directory to the Nimbix machine, and navigate to the gemm/gemm_mkl directory +- Follow the steps below to run Intel® MKL GEMM APIs benchmarks. + +.. _Vitis On Nimbix: https://www.xilinx.com/xilinxtraining/assessments/portal/alveo/intro_nimbix_cloud/story_html5.html + +.. NOTE:: FPGA is not required in Intel® Math Kernel Library but will be used in Xilinx's XFBLAS library. + +1.2.2 Install Intel® MKL library +-------------------------------- + +To install MKL on Nimbix, please download the full installation package for MKL2020 from `Intel® MKL Website`_. You need to register for downloading the package. After you have downloaded the package, please unzip it and navigate to the directory including "install.sh". Please enter the following command to install the MKL package. + +.. code-block:: bash + + sudo ./install.sh + +.. 
_Intel® MKL Website: https://software.intel.com/en-us/mkl/choose-download/linux + +1.2.3 Set up MKL environment variables +-------------------------------------- **Intel® MKL**: Assume you have installed Intel® MKL, run the appropriate script to set up the environment variables (such as $MKLROOT). .. code-block:: bash source /bin/mklvars.sh intel64 - + +1.2.4 Install numactl +--------------------- + **NUMACTL**: The linux operating system provides a function, called numactl, that allows the control of scheduling or memory placement policy, which is essential to run parallel programs. For Ubuntu (you only need to do it once), @@ -43,8 +75,8 @@ For Ubuntu (you only need to do it once), sudo apt-get install numactl -1.3 Benchmarking Procedures -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +1.2.5 Run MKL benchmarking script +--------------------------------- The run-script runs the GEMM benchmark with a number of threads, data type, and work mode. Then, it will explore the GEMM's matrix size from 256 to 16384. @@ -60,21 +92,8 @@ The run-script runs the GEMM benchmark with a number of threads, data type, and - mode: **g** for generating the data, **b** for benchmarking the performance, and **a** for both workloads. -1.4 Running on Nimbix Cloud -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- Follow the user guide `Vitis On Nimbix`_ to login to your Nimbix account -- Launch application "Xilinx SDAccel Development & Alveo FPGA 2018.3" and select "Desktop Mode with FPGA" -- Choose machine type "16 core, 128 GB RAM, Xilinx Alveo U200 FPGA (nx5u_xdma_201830_1)" -- Copy the L3/bencharks/gemm directory to the Nimbix machine, and navigate to the gemm/gemm_mkl directory -- Run Intel® MKL GEMM APIs according to the above benchmark procedures. - -.. _Vitis On Nimbix: https://www.xilinx.com/support/documentation/sw_manuals/xilinx2018_3/ug1240-sdaccel-nimbix-getting-started.pdf - -.. NOTE:: FPGA is not required in Intel® Math Kernel Library but will be used in Xilinx's XFBLAS library. 
- -1.5 Performance Result on Nimbix Cloud -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +1.3 Performance Result on Nimbix Cloud +-------------------------------------- .. rubric:: Configuration: .. list-table:: :widths: 20 80 * - data_type - float - + * + - benchmark command + - ./run_gemm_mkl.sh 16 float a + .. rubric:: Performance Result (nonCaching): -+-------------+--------------+------------+-------------+ -| Matrix Size | Cache (Y/N) | TimeApiMS | PerfApiTops | -+=============+==============+============+=============+ -| 256 | N | 29.700 | 0.001 | -+-------------+--------------+------------+-------------+ -| 512 | N | 11.799 | 0.023 | -+-------------+--------------+------------+-------------+ -| 1024 | N | 16.591 | 0.129 | -+-------------+--------------+------------+-------------+ -| 2048 | N | 41.319 | 0.416 | -+-------------+--------------+------------+-------------+ -| 4096 | N | 172.369 | 0.797 | -+-------------+--------------+------------+-------------+ -| 8192 | N | 1073.250 | 1.024 | -+-------------+--------------+------------+-------------+ -| 16384 | N | 9060.830 | 0.971 | -+-------------+--------------+------------+-------------+ ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| Square Matrix Size | matrix pairs running simultaneously | Cache (Y/N) | API time(ms) | TFlops/sec | ++====================+=====================================+=============+===============+=============+ +| 256 | 1 | N | 29.700 | 0.001 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 512 | 1 | N | 11.799 | 0.023 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 1024 | 1 | N | 16.591 | 0.129 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 2048 | 1 | N | 41.319 | 0.416 
| ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 4096 | 1 | N | 172.369 | 0.797 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 8192 | 1 | N | 1073.250 | 1.024 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 16384 | 1 | N | 9060.830 | 0.971 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ .. rubric:: Performance Result (Caching): -+-------------+--------------+------------+-------------+ -| Matrix Size | Cache (Y/N) | TimeApiMS | PerfApiTops | -+=============+==============+============+=============+ -| 256 | Y | 1.380 | 0.024 | -+-------------+--------------+------------+-------------+ -| 512 | Y | 4.038 | 0.066 | -+-------------+--------------+------------+-------------+ -| 1024 | Y | 4.383 | 0.490 | -+-------------+--------------+------------+-------------+ -| 2048 | Y | 21.282 | 0.807 | -+-------------+--------------+------------+-------------+ -| 4096 | Y | 149.755 | 0.918 | -+-------------+--------------+------------+-------------+ -| 8192 | Y | 1042.860 | 1.054 | -+-------------+--------------+------------+-------------+ -| 16384 | Y | 9045.700 | 0.972 | -+-------------+--------------+------------+-------------+ - - -1.6 Reference -^^^^^^^^^^^^^^ ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| Square Matrix Size | matrix pairs running simultaneously | Cache (Y/N) | API time(ms) | TFlops/sec | ++====================+=====================================+=============+===============+=============+ +| 256 | 1 | Y | 1.380 | 0.024 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 512 | 1 | Y | 4.038 | 0.066 | 
++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 1024 | 1 | Y | 4.383 | 0.490 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 2048 | 1 | Y | 21.282 | 0.807 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 4096 | 1 | Y | 149.755 | 0.918 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 8192 | 1 | Y | 1042.860 | 1.054 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ +| 16384 | 1 | Y | 9045.700 | 0.972 | ++--------------------+-------------------------------------+-------------+---------------+-------------+ + + +1.4 Reference +------------- [1] `Improving Performance of Math Functions with Intel® Math Kernel Library`_ @@ -144,115 +166,95 @@ The run-script runs the GEMM benchmark with a number of threads, data type, and .. _Benchmarking GEMM on Intel® Architecture Processors: https://software.intel.com/en-us/articles/benchmarking-gemm-with-intel-mkl-and-blis-on-intel-processors -2. xfblasGemm - Xilinx's XFBLAS library ---------------------------------------- +2. Benchmarking xfblasGemm - Xilinx's XFBLAS library +------------------------------------------------------ + +Before benchmarking xfblasGemm, please download `xf blas xclbin files`_, unzip the file with "tar -xvzf" command, and copy the folder u250_xdma_201830_2 to directory L3/overlay. + +.. _xf blas xclbin files: https://www.xilinx.com/bin/public/openDownload?filename=vitis_BLAS_library_r1.0_xclbin.tar + +2.1 Benchmarking Steps +---------------------- + +2.1.1 Generate test inputs and golden reference +----------------------------------------------- + +Follow the MKL_benchmark_ steps to run MKL benchmarks, for float and short data type to generate test inputs and golden reference. 
To generate test inputs and golden reference for float data type, please run the following command. + +.. code-block:: bash + + ./run_gemm_mkl.sh 16 float a + +To generate test inputs and golden reference for short data type, please run the following command. + +.. code-block:: bash + + ./run_gemm_mkl.sh 16 short a + +2.1.2 Build benchmark application +--------------------------------- + +Before benchmarking the xfblasGemm, please build the host executable for the corresponding .xclbin files via the following script + +.. code-block:: bash -You can use the run-script to benchmark our Xilinx's XFBLAS library for the GEMM routine and verify with the golden result generated by Intel® Math Kernel Library. + ./build_gemm_bench.sh config_info_file -2.1 Benchmarking Procedures -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +2.1.3 Run benchmark +------------------- -The run-script runs the GEMM benchmark with xclbin and cfg files. Then, it will explore the GEMM's matrix size from 256 to 8192. +The run-script runs the GEMM benchmark with xclbin and cfg files. It will explore the GEMM's matrix size from 256 to 8192. .. code-block:: bash - ./run_gemm_benchmark.sh path_to_xclbin path_to_config_info + ./run_gemm_benchmark.sh xclbin_file config_info_file .. rubric:: where: -- **path_to_xclbin** refers to the location of xclbin -- **path_to_config_info** refers to the location of cfg file. +- **xclbin_file** refers to the gemx.xclbin file, including the path. +- **config_info_file** refers to config_info.dat file, including the path. 
-2.2 Running on Nimbix Cloud -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- Follow the user guide "Vitis On Nimbix" to login to your Nimbix account -- Launch application "Xilinx SDAccel Development & Alveo FPGA 2018.3" and select "Desktop Mode with FPGA" -- Choose machine type "16 core, 128 GB RAM, Xilinx Alveo U200 FPGA (nx5u_xdma_201830_1)" -- Copy the L3/bencharks/gemm directory to the Nimbix machine, and navigate to the gemm directory -- Run Xilinx's XFBLAS APIs according to the above benchmark procedures. - -2.3 Performance Result on Nimbix Cloud -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +2.2 Performance Results on Nimbix Cloud +----------------------------------------- .. rubric:: Configuration: .. list-table:: :widths: 20 80 - + * - fpga_model - - Xilinx Alveo U200 FPGA (nx5u_xdma_201830_1) + - Xilinx Alveo U250 FPGA (nx5u_xdma_201830_2) * - Frequency - - 124 Mhz + - 150 Mhz * - data_type - float - -.. rubric:: Performance Result: - -+-------------+--------------+------------+-------------+ -| Matrix Size | EffApiPct | TimeApiMS | PerfApiTops | -+=============+==============+============+=============+ -| 256 | 34.859 | 1.516 | 0.022 | -+-------------+--------------+------------+-------------+ -| 512 | 70.170 | 6.026 | 0.045 | -+-------------+--------------+------------+-------------+ -| 1024 | 89.511 | 37.788 | 0.057 | -+-------------+--------------+------------+-------------+ -| 2048 | 93.373 | 289.805 | 0.059 | -+-------------+--------------+------------+-------------+ -| 4096 | 96.731 | 2237.969 | 0.061 | -+-------------+--------------+------------+-------------+ -| 8192 | 97.281 | 17802.523 | 0.062 | -+-------------+--------------+------------+-------------+ -| 16384 | 98.057 | 141292.933 | 0.062 | -+-------------+--------------+------------+-------------+ - -2.4 Performance Result on Nimbix Cloud (float32, asynchronous) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. rubric:: Configuration: - -.. 
list-table:: - :widths: 20 80 - * - - fpga_model - - Xilinx Alveo U200 FPGA (nx5u_xdma_201830_2) + - build command + - ./build_gemm_bench.sh ../../overlay/u250_xdma_201830_2/gemm_float_4kernel/config_info.dat * - - kernel# - - 2 * - - Frequency - - 114 Mhz * - - data_type - - float32 + - benchmark command + - ./run_gemm_bench.sh ../../overlay/u250_xdma_201830_2/gemm_float_4kernel/gemx.xclbin ../../overlay/u250_xdma_201830_2/gemm_float_4kernel/config_info.dat .. rubric:: Performance Result: -+-------------+--------------+------------+-------------+ -| Matrix Size | EffApiPct (%)| TimeApiMS | PerfApiTops | -+=============+==============+============+=============+ -| 256 | 28.778 | 1.998 | 0.034 | -+-------------+--------------+------------+-------------+ -| 512 | 61.213 | 7.513 | 0.072 | -+-------------+--------------+------------+-------------+ -| 1024 | 81.209 | 45.306 | 0.095 | -+-------------+--------------+------------+-------------+ -| 2048 | 88.514 | 332.532 | 0.103 | -+-------------+--------------+------------+-------------+ -| 4096 | 93.797 | 2510.409 | 0.110 | -+-------------+--------------+------------+-------------+ -| 8192 | 95.073 | 19813.749 | 0.111 | -+-------------+--------------+------------+-------------+ -| 16384 | 95.801 | 157306.027 | 0.112 | -+-------------+--------------+------------+-------------+ - -2.5 Performance Result on Nimbix Cloud (int16, asynchronous) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++--------------------+-------------------------------------+--------------+-------------+ +| Square Matrix Size | matrix pairs running simultaneously | API time(ms) | TFlops/sec | ++====================+=====================================+==============+=============+ +| 256 | 4 | 2.715 | 0.049 | ++--------------------+-------------------------------------+--------------+-------------+ +| 512 | 4 | 7.223 | 0.149 | ++--------------------+-------------------------------------+--------------+-------------+ +| 1024 | 4 | 
40.020 | 0.214 | ++--------------------+-------------------------------------+--------------+-------------+ +| 2048 | 4 | 292.971 | 0.234 | ++--------------------+-------------------------------------+--------------+-------------+ +| 4096 | 4 | 1990.240 | 0.276 | ++--------------------+-------------------------------------+--------------+-------------+ +| 8192 | 4 | 15317.589 | 0.287 | ++--------------------+-------------------------------------+--------------+-------------+ .. rubric:: Configuration: @@ -261,75 +263,35 @@ The run-script runs the GEMM benchmark with xclbin and cfg files. Then, it will * - fpga_model - - Xilinx Alveo U200 FPGA (nx5u_xdma_201830_2) - * - - kernel# - - 2 + - Xilinx Alveo U250 FPGA (nx5u_xdma_201830_2) * - Frequency - - 240 Mhz + - 231 Mhz * - data_type - - int16 - -.. rubric:: Performance Result: - -+-------------+--------------+------------+-------------+ -| Matrix Size | EffApiPct (%)| TimeApiMS | PerfApiTops | -+=============+==============+============+=============+ -| 256 | 5.383 | 1.268 | 0.053 | -+-------------+--------------+------------+-------------+ -| 512 | 21.094 | 2.589 | 0.208 | -+-------------+--------------+------------+-------------+ -| 1024 | 39.554 | 11.046 | 0.389 | -+-------------+--------------+------------+-------------+ -| 2048 | 62.193 | 56.200 | 0.612 | -+-------------+--------------+------------+-------------+ -| 4096 | 73.463 | 380.626 | 0.722 | -+-------------+--------------+------------+-------------+ -| 8192 | 76.867 | 2910.186 | 0.756 | -+-------------+--------------+------------+-------------+ -| 16384 | 77.626 | 23053.77 | 0.763 | -+-------------+--------------+------------+-------------+ - -2.6 Performance Result on xbxcloud5 (int16, asynchronous) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. rubric:: Configuration: - -.. 
list-table:: - :widths: 20 80 - + - short * - - fpga_model - - Xilinx Alveo VCU1525 FPGA + - build command + - ./build_gemm_bench.sh ../../overlay/u250_xdma_201830_2/gemm_float_4kernel/config_info.dat * - - kernel# - - 2 * - - Frequency - - 240 Mhz * - - data_type - - int16 + - benchmark command + - ./run_gemm_bench.sh ../../overlay/u250_xdma_201830_2/gemm_float_4kernel/gemx.xclbin ../../overlay/u250_xdma_201830_2/gemm_float_4kernel/config_info.dat .. rubric:: Performance Result: -+-------------+--------------+------------+-------------+ -| Matrix Size | EffApiPct (%)| TimeApiMS | PerfApiTops | -+=============+==============+============+=============+ -| 256 | 6.544 | 1.043 | 0.065 | -+-------------+--------------+------------+-------------+ -| 512 | 30.082 | 1.816 | 0.297 | -+-------------+--------------+------------+-------------+ -| 1024 | 60.016 | 7.280 | 0.591 | -+-------------+--------------+------------+-------------+ -| 2048 | 79.433 | 44.003 | 0.781 | -+-------------+--------------+------------+-------------+ -| 4096 | 89.734 | 311.611 | 0.882 | -+-------------+--------------+------------+-------------+ -| 8192 | 95.224 | 2349.166 | 0.936 | -+-------------+--------------+------------+-------------+ -| 16384 | 97.416 | 18370.401 | 0.958 | -+-------------+--------------+------------+-------------+ ++--------------------+-------------------------------------+--------------+-------------+ +| Square Matrix Size | matrix pairs running simultaneously | API time(ms) | Tops/sec | ++====================+=====================================+==============+=============+ +| 256 | 4 | 1.436 | 0.093 | ++--------------------+-------------------------------------+--------------+-------------+ +| 512 | 4 | 2.589 | 0.415 | ++--------------------+-------------------------------------+--------------+-------------+ +| 1024 | 4 | 13.885 | 0.619 | ++--------------------+-------------------------------------+--------------+-------------+ +| 2048 | 4 | 61.879 | 1.111 | 
++--------------------+-------------------------------------+--------------+-------------+ +| 4096 | 4 | 416.086 | 1.321 | ++--------------------+-------------------------------------+--------------+-------------+ +| 8192 | 4 | 3443.76 | 1.277 | ++--------------------+-------------------------------------+--------------+-------------+